In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

df = pd.read_csv('brisbane_water_quality.csv')

In [39]:
# Drop columns with quality information
df = df.drop(columns=['Dissolved Oxygen [quality]', 'Chlorophyll [quality]', 'Temperature [quality]',
                      'Dissolved Oxygen (%Saturation) [quality]', 'pH [quality]', 'Salinity [quality]',
                      'Specific Conductance [quality]', 'Turbidity [quality]', 'Record number'], errors='ignore')

In [40]:
df.shape

(30894, 11)

In [41]:
# Convert 'Timestamp' column to datetime 
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [42]:
df.head

<bound method NDFrame.head of                 Timestamp  Average Water Speed  Average Water Direction  \
0     2023-08-04 23:00:00                4.834                   73.484   
1     2023-08-04 23:30:00                2.544                  106.424   
2     2023-08-04 23:00:00                1.260                  156.755   
3     2023-08-04 23:30:00                0.760                  281.754   
4     2023-08-04 23:00:00                3.397                  244.637   
...                   ...                  ...                      ...   
30889 2024-06-27 08:20:00               13.314                   82.720   
30890 2024-06-27 08:30:00               32.617                   18.081   
30891 2024-06-27 08:40:00                8.552                  306.184   
30892 2024-06-27 08:50:00               10.341                   24.711   
30893 2024-06-27 09:00:00               11.173                  241.662   

       Chlorophyll  Temperature  Dissolved Oxygen  \
0            1.6

In [43]:
df['Hour'] = df['Timestamp'].dt.hour
df['DayOfYear'] = df['Timestamp'].dt.dayofyear
df['Month'] = df['Timestamp'].dt.month

df.head()

Unnamed: 0,Timestamp,Average Water Speed,Average Water Direction,Chlorophyll,Temperature,Dissolved Oxygen,Dissolved Oxygen (%Saturation),pH,Salinity,Specific Conductance,Turbidity,Hour,DayOfYear,Month
0,2023-08-04 23:00:00,4.834,73.484,1.621,20.018,7.472,101.175,8.176,35.215,53.262,2.068,23,216,8
1,2023-08-04 23:30:00,2.544,106.424,1.959,19.986,7.455,100.884,8.175,35.209,53.254,1.994,23,216,8
2,2023-08-04 23:00:00,1.26,156.755,1.62,20.001,7.43,100.571,8.171,35.207,53.252,2.03,23,216,8
3,2023-08-04 23:30:00,0.76,281.754,1.761,19.983,7.419,100.398,8.171,35.211,53.257,1.973,23,216,8
4,2023-08-04 23:00:00,3.397,244.637,1.635,19.986,7.429,100.538,8.171,35.208,53.253,1.944,23,216,8


In [44]:
duplicate_timestamps = df[df.duplicated(subset=['Timestamp'], keep=False)]
if not duplicate_timestamps.empty:
    print("Duplicate timestamps exist:")
    print(duplicate_timestamps)
else:
    print("No duplicate timestamps found.")

Duplicate timestamps exist:
                Timestamp  Average Water Speed  Average Water Direction  \
0     2023-08-04 23:00:00                4.834                   73.484   
1     2023-08-04 23:30:00                2.544                  106.424   
2     2023-08-04 23:00:00                1.260                  156.755   
3     2023-08-04 23:30:00                0.760                  281.754   
4     2023-08-04 23:00:00                3.397                  244.637   
...                   ...                  ...                      ...   
30459 2024-06-24 09:00:00               23.286                  102.621   
30603 2024-06-25 09:00:00                5.070                   29.750   
30604 2024-06-25 09:00:00                5.070                   29.750   
30748 2024-06-26 09:00:00               42.538                  212.200   
30749 2024-06-26 09:00:00               42.538                  212.200   

       Chlorophyll  Temperature  Dissolved Oxygen  \
0            1.621

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30894 entries, 0 to 30893
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   Timestamp                       30894 non-null  datetime64[ns]
 1   Average Water Speed             30874 non-null  float64       
 2   Average Water Direction         30893 non-null  float64       
 3   Chlorophyll                     30309 non-null  float64       
 4   Temperature                     25730 non-null  float64       
 5   Dissolved Oxygen                26594 non-null  float64       
 6   Dissolved Oxygen (%Saturation)  25145 non-null  float64       
 7   pH                              29810 non-null  float64       
 8   Salinity                        26936 non-null  float64       
 9   Specific Conductance            29527 non-null  float64       
 10  Turbidity                       28894 non-null  float64       
 11  Ho

In [46]:
# Remove duplicate timestamps by averaging values for duplicate timestamps
df.set_index('Timestamp', inplace=True)
df = df.groupby(df.index).mean()

df.shape

(30614, 13)

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 30614 entries, 2023-08-04 23:00:00 to 2024-06-27 09:00:00
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Average Water Speed             30594 non-null  float64
 1   Average Water Direction         30613 non-null  float64
 2   Chlorophyll                     30041 non-null  float64
 3   Temperature                     25491 non-null  float64
 4   Dissolved Oxygen                26370 non-null  float64
 5   Dissolved Oxygen (%Saturation)  24938 non-null  float64
 6   pH                              29546 non-null  float64
 7   Salinity                        26705 non-null  float64
 8   Specific Conductance            29266 non-null  float64
 9   Turbidity                       28654 non-null  float64
 10  Hour                            30614 non-null  float64
 11  DayOfYear                       30614 non-null  float64
 1

In [48]:
# Check for negative values in each column
negative_counts = (df < 0).sum()

print("Columns with negative values:")
print(negative_counts)


Columns with negative values:
Average Water Speed               0
Average Water Direction           0
Chlorophyll                       0
Temperature                       0
Dissolved Oxygen                  0
Dissolved Oxygen (%Saturation)    0
pH                                0
Salinity                          0
Specific Conductance              0
Turbidity                         0
Hour                              0
DayOfYear                         0
Month                             0
dtype: int64


In [49]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median') 

# Fit the imputer on the DataFrame's numerical columns
numerical_cols = df.select_dtypes(include=['number']).columns
df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

# Check for missing values after imputation
df.isnull().sum()

Average Water Speed               0
Average Water Direction           0
Chlorophyll                       0
Temperature                       0
Dissolved Oxygen                  0
Dissolved Oxygen (%Saturation)    0
pH                                0
Salinity                          0
Specific Conductance              0
Turbidity                         0
Hour                              0
DayOfYear                         0
Month                             0
dtype: int64

In [50]:
from scipy import stats  # Import the stats module from scipy

def remove_outliers_zscore(df, threshold=3):
    df_no_outliers = df.copy()
    columns_to_exclude = ['Hour', 'DayOfYear', 'Month']  # Columns to exclude from outlier removal

    for col in df.select_dtypes(include=np.number):
        if col not in columns_to_exclude:  # Check if the column should be processed
            z_scores = np.abs(stats.zscore(df[col]))
            df_no_outliers = df_no_outliers[z_scores < threshold]

    return df_no_outliers

# Example usage
df = remove_outliers_zscore(df)
df.shape

  df_no_outliers = df_no_outliers[z_scores < threshold]
  df_no_outliers = df_no_outliers[z_scores < threshold]
  df_no_outliers = df_no_outliers[z_scores < threshold]
  df_no_outliers = df_no_outliers[z_scores < threshold]
  df_no_outliers = df_no_outliers[z_scores < threshold]
  df_no_outliers = df_no_outliers[z_scores < threshold]
  df_no_outliers = df_no_outliers[z_scores < threshold]
  df_no_outliers = df_no_outliers[z_scores < threshold]
  df_no_outliers = df_no_outliers[z_scores < threshold]


(28502, 13)

In [51]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Create an instance of StandardScaler
scaler = StandardScaler()

# Scale the data and convert back to DataFrame
scaled_data = scaler.fit_transform(df)
df = pd.DataFrame(scaled_data, columns=df.columns, index=df.index)

# Now you can use DataFrame methods
df.head()

Unnamed: 0_level_0,Average Water Speed,Average Water Direction,Chlorophyll,Temperature,Dissolved Oxygen,Dissolved Oxygen (%Saturation),pH,Salinity,Specific Conductance,Turbidity,Hour,DayOfYear,Month
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-08-04 23:00:00,-1.120456,-0.323561,-0.603827,-1.428923,1.178197,0.603918,1.201306,1.409796,1.122769,-0.66997,1.664538,0.484837,0.592563
2023-08-04 23:30:00,-1.075374,-0.006426,-0.453122,-1.437394,1.170822,0.588776,1.18998,1.411576,1.124315,-0.704747,1.664538,0.484837,0.592563
2023-08-05 00:00:00,-0.585918,-1.208621,-0.768132,-1.472007,1.2135,0.607401,1.088043,1.427221,1.137813,-0.674244,-1.65972,0.493278,0.592563
2023-08-05 00:30:00,-1.102803,-0.812174,-0.774458,-1.477637,1.189023,0.577398,1.167327,1.427922,1.138266,-0.69335,-1.65972,0.493278,0.592563
2023-08-05 01:00:00,-0.526051,-1.198428,-0.732709,-1.488271,1.204086,0.584133,1.18998,1.431425,1.141433,-0.704412,-1.515187,0.493278,0.592563


In [52]:
from sklearn.decomposition import PCA
import pandas as pd
# Perform PCA
pca = PCA(n_components=0.95)
df = pca.fit_transform(df)
df = pd.DataFrame(df)  # Use the original column names
df.shape


(28502, 9)

In [53]:
# Specify the file path and name
name = 'preprocessed_water_quality.csv'  

# Save the DataFrame to a CSV file
df.to_csv(name, index=True)  

print(f"DataFrame successfully saved to {name}")

DataFrame successfully saved to preprocessed_water_quality.csv
