Data Processing

In [48]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

In [49]:
# Load the merged dataset (the path is relative to the notebook's working directory)
data = pd.read_csv('merged_data.csv')  # Replace with the actual path to your dataset

# Display the first few rows to inspect the structure
print(data.head())

# Show basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()

# Summarize numerical columns with descriptive statistics
print(data.describe())

# Check for missing values: count of NaN entries in each column
print(data.isnull().sum())

      #YY   MM   DD   hh    mm  WDIR  WSPD  GST  WVHT   DPD  ...    PRES  \
0     NaN  NaN  NaN  NaN   NaN   NaN   NaN  NaN   NaN   NaN  ...     NaN   
1  2020.0  1.0  1.0  0.0   0.0  69.0   5.5  7.1   1.3  14.3  ...  1014.4   
2  2020.0  1.0  1.0  0.0  10.0  64.0   4.5  5.5  99.0  99.0  ...  1014.4   
3  2020.0  1.0  1.0  0.0  20.0  63.0   4.7  5.8  99.0  99.0  ...  1014.5   
4  2020.0  1.0  1.0  0.0  30.0  72.0   5.1  6.4  99.0  99.0  ...  1014.6   

   ATMP   WTMP   DEWP   VIS  TIDE  WDI  R WSP  D GST  WSP  
0   NaN    NaN    NaN   NaN   NaN  NaN    NaN    NaN  NaN  
1  26.7   28.1  999.0  99.0  99.0  NaN    NaN    NaN  NaN  
2  26.6  999.0  999.0  99.0  99.0  NaN    NaN    NaN  NaN  
3  26.9  999.0  999.0  99.0  99.0  NaN    NaN    NaN  NaN  
4  27.0  999.0  999.0  99.0  99.0  NaN    NaN    NaN  NaN  

[5 rows x 22 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206920 entries, 0 to 206919
Data columns (total 22 columns):
 #   Column  Non-Null Count   Dtype  
---  ------

In [50]:
# Keep only the date columns (#YY, MM, DD) and the three measurement columns
# that the later cells clean and normalize.
columns_to_keep = ['#YY', 'MM', 'DD', 'ATMP', 'WTMP', 'WVHT']
data = data[columns_to_keep]

# Drop the leading row only when it is entirely NaN (as it is in the freshly
# loaded file). An unconditional `iloc[1:]` would silently remove one valid
# record every time this cell is re-run, because `data` is rebound in place.
if len(data) > 0 and data.iloc[0].isna().all():
    data = data.iloc[1:]
data = data.reset_index(drop=True)

# Display the DataFrame to verify
print(data.head())

      #YY   MM   DD  ATMP   WTMP  WVHT
0  2020.0  1.0  1.0  26.7   28.1   1.3
1  2020.0  1.0  1.0  26.6  999.0  99.0
2  2020.0  1.0  1.0  26.9  999.0  99.0
3  2020.0  1.0  1.0  27.0  999.0  99.0
4  2020.0  1.0  1.0  27.0  999.0  99.0


Fill NA and extreme values

In [51]:
# Replace invalid readings and NaN in each measurement column with the mean of
# that column's valid readings.
#
# Each column marks an invalid reading with its own sentinel value: 999 for
# WTMP and ATMP, 99 for WVHT. The mapping order fixes the processing (and
# message) order.
sentinel_by_column = {'WTMP': 999, 'ATMP': 999, 'WVHT': 99}

column_means = {}
for column, sentinel in sentinel_by_column.items():
    # Mean over the rows whose value is not the sentinel; Series.mean() skips NaN.
    column_mean = data.loc[data[column] != sentinel, column].mean()
    column_means[column] = column_mean

    # Substitute the mean for both the sentinel and any NaN entries.
    data[column] = data[column].replace(sentinel, column_mean).fillna(column_mean)
    print(f"Invalid {column} values have been replaced with the average.")

# Keep the per-column averages available under their original notebook names.
average_wtmp = column_means['WTMP']
average_atmp = column_means['ATMP']
average_wvht = column_means['WVHT']

# Optional: Check for remaining NaN values (this covers every column of `data`,
# including the date columns that were not imputed above).
if data.isnull().sum().sum() > 0:
    print("Warning: There are still NaN values in the dataset.")
else:
    print("All invalid or missing values have been handled.")

Invalid WTMP values have been replaced with the average.
Invalid ATMP values have been replaced with the average.
Invalid WVHT values have been replaced with the average.


In [52]:
# Display the current contents of `data` (bare last expression -> rich notebook repr).
data

Unnamed: 0,#YY,MM,DD,ATMP,WTMP,WVHT
0,2020.0,1.0,1.0,26.7,28.100000,1.30000
1,2020.0,1.0,1.0,26.6,28.070833,1.28191
2,2020.0,1.0,1.0,26.9,28.070833,1.28191
3,2020.0,1.0,1.0,27.0,28.070833,1.28191
4,2020.0,1.0,1.0,27.0,28.070833,1.28191
...,...,...,...,...,...,...
206914,2023.0,12.0,31.0,26.5,28.070833,1.28191
206915,2023.0,12.0,31.0,26.5,28.070833,1.28191
206916,2023.0,12.0,31.0,26.6,28.070833,1.28191
206917,2023.0,12.0,31.0,26.5,28.070833,1.28191


Normalize data

In [37]:
# Columns to rescale; every other column of `data` is carried over unchanged.
columns_to_normalize = ['ATMP', 'WTMP', 'WVHT']

# Min-max scaler with its default settings.
scaler = MinMaxScaler()

# Learn each column's min/max from the selected columns, then rescale them.
data_to_normalize = data[columns_to_normalize]
normalized_data = scaler.fit(data_to_normalize).transform(data_to_normalize)

# Work on a copy so `data` itself keeps the un-normalized values.
data_normalized = data.copy()
data_normalized[columns_to_normalize] = normalized_data

# Verify the normalization
print(data_normalized.head())

      #YY   MM   DD   hh    mm  WDIR  WSPD  GST      WVHT   DPD  ...    PRES  \
0     NaN  NaN  NaN  NaN   NaN   NaN   NaN  NaN       NaN   NaN  ...     NaN   
1  2020.0  1.0  1.0  0.0   0.0  69.0   5.5  7.1  0.013131  14.3  ...  1014.4   
2  2020.0  1.0  1.0  0.0  10.0  64.0   4.5  5.5  1.000000  99.0  ...  1014.4   
3  2020.0  1.0  1.0  0.0  20.0  63.0   4.7  5.8  1.000000  99.0  ...  1014.5   
4  2020.0  1.0  1.0  0.0  30.0  72.0   5.1  6.4  1.000000  99.0  ...  1014.6   

       ATMP      WTMP   DEWP   VIS  TIDE  WDI  R WSP  D GST  WSP  
0       NaN       NaN    NaN   NaN   NaN  NaN    NaN    NaN  NaN  
1  0.005828  0.005123  999.0  99.0  99.0  NaN    NaN    NaN  NaN  
2  0.005726  1.000000  999.0  99.0  99.0  NaN    NaN    NaN  NaN  
3  0.006033  1.000000  999.0  99.0  99.0  NaN    NaN    NaN  NaN  
4  0.006135  1.000000  999.0  99.0  99.0  NaN    NaN    NaN  NaN  

[5 rows x 22 columns]


Select only the features directly related to time

In [36]:
# 4. to CSV
# Write the frame to disk without the index column, then reload it from the
# same path so `data` reflects exactly what was saved.
# NOTE(review): this exports `data`, not `data_normalized` -- confirm that the
# un-normalized frame is the intended output.
output_file = "merged_TimeWVHT_data.csv"
data.to_csv(output_file, index=False)

# Reuse `output_file` rather than repeating the literal so the written and
# re-read paths cannot drift apart.
data = pd.read_csv(output_file)