Data Processing

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

In [5]:
# Read the merged dataset from the CSV file in the working directory.
data = pd.read_csv('merged_data.csv')

# The bare string below is a disabled inspection block (shape / info /
# describe); it is evaluated as a no-op expression and prints nothing.
"""
# Examine the data structure
print("Data Dimensions:", data.shape)  # Show number of rows and columns
print("\nData Info:")
print(data.info())  # Display data types and non-null counts
print("\nSummary Statistics:")
print(data.describe())  # Summary statistics for numerical columns
"""

# Preview the top of the frame.
# NOTE(review): the captured preview shows an all-NaN first row and repeated
# 99.0 / 999.0 values that look like missing-data placeholders — confirm these
# are handled before the data is used for modelling.
print("\nFirst Few Rows:")
print(data.head())

# 'WVHT' is the prediction target; every other column is treated as a feature.
target_column = 'WVHT'
features = []
for column_name in data.columns:
    if column_name != target_column:
        features.append(column_name)

print("\nTarget Column:", target_column)
print("Feature Columns:", features)

# Report the frame's dimensions.
row_count, column_count = data.shape
print("Number of rows:", row_count)
print("Number of columns:", column_count)



First Few Rows:
      #YY   MM   DD   hh    mm  WDIR  WSPD  GST  WVHT   DPD  ...    PRES  \
0     NaN  NaN  NaN  NaN   NaN   NaN   NaN  NaN   NaN   NaN  ...     NaN   
1  2020.0  1.0  1.0  0.0   0.0  69.0   5.5  7.1   1.3  14.3  ...  1014.4   
2  2020.0  1.0  1.0  0.0  10.0  64.0   4.5  5.5  99.0  99.0  ...  1014.4   
3  2020.0  1.0  1.0  0.0  20.0  63.0   4.7  5.8  99.0  99.0  ...  1014.5   
4  2020.0  1.0  1.0  0.0  30.0  72.0   5.1  6.4  99.0  99.0  ...  1014.6   

   ATMP   WTMP   DEWP   VIS  TIDE  WDI  R WSP  D GST  WSP  
0   NaN    NaN    NaN   NaN   NaN  NaN    NaN    NaN  NaN  
1  26.7   28.1  999.0  99.0  99.0  NaN    NaN    NaN  NaN  
2  26.6  999.0  999.0  99.0  99.0  NaN    NaN    NaN  NaN  
3  26.9  999.0  999.0  99.0  99.0  NaN    NaN    NaN  NaN  
4  27.0  999.0  999.0  99.0  99.0  NaN    NaN    NaN  NaN  

[5 rows x 22 columns]

Target Column: WVHT
Feature Columns: ['#YY', 'MM', 'DD', 'hh', 'mm', 'WDIR', 'WSPD', 'GST', 'DPD', 'APD', 'MWD', 'PRES', 'ATMP', 'WTMP', 'DEWP

Select only the time-related features

In [7]:
# Step 1: work on a copy so the loaded `data` frame is left untouched.
df_copy = data.copy()

# Step 2: keep only the date/time component columns.
columns_to_keep = ['#YY', 'MM', 'DD', 'hh', 'mm']
df_reduced = df_copy[columns_to_keep]

# Show the reduced frame together with its dimensions.
print(df_reduced)
reduced_rows, reduced_cols = df_reduced.shape
print("Number of rows:", reduced_rows)
print("Number of columns:", reduced_cols)

# Step 3: attach the 'WVHT' target as an extra column, aligned on the row index.
df_merged = df_reduced.assign(WVHT=data['WVHT'])

# Step 4: write the result to CSV without the index column.
output_file = "merged_TimeWVHT_data.csv"
df_merged.to_csv(output_file, index=False)

# Confirm the export and show the merged frame.
print(f"Merged DataFrame saved to {output_file}")
print(df_merged)

           #YY    MM    DD    hh    mm
0          NaN   NaN   NaN   NaN   NaN
1       2020.0   1.0   1.0   0.0   0.0
2       2020.0   1.0   1.0   0.0  10.0
3       2020.0   1.0   1.0   0.0  20.0
4       2020.0   1.0   1.0   0.0  30.0
...        ...   ...   ...   ...   ...
206915  2023.0  12.0  31.0  23.0  10.0
206916  2023.0  12.0  31.0  23.0  20.0
206917  2023.0  12.0  31.0  23.0  30.0
206918  2023.0  12.0  31.0  23.0  40.0
206919  2023.0  12.0  31.0  23.0  50.0

[206920 rows x 5 columns]
Number of rows: 206920
Number of columns: 5
Merged DataFrame saved to merged_TimeWVHT_data.csv
           #YY    MM    DD    hh    mm  WVHT
0          NaN   NaN   NaN   NaN   NaN   NaN
1       2020.0   1.0   1.0   0.0   0.0   1.3
2       2020.0   1.0   1.0   0.0  10.0  99.0
3       2020.0   1.0   1.0   0.0  20.0  99.0
4       2020.0   1.0   1.0   0.0  30.0  99.0
...        ...   ...   ...   ...   ...   ...
206915  2023.0  12.0  31.0  23.0  10.0  99.0
206916  2023.0  12.0  31.0  23.0  20.0  99.0
20691