In [130]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler


feature_file_paths = [
    'dataset/46013h2014.txt', 'dataset/46013h2015.txt', 'dataset/46013h2016.txt', 'dataset/46013h2017.txt', 'dataset/46013h2018.txt', 'dataset/46013h2019.txt', 'dataset/46013h2021.txt', 'dataset/46013h2022.txt',
    'dataset/46026h2014.txt', 'dataset/46026h2015.txt', 'dataset/46026h2016.txt', 'dataset/46026h2017.txt', 'dataset/46026h2018.txt', 'dataset/46026h2019.txt', 'dataset/46026h2020.txt', 'dataset/46026h2021.txt', 'dataset/46026h2022.txt',
    'dataset/46237h2014.txt', 'dataset/46237h2015.txt', 'dataset/46237h2016.txt', 'dataset/46237h2017.txt', 'dataset/46237h2018.txt', 'dataset/46237h2019.txt', 'dataset/46237h2020.txt', 'dataset/46237h2021.txt', 'dataset/46237h2022.txt',
    'dataset/ftpc1h2014.txt', 'dataset/ftpc1h2015.txt', 'dataset/ftpc1h2016.txt', 'dataset/ftpc1h2017.txt', 'dataset/ftpc1h2018.txt', 'dataset/ftpc1h2019.txt', 'dataset/ftpc1h2020.txt', 'dataset/ftpc1h2021.txt', 'dataset/ftpc1h2022.txt',
    'dataset/pxoc1h2014.txt', 'dataset/pxoc1h2015.txt', 'dataset/pxoc1h2016.txt', 'dataset/pxoc1h2017.txt', 'dataset/pxoc1h2018.txt', 'dataset/pxoc1h2019.txt', 'dataset/pxoc1h2020.txt', 'dataset/pxoc1h2021.txt', 'dataset/pxoc1h2022.txt',
    'dataset/pxsc1h2014.txt', 'dataset/pxsc1h2015.txt', 'dataset/pxsc1h2016.txt', 'dataset/pxsc1h2017.txt', 'dataset/pxsc1h2018.txt', 'dataset/pxsc1h2019.txt', 'dataset/pxsc1h2020.txt', 'dataset/pxsc1h2021.txt', 'dataset/pxsc1h2022.txt',
    'dataset/tibc1h2015.txt', 'dataset/tibc1h2016.txt', 'dataset/tibc1h2017.txt', 'dataset/tibc1h2018.txt', 'dataset/tibc1h2019.txt', 'dataset/tibc1h2020.txt', 'dataset/tibc1h2021.txt', 'dataset/tibc1h2022.txt'
]


# dataframe for all feature data
all_feature_data = pd.DataFrame()

# load all feature files and create timestamp column
for feature_file_path in feature_file_paths:
    feature_data = pd.read_csv(feature_file_path, delim_whitespace=True, skiprows=[1])  
    feature_data['timestamp'] = pd.to_datetime(feature_data[['#YY', 'MM', 'DD', 'hh']].astype(str).agg(' '.join, axis=1), format='%Y %m %d %H')
    feature_data['year'] = feature_data['timestamp'].dt.year
    feature_data['month'] = feature_data['timestamp'].dt.month
    feature_data['day'] = feature_data['timestamp'].dt.day
    feature_data['hour'] = feature_data['timestamp'].dt.hour
    all_feature_data = pd.concat([all_feature_data, feature_data], axis=0, ignore_index=True)

    
print(all_feature_data.shape[0])



# Define the missing value patterns
missing_patterns = [99.00, 999, 999.0, 99.0]

# Loop through each column and replace each pattern with NaN
for column in all_feature_data.columns:
    for pattern in missing_patterns:
        all_feature_data[column] = all_feature_data[column].replace(to_replace=pattern, value=np.nan, regex=True)

missing_data = all_feature_data.isnull().sum()

print(missing_data)
    
# Setting threshold for excessive missing values (e.g., 60%)
threshold = 0.6 * len(all_feature_data)

# Drop columns with missing values greater than the threshold
all_feature_data.dropna(axis=1, thresh=threshold, inplace=True)

# Drop rows with any missing values
all_feature_data.dropna(axis=0, inplace=True)







# target file path and read the file
target_file_path = 'dataset/storm_data_search_results.csv' 
target_data = pd.read_csv(target_file_path, sep=',') 

# the columns below are useful for target
selected_columns = ['BEGIN_DATE', 'BEGIN_TIME', 'EVENT_TYPE']
target_data = target_data[selected_columns]

# convert event type values to yes
target_data['EVENT_TYPE'] = 'yes'



# create timestamp column for the target file
target_data['timestamp'] = pd.to_datetime(target_data[['BEGIN_DATE', 'BEGIN_TIME']].astype(str).agg(' '.join, axis=1), format='%m/%d/%Y %H%M')
target_data['year'] = target_data['timestamp'].dt.year
target_data['month'] = target_data['timestamp'].dt.month
target_data['day'] = target_data['timestamp'].dt.day
target_data['hour'] = target_data['timestamp'].dt.hour

#print(target_data['hour'])

#target_data['EVENT_TYPE'].fillna('no', inplace=True)


'''label_encoder = LabelEncoder()

for column in target_data.select_dtypes(include=['object']).columns:
    target_data[column] = label_encoder.fit_transform(target_data[column])'''


# combine feature and target files
all_data = pd.merge(all_feature_data, target_data, how='left', on='timestamp')
all_data['EVENT_TYPE'].fillna('no', inplace=True)
#all_data['EVENT_TYPE'] = all_data['EVENT_TYPE'].replace(['Thunderstorm Wind', 'Storm Surge/Tide'], 'yes')
'''all_data['EVENT_TYPE'] = all_data['EVENT_TYPE'].replace({'Flood': 'yes', 'Hail': 'yes', 'Strong Wind': 'yes', 'Rip Current': 'yes',
                                                          'Wildfire': 'yes', 'Thunderstorm Wind': 'yes', 'Dense Smoke': 'yes',
                                                          'Excessive Heat': 'yes', 'Storm Surge/Tide': 'yes', 'Coastal Flood': 'yes'})'''

all_data = all_data.drop(['year_x', 'month_x', 'day_x', 'hour_x', 'year_y', 'month_y', 'day_y', 'hour_y'], axis=1)



print(all_data)



3111213
#YY                0
MM                 0
DD                 0
hh                 0
mm                 0
WDIR          992810
WSPD          986877
GST          1201764
WVHT         2829788
DPD          2829788
APD          2829787
MWD          2852039
PRES             179
ATMP          466734
WTMP         1634345
DEWP         2312132
VIS          2425237
TIDE         3111213
timestamp          0
year               0
month              0
day                0
hour               0
dtype: int64
          #YY  MM  DD  hh  mm   WDIR  WSPD   GST    PRES  ATMP  \
0        2014   1   1   0  50  313.0   5.7   6.8  1022.6  10.2   
1        2014   1   1   1  50  315.0   5.0   6.1  1022.4  10.2   
2        2014   1   1   2  50  319.0   5.6   6.6  1022.3  10.3   
3        2014   1   1   3  50  318.0   4.6   5.7  1022.4  10.4   
4        2014   1   1   4  50  314.0   4.2   5.1  1022.5  10.5   
...       ...  ..  ..  ..  ..    ...   ...   ...     ...   ...   
1677825  2022  12  31  23  36  317

In [131]:
# drop columns that is not useful for modeling. Define feature and target
X = all_data.drop(['EVENT_TYPE','timestamp','BEGIN_DATE', 'BEGIN_TIME','#YY','MM',  'DD',  'hh',  'mm'], axis=1)  
y = all_data['EVENT_TYPE'] 

yes_count = all_data['EVENT_TYPE'].value_counts().get('yes', 0)
print(yes_count)

995


In [132]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#start undersampling
train_data = pd.concat([X_train, y_train], axis=1)

# look for the minority class
minority_class_label = train_data['EVENT_TYPE'].value_counts().idxmin()

# apply random undersampling
undersampler = RandomUnderSampler(sampling_strategy='auto')
X_resampled, y_resampled = undersampler.fit_resample(train_data.drop('EVENT_TYPE', axis=1), train_data['EVENT_TYPE'])
print(y_resampled.value_counts())

# train using undersampled data
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_resampled, y_resampled)

# make predictions
test_predictions = model.predict(X_test)

# make evaluation 
accuracy = accuracy_score(y_test, test_predictions)
print(f'Accuracy on test set: {accuracy}')








  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


EVENT_TYPE
no     796
yes    796
Name: count, dtype: int64


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Accuracy on test set: 0.8504794883867853
