<a href="https://colab.research.google.com/github/hoangthuha/researchproject/blob/main/Project_B_Full_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 - Import Libraries


In [272]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from scipy.stats import norm
import warnings
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

#2 - Read the Dataset

In [273]:
# Mount Google Drive to access all files
from google.colab import drive
drive.mount('/content/drive')
full_data = pd.read_csv("/content/drive/MyDrive/DS Capstone/Dataset/RFLFSODataFull.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [274]:
print("Full_data shape:",full_data.shape)

Full_data shape: (91379, 27)


# 3 - Data Preprocessing

In [275]:
def process_data(process_df):

    # Change unit of Frequency to GHz
    process_df['Frequency'] = process_df['Frequency'] / 10000000000

    # Encode SYNOP Code column
    process_df = pd.concat([process_df, pd.get_dummies(process_df["SYNOPCode"], prefix="SYNOP")], axis=1)
    process_df.drop("SYNOPCode", axis = 1, inplace = True)
    process_df = process_df.dropna()

    return process_df

In [276]:
full_data = process_data(full_data)

In [277]:
y = full_data[["FSO_Att","RFL_Att"]]
y.head()

Unnamed: 0,FSO_Att,RFL_Att
0,7.913289,6.927868
1,7.451176,4.412096
2,7.072747,6.26874
3,6.949288,4.317853
4,7.361052,6.114514


In [278]:
X = full_data.drop(columns = ["FSO_Att","RFL_Att"])
X.head()

Unnamed: 0,AbsoluteHumidity,AbsoluteHumidityMax,AbsoluteHumidityMin,Distance,Frequency,Particulate,ParticulateMax,ParticulateMin,RainIntensity,RainIntensityMax,...,WindSpeed,WindSpeedMax,WindSpeedMin,SYNOP_0,SYNOP_3,SYNOP_4,SYNOP_5,SYNOP_6,SYNOP_7,SYNOP_8
0,17.595709,17.615907,17.340148,2115.338398,8.35,0.0,0.0,0.0,0.0,0.0,...,3.057066,6.863808,3.007939,1,0,0,0,0,0,0
1,17.549693,17.572415,17.299439,2113.999257,7.35,0.0,0.0,0.0,0.0,0.0,...,2.72791,6.468903,2.537393,1,0,0,0,0,0,0
2,17.29023,17.644014,16.037894,2118.689047,8.35,0.0,0.0,0.0,0.0,0.0,...,1.67481,2.826916,1.640809,1,0,0,0,0,0,0
3,16.82088,17.066776,15.895622,2114.632339,7.35,0.0,0.0,0.0,0.0,0.0,...,0.962068,2.780643,0.886951,1,0,0,0,0,0,0
4,16.81382,17.953974,15.227225,2116.786055,8.35,0.0,0.0,0.0,0.0,0.0,...,1.881007,4.476298,1.874052,1,0,0,0,0,0,0


In [279]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23)

In [280]:
print('X_train: ', X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)

X_train:  (63965, 31)
X_test:  (27414, 31)
y_train:  (63965, 2)
y_test:  (27414, 2)


In [281]:
y_train_FSO = y_train["FSO_Att"]
y_train_FSO = y_train_FSO.to_numpy().reshape(-1,1)

y_test_FSO = y_test["FSO_Att"]
y_test_FSO = y_test_FSO.to_numpy().reshape(-1,1)

y_train_RFL = y_train["RFL_Att"]
y_train_RFL = y_train_RFL.to_numpy().reshape(-1,1)

y_test_RFL = y_test["RFL_Att"]
y_test_RFL = y_test_RFL.to_numpy().reshape(-1,1)

In [282]:
print(y_train_FSO.shape)
print(y_test_FSO.shape)
print(y_train_RFL.shape)
print(y_test_RFL.shape)

(63965, 1)
(27414, 1)
(63965, 1)
(27414, 1)


In [283]:
def predict(model, X_test_scaled, target_scaler):
    y_pred = model.predict(X_test_scaled).reshape(-1,1)
    # inverse transform
    y_pred = target_scaler.inverse_transform(y_pred)
    return y_pred

def calculate_metrics(y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred , squared=False)
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    return {"rmse": rmse,
            "mse": mse,
            "mae": mae,
            "r2" : r2,
            }

def perform_random_forest_regression(X_train, y_train):
    regressor = RandomForestRegressor(random_state=0, oob_score=True, n_jobs=4)
    param_grid = {
    'n_estimators': [10],
    'max_depth': [20]
    }
    grid_regr = GridSearchCV(estimator = regressor, param_grid = param_grid, cv = 3)
    grid_regr.fit(X_train, y_train)

    model = grid_regr.best_estimator_
    best_params = grid_regr.best_params_

    return model


    # # estimator.feature_importances_
    # y_pred = model.predict(X_test).reshape(-1,1)
    # # inverse transform
    # y_pred = target_scaler.inverse_transform(y_pred)




In [284]:
all_metrics = []

## Method 1

In [285]:
feature_scaler = StandardScaler().fit(X_train)
X_train_scaled = feature_scaler.transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

target_scaler_FSO = StandardScaler().fit(y_train_FSO)
y_train_FSO_scaled = target_scaler_FSO.transform(y_train_FSO)

target_scaler_RFL = StandardScaler().fit(y_train_RFL)
y_train_RFL_scaled = target_scaler_RFL.transform(y_train_RFL)



In [286]:
method_1_random_forest_1_rfl = perform_random_forest_regression(X_train_scaled, y_train_RFL_scaled)
method_1_random_forest_1_rfl

y_pred_method_1_random_forest_1_rfl = predict(method_1_random_forest_1_rfl, X_test_scaled, target_scaler_RFL)

method_1_random_forest_1_rfl_metrics = calculate_metrics(y_test_RFL, y_pred_method_1_random_forest_1_rfl)
method_1_random_forest_1_rfl_metrics['method'] = 'method_1_random_forest_1_rfl'
all_metrics.append(method_1_random_forest_1_rfl_metrics)
print(method_1_random_forest_1_rfl_metrics)

# y_pred_method_1_random_forest_1_rfl = method_1_random_forest_1_rfl.predict(X_test_scaled).reshape(-1,1)
# # inverse transform
# y_pred_method_1_random_forest_1_rfl = target_scaler_RFL.inverse_transform(y_pred_method_1_random_forest_1_rfl)
# method_1_random_forest_1_rfl_metrics = calculate_metrics(y_test_RFL, y_pred_method_1_random_forest_1_rfl)
# method_1_random_forest_1_rfl_metrics

{'rmse': 0.5618192256624941, 'mse': 0.31564084232400447, 'mae': 0.2502252203092496, 'r2': 0.9738862007539714, 'method': 'method_1_random_forest_1_rfl'}


In [287]:
method_1_random_forest_2_fso = perform_random_forest_regression(X_train_scaled, y_train_FSO_scaled)
method_1_random_forest_2_fso

y_pred_method_1_random_forest_2_fso = predict(method_1_random_forest_2_fso, X_test_scaled, target_scaler_FSO)

method_1_random_forest_2_fso_metrics = calculate_metrics(y_test_FSO, y_pred_method_1_random_forest_2_fso)
method_1_random_forest_2_fso_metrics['method'] = 'method_1_random_forest_2_fso'
all_metrics.append(method_1_random_forest_2_fso_metrics)
print(method_1_random_forest_2_fso_metrics)

{'rmse': 0.9395515212955169, 'mse': 0.8827570611687201, 'mae': 0.531647848737279, 'r2': 0.9422826665311054, 'method': 'method_1_random_forest_2_fso'}


## Method 2

In [288]:
RFL_pred_train = predict(method_1_random_forest_1_rfl, X_train_scaled, target_scaler_RFL)

In [289]:
X_train_RFL = X_train.copy()
X_train_RFL["RFL_Att_predicted"] = RFL_pred_train
X_train_RFL.head()

Unnamed: 0,AbsoluteHumidity,AbsoluteHumidityMax,AbsoluteHumidityMin,Distance,Frequency,Particulate,ParticulateMax,ParticulateMin,RainIntensity,RainIntensityMax,...,WindSpeedMax,WindSpeedMin,SYNOP_0,SYNOP_3,SYNOP_4,SYNOP_5,SYNOP_6,SYNOP_7,SYNOP_8,RFL_Att_predicted
47473,8.377091,8.684211,7.584667,2021.203672,7.35,91.514433,93.1288,83.814979,0.0,0.0,...,3.237423,0.094798,0,0,0,0,1,0,0,12.748948
4567,19.925557,20.412621,17.942796,2118.082177,7.35,0.0,0.0,0.0,0.0,0.0,...,2.312692,1.261913,1,0,0,0,0,0,0,5.162695
85060,9.003206,9.397796,8.868819,2583.727829,8.35,0.0,0.0,0.0,0.0,0.0,...,2.874901,1.521214,1,0,0,0,0,0,0,12.516556
76029,6.077423,6.162482,5.713748,4827.922307,7.35,10.038827,10.358579,9.061735,0.0,0.0,...,1.309578,0.290856,0,0,0,0,1,0,0,12.353406
3367,19.438362,19.536744,18.330538,2120.799892,7.35,662.920204,664.300184,612.616428,0.0,0.0,...,1.585376,1.033983,0,0,0,0,1,0,0,7.562931


In [290]:
RFL_pred_test = predict(method_1_random_forest_1_rfl, X_test_scaled, target_scaler_RFL)

In [291]:
X_test_RFL = X_test.copy()
X_test_RFL["RFL_Att_predicted"] = RFL_pred_test
X_test_RFL.head()

Unnamed: 0,AbsoluteHumidity,AbsoluteHumidityMax,AbsoluteHumidityMin,Distance,Frequency,Particulate,ParticulateMax,ParticulateMin,RainIntensity,RainIntensityMax,...,WindSpeedMax,WindSpeedMin,SYNOP_0,SYNOP_3,SYNOP_4,SYNOP_5,SYNOP_6,SYNOP_7,SYNOP_8,RFL_Att_predicted
76065,6.305031,6.484345,6.13525,4822.99886,7.35,14.066137,14.767428,13.311811,0.0,0.0,...,0.140447,0.100434,0,0,0,0,1,0,0,12.459607
3141,19.354166,20.268171,19.247927,2113.161004,7.35,160.246757,172.162877,151.487151,1.309431,1.331148,...,2.407073,0.986797,0,0,0,0,1,0,0,9.66389
90587,19.588024,20.515327,18.769553,3964.652174,7.35,0.0,0.0,0.0,0.0,0.0,...,3.273064,0.969761,1,0,0,0,0,0,0,10.056153
90374,21.447144,22.4792,20.473301,3971.636549,8.35,0.0,0.0,0.0,0.0,0.0,...,4.257771,1.875453,1,0,0,0,0,0,0,10.762591
1553,18.041652,19.310742,16.272746,2117.867268,7.35,0.0,0.0,0.0,0.0,0.0,...,5.573189,4.281256,1,0,0,0,0,0,0,4.366363


In [292]:
feature_scaler_rfl = StandardScaler().fit(X_train_RFL)
X_train_RFL_scaled = feature_scaler_rfl.transform(X_train_RFL)
X_test_RFL_scaled = feature_scaler_rfl.transform(X_test_RFL)

method_2_random_forest_3_rfl = perform_random_forest_regression(X_train_RFL_scaled, y_train_FSO)
method_2_random_forest_3_rfl

y_pred_method_2_random_forest_3_rfl = predict(method_2_random_forest_3_rfl, X_test_RFL_scaled, target_scaler_RFL)

method_2_random_forest_3_rfl_metrics = calculate_metrics(y_test_RFL, y_pred_method_2_random_forest_3_rfl)
method_2_random_forest_3_rfl_metrics['method'] = 'method_2_random_forest_3_rfl'
all_metrics.append(method_2_random_forest_3_rfl_metrics)
print(method_2_random_forest_3_rfl_metrics)

{'rmse': 26.618357941811635, 'mse': 708.5369795184065, 'mae': 23.214134997894632, 'r2': -57.61913276273133, 'method': 'method_2_random_forest_3_rfl'}


## Method 3

In [293]:
FSO_pred_train = predict(method_1_random_forest_2_fso, X_train_scaled, target_scaler_FSO)

In [294]:
X_train_FSO = X_train.copy()
X_train_FSO["FSO_Att_predicted"] = FSO_pred_train
X_train_FSO.head()

Unnamed: 0,AbsoluteHumidity,AbsoluteHumidityMax,AbsoluteHumidityMin,Distance,Frequency,Particulate,ParticulateMax,ParticulateMin,RainIntensity,RainIntensityMax,...,WindSpeedMax,WindSpeedMin,SYNOP_0,SYNOP_3,SYNOP_4,SYNOP_5,SYNOP_6,SYNOP_7,SYNOP_8,FSO_Att_predicted
47473,8.377091,8.684211,7.584667,2021.203672,7.35,91.514433,93.1288,83.814979,0.0,0.0,...,3.237423,0.094798,0,0,0,0,1,0,0,4.705838
4567,19.925557,20.412621,17.942796,2118.082177,7.35,0.0,0.0,0.0,0.0,0.0,...,2.312692,1.261913,1,0,0,0,0,0,0,9.980338
85060,9.003206,9.397796,8.868819,2583.727829,8.35,0.0,0.0,0.0,0.0,0.0,...,2.874901,1.521214,1,0,0,0,0,0,0,13.093149
76029,6.077423,6.162482,5.713748,4827.922307,7.35,10.038827,10.358579,9.061735,0.0,0.0,...,1.309578,0.290856,0,0,0,0,1,0,0,4.92119
3367,19.438362,19.536744,18.330538,2120.799892,7.35,662.920204,664.300184,612.616428,0.0,0.0,...,1.585376,1.033983,0,0,0,0,1,0,0,24.425689


In [295]:
FSO_pred_test = predict(method_1_random_forest_2_fso, X_test_scaled, target_scaler_FSO)

In [296]:
X_test_FSO = X_test.copy()
X_test_FSO["FSO_Att_predicted"] = FSO_pred_test
X_test_FSO.head()

Unnamed: 0,AbsoluteHumidity,AbsoluteHumidityMax,AbsoluteHumidityMin,Distance,Frequency,Particulate,ParticulateMax,ParticulateMin,RainIntensity,RainIntensityMax,...,WindSpeedMax,WindSpeedMin,SYNOP_0,SYNOP_3,SYNOP_4,SYNOP_5,SYNOP_6,SYNOP_7,SYNOP_8,FSO_Att_predicted
76065,6.305031,6.484345,6.13525,4822.99886,7.35,14.066137,14.767428,13.311811,0.0,0.0,...,0.140447,0.100434,0,0,0,0,1,0,0,4.823496
3141,19.354166,20.268171,19.247927,2113.161004,7.35,160.246757,172.162877,151.487151,1.309431,1.331148,...,2.407073,0.986797,0,0,0,0,1,0,0,13.975916
90587,19.588024,20.515327,18.769553,3964.652174,7.35,0.0,0.0,0.0,0.0,0.0,...,3.273064,0.969761,1,0,0,0,0,0,0,8.348615
90374,21.447144,22.4792,20.473301,3971.636549,8.35,0.0,0.0,0.0,0.0,0.0,...,4.257771,1.875453,1,0,0,0,0,0,0,9.348194
1553,18.041652,19.310742,16.272746,2117.867268,7.35,0.0,0.0,0.0,0.0,0.0,...,5.573189,4.281256,1,0,0,0,0,0,0,10.678901


In [297]:
feature_scaler_fso = StandardScaler().fit(X_train_FSO)
X_train_FSO_scaled = feature_scaler_fso.transform(X_train_FSO)
X_test_FSO_scaled = feature_scaler_fso.transform(X_test_FSO)

method_3_random_forest_4_fso = perform_random_forest_regression(X_train_FSO_scaled, y_train_RFL)
method_3_random_forest_4_fso

y_pred_method_3_random_forest_4_fso = predict(method_3_random_forest_4_fso, X_test_FSO_scaled, target_scaler_FSO)

method_3_random_forest_4_fso_metrics = calculate_metrics(y_test_FSO, y_pred_method_3_random_forest_4_fso)
method_3_random_forest_4_fso_metrics['method'] = 'method_3_random_forest_4_fso'
all_metrics.append(method_3_random_forest_4_fso_metrics)
print(method_3_random_forest_4_fso_metrics)

{'rmse': 47.2023334277485, 'mse': 2228.0602810243436, 'mae': 45.22185726815742, 'r2': -144.67733738478987, 'method': 'method_3_random_forest_4_fso'}


In [298]:
df = pd.DataFrame(all_metrics)
df

Unnamed: 0,rmse,mse,mae,r2,method
0,0.561819,0.315641,0.250225,0.973886,method_1_random_forest_1_rfl
1,0.939552,0.882757,0.531648,0.942283,method_1_random_forest_2_fso
2,26.618358,708.53698,23.214135,-57.619133,method_2_random_forest_3_rfl
3,47.202333,2228.060281,45.221857,-144.677337,method_3_random_forest_4_fso


In [299]:
# # add new columns to X, will be used later in method 2
# print('Shape of X ', X.shape)
# X_FSO = X.copy()
# X_FSO["RFL_Att_predicted"] = y['RFL_Att']
# print('After adding FSO to X ', X_FSO.shape)
# X_FSO.head()