In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

In [None]:
# Load the modelling table, keep the three predictors plus the target, and build
# the train/test split.
# NOTE(review): DATA_PATH is an absolute, machine-specific path — it only resolves
# on the original author's workstation; point it at your local copy of output.csv.
DATA_PATH = r'C:\Users\kmebr\Documents\data_analytics_project_fall_2023\DAEN-Contrail-Preventers\data\output.csv'
FEATURE_COLS = ['TEMP(F)', 'RH_ICE', 'PRESS']
TARGET_COL = '30_pred'
SEED = 1693  # one seed for the split and for SMOTE so a re-run rebuilds the same training set

df = pd.read_csv(DATA_PATH)
df = df[FEATURE_COLS + [TARGET_COL]]
X_train, X_test, y_train, y_test = train_test_split(
    df[FEATURE_COLS], df[TARGET_COL], test_size=0.2, random_state=SEED
)

# Oversample the training split only; X_test / y_test keep their original class balance.
# SMOTE synthesises samples at random, so it is seeded to keep the resampled set reproducible.
oversample = SMOTE(random_state=SEED)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [None]:
# Hyperparameter grids consumed by the GridSearchCV cells below.

# Random forest. An integer min_samples_split must be >= 2; the previous value 1
# is rejected by scikit-learn, so every candidate using it failed to fit.
param_rf = {
    'max_depth': [3, 5, 10, 15, 20],
    'n_estimators': [10, 50, 100, 150, 200, 250, 300],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4],
}

# Support vector classifier.
param_svm = {
    'C': [0.1, 1, 10],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}

# XGBoost.
param_xgboost = {
    'max_depth': range(2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05],
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Multilayer perceptron. MLPClassifier names the sigmoid activation 'logistic';
# 'sigmoid' is not an accepted value and made a third of the grid error out.
param_mlp = {
    'hidden_layer_sizes': [
        (150, 100, 50, 25, 10),
        (120, 80, 40, 20, 10),
        (100, 50, 30, 15),
        (150, 100, 50),
        (120, 80, 40),
        (100, 50, 30),
        (150, 100, 50, 25),
        (120, 80, 40, 20),
    ],
    'max_iter': [50, 100, 150, 300, 400],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

In [None]:
# Exhaustive search over param_rf, ranking candidates by F1 with GridSearchCV's
# default cross-validation. The forest is seeded so best_params_ / best_score_
# are repeatable across runs.
# NOTE(review): X_train was SMOTE-resampled before this search, so validation
# folds may contain synthetic samples and best_score_ is likely optimistic — confirm.
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=1693),
    param_grid=param_rf,
    scoring='f1',
    verbose=3,
)
model_rf = grid_rf.fit(X_train, y_train)  # fit() returns the fitted search object
print(model_rf.best_params_)
print(model_rf.best_score_)

In [None]:
# Fit the random forest with a fixed hyperparameter set (presumably the winner of
# the grid search above — confirm), replacing the search object bound to model_rf.
# Seeded because the downstream ROC / PR / gain / lift plots are drawn from this
# model and would otherwise change on every run.
model_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=1693,
).fit(X_train, y_train)

In [None]:
# Grid-search an SVC over param_svm, ranking candidates by F1 with GridSearchCV's
# default cross-validation, then report the winning settings and their score.
# NOTE(review): the search is run on the unscaled X_train — confirm that is intended.
grid_svm = GridSearchCV(
    estimator=SVC(),
    param_grid=param_svm,
    scoring='f1',
    verbose=3,
)
model_svm = grid_svm.fit(X_train, y_train)  # fit() returns the fitted search object
print(model_svm.best_params_, model_svm.best_score_, sep='\n')

In [None]:
# Grid-search XGBoost over param_xgboost, ranking candidates by F1.
# The search runs 10 parallel jobs and each classifier is given nthread=4.
grid_xgboost = GridSearchCV(
    estimator=XGBClassifier(objective='binary:logistic', nthread=4),
    param_grid=param_xgboost,
    scoring='f1',
    n_jobs=10,
    verbose=3,
)
model_xgboost = grid_xgboost.fit(X_train, y_train)  # fit() returns the fitted search object
print(model_xgboost.best_params_, model_xgboost.best_score_, sep='\n')

In [None]:
# Fit XGBoost with a fixed hyperparameter set (presumably the grid-search winner —
# confirm), replacing the search object previously bound to model_xgboost.
model_xgboost = XGBClassifier(
    objective='binary:logistic',
    nthread=4,
    n_estimators=140,
    max_depth=7,
    learning_rate=0.1,
    min_child_weight=1,
    gamma=0.5,
    subsample=0.8,
    colsample_bytree=1,
).fit(X_train, y_train)

In [None]:
# Standardise the features with statistics learned from the training split only,
# then grid-search the MLP on the scaled data, ranking candidates by F1.
# NOTE(review): the scaler is fitted on the whole training split before
# cross-validation, so each validation fold has influenced the scaling — confirm
# this is acceptable for model selection.
sc = StandardScaler()
scaler = sc.fit(X_train)  # fit() returns the scaler itself; both names are kept
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Seeded so weight initialisation / shuffling — and therefore best_params_ — are repeatable.
grid_mlp = GridSearchCV(
    MLPClassifier(random_state=1693),
    param_grid=param_mlp,
    n_jobs=-1,
    scoring='f1',
    verbose=3,
)
model_mlp = grid_mlp.fit(X_train_scaled, y_train)
print(model_mlp.best_params_)
print(model_mlp.best_score_)

In [None]:
# Fit the MLP with a fixed hyperparameter set (presumably the grid-search winner — confirm).
# The scaling is rebuilt here, which lets this cell run without the grid-search cell.
sc = StandardScaler()
scaler = sc.fit(X_train)  # statistics come from the training split only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Seeded because the confusion matrix and F1 reported afterwards come from this
# model and would otherwise change on every run.
model_mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50, 30, 15),
    activation='relu',
    solver='adam',
    alpha=.0001,
    learning_rate='adaptive',
    max_iter=400,
    random_state=1693,
).fit(X_train_scaled, y_train)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the fitted MLP on the held-out test split
# (rows = observed class, columns = predicted class).
# It is computed from the live model rather than pasted-in counts, so it always
# matches the model scored by f1_score, and it is an ndarray so cm[i, j] works.
cm = confusion_matrix(y_test, model_mlp.predict(X_test_scaled))

# Theme settings only affect plots drawn afterwards, so set the font scale first;
# otherwise the figure looks different on the first run than on a re-run.
sns.set(font_scale=2.0)

ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap="Blues");  # annot=True annotates cells, fmt='g' disables scientific notation

# Title, axis labels and ticks.
title_font = {'size': '21'}
label_font = {'size': '18'}  # Adjust to fit
ax.set_title('Multilayer Perceptron', fontdict=title_font);
ax.set_xlabel('Predicted Labels', fontdict=label_font);
ax.set_ylabel('Observed Labels', fontdict=label_font);
# Tick labels assume class 0 = no contrail and class 1 = contrail — TODO confirm.
ax.xaxis.set_ticklabels(['No Contrail', 'Contrail']);
ax.yaxis.set_ticklabels(['No Contrail', 'Contrail']);
ax.tick_params(axis='both', which='major', labelsize=18)

In [None]:
# Overall accuracy = correctly classified samples (the diagonal) / all samples.
# np.asarray lets this work whether cm is an ndarray or a plain nested list;
# tuple indexing such as cm[0, 0] raises TypeError on a list.
cm_counts = np.asarray(cm)
accuracy = np.trace(cm_counts) / cm_counts.sum()

print(accuracy)

In [None]:
# F1 of the MLP on the held-out test split (predictions made on the scaled features).
f1_score(
    y_true=y_test,
    y_pred=model_mlp.predict(X_test_scaled),
)

In [None]:
import scikitplot as skplt

In [None]:
# ROC curves for the random forest, from its class probabilities on the test split.
skplt.metrics.plot_roc(
    y_true=y_test,
    y_probas=model_rf.predict_proba(X_test),
    title='ROC Curve',
)

In [None]:
# Precision-recall curves for the random forest, from its class probabilities on the test split.
skplt.metrics.plot_precision_recall(
    y_true=y_test,
    y_probas=model_rf.predict_proba(X_test),
    title='Precision-Recall Curve',
)

In [None]:
# Cumulative gains chart for the random forest, from its class probabilities on the test split.
skplt.metrics.plot_cumulative_gain(
    y_true=y_test,
    y_probas=model_rf.predict_proba(X_test),
    title='Cumulative Gains Chart',
)

In [None]:
# Lift curve for the random forest, from its class probabilities on the test split.
skplt.metrics.plot_lift_curve(
    y_true=y_test,
    y_probas=model_rf.predict_proba(X_test),
    title='Lift Curve',
)