In [None]:
import pandas as pd
import os
import pickle
import re
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import validation_curve, GridSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import learning_curve, validation_curve

In [None]:
# Read the train and test splits from their CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../DataSet/train_raw4_trans.csv',
                     '../DataSet/test_raw4_trans.csv')
)


In [None]:
train.head()

In [None]:
# Remove FWI and the three FWI-based ratio columns from both splits (in place).
fwi_columns = ['FWI', 'FWI/FFMC', '(DMC/FWI)/ISI', 'FWI/BUI']
for frame in (train, test):
    frame.drop(columns=fwi_columns, inplace=True)



In [None]:
test.head()

In [None]:
# Cast the month_* columns and the target column to uint8 in both splits.
for frame in (train, test):
    for column in ('month_07', 'month_08', 'month_09', 'Classes_not fire'):
        frame[column] = frame[column].astype('uint8')

In [None]:
train.dtypes

In [None]:
test.dtypes

In [None]:
train['Classes_not fire'].unique()

In [None]:
test['Classes_not fire'].unique()

In [None]:
# Feature matrices: every column except the target.
# The target is removed by name (the same name used to build y_train/y_test)
# instead of by position, so a different column order in the CSV can never
# leak the label into the features.
X_train = train.drop(columns='Classes_not fire')
X_test = test.drop(columns='Classes_not fire')

X_train.head()

In [None]:
# Targets as 1-D Series for both splits. The test target used to be a
# one-column DataFrame (double brackets), which is inconsistent with y_train
# and hands a 2-D y to every metric call.
y_train = train['Classes_not fire']
y_test = test['Classes_not fire']
y_train.head()

## Model

In [None]:
# Base estimators handed to the grid search.
# Every estimator that draws random numbers while fitting gets a fixed seed so
# that a Restart & Run All reproduces the same tuned models.
RANDOM_STATE = 42

lr = LogisticRegression(random_state=RANDOM_STATE)
svc = SVC()
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=RANDOM_STATE)
adaBoost = AdaBoostClassifier(random_state=RANDOM_STATE)
gbt = GradientBoostingClassifier(random_state=RANDOM_STATE)

In [None]:
# Name -> estimator mapping; the names are the keys used in `param_grid`.
# Insertion order is the order in which the models are tuned.
clfs = dict([
    ('LogisticRegression', lr),
    ('SVC', svc),
    ('KNeighbors', knn),
    ('RandomForest', rf),
    ('AdaBoost', adaBoost),
    ('GradientBoostingTrees', gbt),
])

In [None]:
# Hyper-parameter grids for GridSearchCV, keyed by the names used in `clfs`.
#
# The logarithmic ranges are written as 1e-3, 1e-2, 1e-1, 1.0. The previous
# spelling (10e-3, 10e-2, 10e-1) evaluates to 0.01, 0.1, 1.0, so every such grid
# contained 1.0 twice and the search fitted each of those candidates twice.
param_grid = {

    'LogisticRegression': {'penalty': ['elasticnet'],
                           'solver': ['saga'],
                           'max_iter': [3000],
                           'l1_ratio': [1, 0.8, 0.6, 0.5, 0.4, 0.2, 0],
                           'C': [1e-3, 1e-2, 1e-1, 1.0],
                           'class_weight': ['balanced', None]},

    'SVC': {'C': [1e-3, 1e-2, 1e-1, 1.0],
            'kernel': ['linear', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto']},

    'KNeighbors': {'n_neighbors': [3, 5, 9, 11, 13, 15],  # n_neighbors should always be odd numbers
                   'weights': ['uniform', 'distance']},

    'RandomForest': {'n_estimators': [100, 300],
                     'criterion': ['gini', 'log_loss'],
                     'max_depth': [2, 4, 5, 6, 8, 10, 15, 20],
                     'class_weight': ['balanced', None]},

    'AdaBoost': {'n_estimators': [50, 60, 70, 90, 100, 110],
                 'learning_rate': [1e-3, 1e-2, 1e-1, 1.0]},

    'GradientBoostingTrees': {'loss': ['log_loss', 'exponential'],
                              'learning_rate': [1e-3, 1e-2, 1e-1, 1.0],
                              'n_estimators': [100, 200, 250],
                              'max_depth': [2, 3, 4, 5, 6, 7, 8, 9]},
}

In [None]:
def train_classifier(clf, parameters):
    """Grid-search ``clf`` over ``parameters`` on the notebook's training split.

    Runs a 5-fold ``GridSearchCV`` that scores every candidate with both F1 and
    ROC AUC and refits the best candidate by ROC AUC. The data comes from the
    notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn classifier.
    parameters : dict
        Parameter grid passed as ``param_grid``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search = GridSearchCV(
        estimator=clf,
        param_grid=parameters,
        scoring=['f1', 'roc_auc'],
        refit='roc_auc',
        cv=5,
        verbose=2,
    )
    search.fit(X_train, y_train)
    return search

In [None]:
# Tune every classifier in `clfs` and pickle each fitted search object as
# <MODEL_DIR>/tuned_<EstimatorClassName>_model.pkl.
MODEL_DIR = '../tuned_models_raw5_trans'

# Create the output directory once, up front; exist_ok avoids the
# check-then-create race and the per-iteration existence test.
os.makedirs(MODEL_DIR, exist_ok=True)

tuned_models = []

for name, clf in clfs.items():

    print("\nFor ", name)
    current_best_model = train_classifier(clf, param_grid[name])

    # The class name of the refitted estimator, e.g. "SVC"; this is the same
    # text the repr starts with, without parsing the repr by regex.
    estimator_name = type(current_best_model.best_estimator_).__name__
    filepath = os.path.join(MODEL_DIR, f'tuned_{estimator_name}_model.pkl')
    with open(filepath, 'wb') as file:
        pickle.dump(current_best_model, file)

    tuned_models.append(current_best_model)

In [None]:
tuned_models;

# Model Import & Model Scores

In [None]:
def _load_tuned_model(estimator_name, model_dir='../tuned_models_raw5_trans'):
    """Load one pickled search object saved as ``tuned_<estimator_name>_model.pkl``.

    The default directory is the one the training cell of this notebook writes
    to. The models used to be read from ``../tuned_models_raw4_trans``, i.e. not
    the files this notebook produces.

    NOTE: ``pickle.load`` executes code from the file; only load pickles that
    were produced by this project.
    """
    path = os.path.join(model_dir, f'tuned_{estimator_name}_model.pkl')
    # The context manager closes the file handle; a bare open() left it open.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


svm_model = _load_tuned_model('SVC')
AdaBoost_model = _load_tuned_model('AdaBoostClassifier')
GBDT_model = _load_tuned_model('GradientBoostingClassifier')
knn_model = _load_tuned_model('KNeighborsClassifier')
lr_model = _load_tuned_model('LogisticRegression')
RF_model = _load_tuned_model('RandomForestClassifier')

In [None]:
#svm_model = pickle.load(open('../tuned_models_raw/tuned_SVC_model.pkl','rb'))
svm_model.feature_names_in_

In [None]:
def _positive_class_scores(search, X):
    """Return continuous scores for the positive class.

    ROC AUC ranks samples by score, so it needs ``decision_function`` output or
    positive-class probabilities. Feeding it hard 0/1 predictions collapses the
    curve to a single operating point.
    """
    if hasattr(search, 'decision_function'):
        return search.decision_function(X)
    return search.predict_proba(X)[:, 1]


# Row order of the resulting table.
fitted_models = {
    'SVM': svm_model,
    'KNN': knn_model,
    'AdaBoost': AdaBoost_model,
    'GBDT': GBDT_model,
    'RF': RF_model,
    'LR': lr_model,
}

# np.ravel makes the targets 1-D regardless of whether they are a Series or a
# one-column DataFrame.
splits = {
    'train': (X_train, np.ravel(y_train)),
    'test': (X_test, np.ravel(y_test)),
}

# Metrics computed from predicted labels.
label_metrics = {
    'balanced_accuracy': balanced_accuracy_score,
    'f1': f1_score,
    'precision': precision_score,
    'accuracy': accuracy_score,
}

score_rows = []
for model_name, search in fitted_models.items():
    row = {'model': model_name, 'best_score': search.best_score_}
    for split_name, (X_split, y_split) in splits.items():
        # Predict once per model and split, then reuse for every label metric.
        y_pred = search.predict(X_split)
        for metric_name, metric in label_metrics.items():
            row[f'{metric_name}_{split_name}'] = metric(y_split, y_pred)
        row[f'roc_auc_{split_name}'] = roc_auc_score(
            y_split, _positive_class_scores(search, X_split))
    score_rows.append(row)

# Same column order as the CSV written by earlier versions of this cell.
score_columns = [
    'model', 'best_score',
    'balanced_accuracy_train', 'balanced_accuracy_test',
    'f1_train', 'f1_test',
    'roc_auc_train', 'roc_auc_test',
    'precision_train', 'precision_test',
    'accuracy_train', 'accuracy_test',
]
model_scores = pd.DataFrame(score_rows, columns=score_columns)

scores_dir = '../tuned_models_raw5_trans'
os.makedirs(scores_dir, exist_ok=True)  # the cell also works when training was skipped
model_scores.to_csv(os.path.join(scores_dir, 'model_scores.csv'), index=False)

# Highlight the best value of each numeric column (the model-name column is skipped).
model_scores.style.highlight_max(axis=0, subset=score_columns[1:])


In [None]:
############################################################################################
# Reload the saved score table, index it by model name, and split it into the
# train-side and test-side metric columns.
df = pd.read_csv('../tuned_models_raw5_trans//model_scores.csv').set_index('model')
df1 = df.loc[:, ['f1_train', 'roc_auc_train', 'balanced_accuracy_train',
                 'precision_train', 'accuracy_train']]
df2 = df.loc[:, ['f1_test', 'roc_auc_test', 'balanced_accuracy_test',
                 'precision_test', 'accuracy_test']]

#########################################################
import plotly.graph_objects as go


def multi_plot(df1, df2, title, addAll = True):
    """Draw an interactive grouped bar chart of model scores and export it as HTML.

    One bar trace is added per column of ``df1`` followed by one per column of
    ``df2``. A row of buttons lets the reader show everything or a single
    ``df2`` column. The figure is shown inline and written to
    ``../tuned_models_raw5_trans/Model_Scores.html``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        First group of score columns; the index supplies the x axis.
    df2 : pandas.DataFrame
        Second group of score columns; each column gets its own button.
    title : str
        Figure title; also restored by the show-all button.
    addAll : bool, default True
        Whether to prepend the show-all button (labelled 'Select').
    """
    fig = go.Figure()

    # Trace order is all df1 columns, then all df2 columns. The button
    # visibility masks below are built against exactly this order.
    for frame in (df1, df2):
        for column in frame.columns:
            fig.add_trace(go.Bar(x=frame.index, y=frame[column], name=column))

    n_first = len(df1.columns)
    n_second = len(df2.columns)

    def _button(label, visible, button_title):
        # `update` takes [trace_updates, layout_updates]; the title is a layout
        # attribute, so it belongs in the second dict.
        return dict(label=label,
                    method='update',
                    args=[{'visible': visible, 'showlegend': True},
                          {'title.text': button_title}])

    buttons = []
    if addAll:
        buttons.append(_button('Select', [True] * (n_first + n_second), title))
    for column in df2.columns:
        # One flag per trace in the figure. A mask covering only the df2
        # columns would be applied to the first traces (the df1 bars) and
        # leave the df2 bars untouched.
        mask = [False] * n_first + [name == column for name in df2.columns]
        buttons.append(_button(column, mask, column))

    fig.update_layout(
        updatemenus=[
            go.layout.Updatemenu(
                active=0,
                visible=True,
                buttons=buttons,
                direction="right",
                pad={"r": 5, "t": 5, "l": 5},
                showactive=True,
                x=-0.03,
                xanchor="left",
                y=1.1,
                yanchor="bottom"),
        ],
        yaxis_type="log",
        title_text=title,
        title_y=0.96,
        height=400,
        showlegend=True,
        legend=dict(yanchor="bottom",
                    y=-0.5,
                    xanchor="center",
                    x=0.5,
                    orientation='h'),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        margin_autoexpand=True,
        autosize=True,
    )

    fig.show(scale=200, config= dict(displayModeBar = False))

    # Export the interactive figure. Writing through `fig` directly avoids
    # depending on a loop-local alias that is undefined when df1 has no columns.
    fig.write_html('../tuned_models_raw5_trans/Model_Scores.html',config= dict(displayModeBar = False))
        
########################################################################################

multi_plot(df1,df2, title="Model Scores")  

#https://towardsdatascience.com/how-to-create-an-interactive-dropdown-in-jupyter-322277f58a68

# Validation Curve

In [None]:
# NOTE: this rebinds `clfs`. From here on the values are the loaded search
# objects (the validation-curve cell reads `.best_estimator_` from them), not
# the unfitted estimators the training loop iterates over. Re-running the
# training cell after this cell would therefore pass search objects to
# train_classifier.
clfs = {
    'LogisticRegression': lr_model,
    'SVC': svm_model,
    'KNeighbors': knn_model,
    'RandomForest': RF_model,
    'AdaBoost': AdaBoost_model,
    
    'GradientBoostingTrees': GBDT_model
}

In [None]:
# Parameter ranges swept by the validation curves (one subplot per entry,
# 12 entries in total). NOTE: this rebinds `param_grid`, replacing the grids
# used for tuning.
#
# Logarithmic ranges are written as 1e-3, 1e-2, 1e-1, 1.0. The previous spelling
# (10e-3, 10e-2, 10e-1) evaluates to 0.01, 0.1, 1.0, which put 1.0 into each
# range twice and plotted the same point twice.
param_grid = {

    'LogisticRegression': {
        'max_iter': [100, 1000, 2000, 3000],
        'l1_ratio': [1, 0.8, 0.6, 0.5, 0.4, 0.2, 0],
        'C': [1e-3, 1e-2, 1e-1, 1.0, 10, 20, 30],
    },

    'SVC': {
        'C': [1e-2, 1e-1, 1.0, 10, 20, 30, 40, 70],
    },

    'KNeighbors': {
        'n_neighbors': [3, 5, 9, 11, 13, 15, 19],  # n_neighbors should always be odd numbers
    },

    'RandomForest': {
        'n_estimators': [100, 300, 500],
        'max_depth': [2, 4, 5, 6, 7, 8, 10, 15, 20],
    },

    'AdaBoost': {
        'n_estimators': [50, 70, 90, 100, 110, 140, 170],
        'learning_rate': [1e-3, 1e-2, 1e-1, 1.0],
    },

    'GradientBoostingTrees': {
        'learning_rate': [1e-3, 1e-2, 1e-1, 1.0],
        'n_estimators': [100, 200, 400, 600],
        'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 11, 15, 20],
    },
}

In [None]:
# Validation curves: one subplot per (classifier, hyper-parameter) pair.
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(12, 12))
fig.suptitle('Validation Curves for Different Classifiers:CV=3', fontsize=24, fontweight='bold', y=1.0)

axes = axes.flatten()

curve_specs = [
    (clf_name, clf, param_name, param_value)
    for clf_name, clf in clfs.items()
    for param_name, param_value in param_grid[clf_name].items()
]
# zip() below would silently drop curves that do not fit on the grid.
assert len(curve_specs) <= len(axes), "more parameter sweeps than subplots"

lw = 2
for ax, (clf_name, clf, param_name, param_value) in zip(axes, curve_specs):

    train_scores, valid_scores = validation_curve(clf.best_estimator_,
                                                  X_train, y_train,
                                                  cv=3,
                                                  param_name=param_name,
                                                  param_range=param_value)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    valid_mean = np.mean(valid_scores, axis=1)
    valid_std = np.std(valid_scores, axis=1)

    ax.set_title(f'{clf_name} - {param_name}')
    ax.set_xlabel(param_name)
    ax.set_ylabel('Score')
    ax.set_ylim(0.0, 1.1)
    # A log axis cannot show non-positive values (l1_ratio includes 0), so it
    # is only used when the whole range is strictly positive. Previously the
    # zero point simply vanished from the plot.
    if min(param_value) > 0:
        ax.set_xscale('log')

    ax.plot(param_value, train_mean, label='Training score', color='darkorange', lw=lw)
    ax.fill_between(param_value, train_mean - train_std, train_mean + train_std, alpha=0.2, color='darkorange', lw=lw)

    ax.plot(param_value, valid_mean, label='Cross-validation score', color='navy', lw=lw)
    ax.fill_between(param_value, valid_mean - valid_std, valid_mean + valid_std, alpha=0.2, color='navy', lw=lw)
    ax.legend(loc='best')

plt.tight_layout()

# Save before plt.show(), which may clear the current figure.
os.makedirs('../AutoViz_Plots/ValidationCurve_raw5_trans', exist_ok=True)
plt.savefig('../AutoViz_Plots/ValidationCurve_raw5_trans/ValidationCurve.svg', format='svg', dpi=600)

plt.show()

# Learning Curve

In a classification problem, the learning curve shows the performance of a model on the training set and the validation set over increasing numbers of training samples. It shows how the performance of the model changes as the training set size increases. The performance metric used for classification learning curves is usually accuracy or F1 score.

In a regression problem, the learning curve shows the performance of a model on the training set and the validation set over increasing numbers of training samples. It shows how the performance of the model changes as the training set size increases. The performance metric used for regression learning curves is usually mean squared error (MSE) or R-squared.

In [None]:
############## LEARNING CURVE #####################


#import numpy as np
#import matplotlib.pyplot as plt
#from sklearn.model_selection import learning_curve


############################################

#################################################
# Learning curves: one subplot per tuned model.
cv_folds = 3
train_size_fractions = np.linspace(0.1, 1.0, 3)

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 12))
# The title is derived from the settings actually used (it used to claim
# 4 train sizes while 3 were computed).
fig.suptitle(
    f'Learning Curves: CV = {cv_folds}, '
    f'train_size = np.linspace(0.1, 1.0, {len(train_size_fractions)})',
    fontsize=24, fontweight='bold', y=1.0)

model = {'SVM':svm_model, "AdaBoost":AdaBoost_model, "GBDT":GBDT_model,"KNN":knn_model,"LogReg":lr_model,"RF":RF_model}

for ax, (name, clf) in zip(axes.flatten(), model.items()):
    print("\nFor ",name)

    # Use the tuned estimator itself, as the validation-curve, ROC and
    # confusion-matrix cells do. Passing the search object would re-run the
    # whole grid search for every fold and training size.
    train_sizes, train_scores, test_scores = learning_curve(
        clf.best_estimator_, X_train, y_train,
        cv=cv_folds,
        verbose=0,
        random_state=100,  # only takes effect when shuffle=True (scikit-learn docs)
        train_sizes=train_size_fractions)

    # Mean and spread across the CV folds for each training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    ax.set_title(f'Learning Curve: {name}')
    ax.set_xlabel('Training examples')
    ax.set_ylabel('Score')
    ax.grid()
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='r')
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color='g')
    ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')

    ax.legend(loc='best')

plt.tight_layout()

os.makedirs('../AutoViz_Plots/LearningCurve_raw5_trans', exist_ok=True)
plt.savefig('../AutoViz_Plots/LearningCurve_raw5_trans/LearningCurve.svg', format='svg', dpi=500)

plt.show()

    
    



# Complexity Analysis

In [None]:
#import numpy as np
#import matplotlib.pyplot as plt
#from sklearn.model_selection import learning_curve


############################################

#################################################
# Scalability and performance: one row of three subplots per tuned model
# (fit time vs. samples, score time vs. samples, test score vs. fit time).
fig, axes = plt.subplots(nrows=6, ncols=3, figsize=(10, 15))
fig.suptitle(' Scalability & Performance: cv=3 for 3 equal parts of dataset', fontsize=24, fontweight='bold', y=1.0)

model = {'SVM':svm_model, "AdaBoost":AdaBoost_model, "GBDT":GBDT_model,"KNN":knn_model,"LogReg":lr_model,"RF":RF_model}

for (ax, ax2, ax3), (name, clf) in zip(axes, model.items()):
    print("\nFor ",name)

    # Time the tuned estimator itself. With the search object, fit_time would
    # measure an entire nested grid search rather than one model fit.
    train_sizes, train_scores, test_scores, fit_time, score_time = learning_curve(
        clf.best_estimator_, X_train, y_train,
        cv=3,
        verbose=0,
        random_state=100,  # only takes effect when shuffle=True (scikit-learn docs)
        train_sizes=np.linspace(0.1, 1.0, 3),
        return_times=True)

    # Mean and spread across the CV folds for each training size.
    fit_time_mean = np.mean(fit_time, axis=1)
    fit_time_std = np.std(fit_time, axis=1)

    score_time_mean = np.mean(score_time, axis=1)
    score_time_std = np.std(score_time, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    ax.set_title(f'Scalability For: {name}')
    ax2.set_title(f'Scalability For: {name}')
    ax3.set_title(f'Performance of: {name}')

    ax.set_xlabel('Training examples')
    ax2.set_xlabel('Training examples')
    ax3.set_xlabel('Fit Time')

    ax.set_ylabel('Fit Time')
    ax2.set_ylabel('Score Time')
    ax3.set_ylabel('Test Score')

    for panel in (ax, ax2, ax3):
        panel.grid()

    ax.fill_between(train_sizes, fit_time_mean - fit_time_std, fit_time_mean + fit_time_std, alpha=0.1, color='r')
    ax2.fill_between(train_sizes, score_time_mean - score_time_std, score_time_mean + score_time_std, alpha=0.1, color='g')
    ax3.fill_between(fit_time_mean, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color='b')

    ax.plot(train_sizes, fit_time_mean, 'o-', color='r', label='Fit Time')
    ax2.plot(train_sizes, score_time_mean, 'o-', color='g', label='Score Time')
    ax3.plot(fit_time_mean, test_scores_mean, 'o-', color='b', label='Fit Time vs score')

plt.tight_layout()

os.makedirs('../AutoViz_Plots/ScalabilityPerformance_raw5_trans', exist_ok=True)
plt.savefig('../AutoViz_Plots/ScalabilityPerformance_raw5_trans/ScalabilityPerformance.svg', format='svg', dpi=600)

plt.show()

    
    



# ROC AUC Curve

In [None]:
#from sklearn.metrics import RocCurveDisplay

In [None]:
#################################################
# ROC curves on the test split, one subplot per tuned model.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(13, 8),sharex=True, sharey=True, squeeze=True)
# The data files, the output folders and the confusion-matrix figure all label
# this run as transformed ("_trans"), so the title says the same.
fig.suptitle('ROC Curve: With Transformation', fontsize=22, fontweight='bold', y=1)

model = {'SVM':svm_model, "AdaBoost":AdaBoost_model, "GBDT":GBDT_model,"KNN":knn_model,"LogReg":lr_model,"RF":RF_model}

for ax, (name, clf) in zip(axes.flatten(), model.items()):
    RocCurveDisplay.from_estimator(clf.best_estimator_, X=X_test, y=y_test, ax=ax, alpha=1, color='b')
    ax.set_title(name)
    ax.legend(loc='best')
    ax.grid(color='k',alpha=0.1)

plt.tight_layout()

# Written next to the other raw5 outputs of this notebook. The previous raw4
# folder name would overwrite the plots of a different experiment.
os.makedirs('../AutoViz_Plots/ROC_Curve_raw5_trans', exist_ok=True)
plt.savefig('../AutoViz_Plots/ROC_Curve_raw5_trans/ROC_Curve.svg', format='svg', dpi=600)

plt.show()


# Confusion Matrix

In [None]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
#################################################
# Confusion matrices on the test split, one subplot per tuned model.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 8), squeeze=True)
fig.suptitle('Confusion Matrix: With Transformation', fontsize=22, fontweight='bold', y=1)

model = {'SVM':svm_model, "AdaBoost":AdaBoost_model, "GBDT":GBDT_model,"KNN":knn_model,"LogReg":lr_model,"RF":RF_model}

for ax, (name, clf) in zip(axes.flatten(), model.items()):
    ConfusionMatrixDisplay.from_estimator(clf.best_estimator_, X=X_test, y=y_test, ax=ax, colorbar=False)
    ax.set_title(name)

plt.tight_layout()

# Written next to the other raw5 outputs of this notebook. The previous raw4
# folder name would overwrite the plots of a different experiment. The
# "ConfusionMetrix" spelling is kept in the path so existing links keep working.
os.makedirs('../AutoViz_Plots/ConfusionMetrix_raw5_trans', exist_ok=True)
plt.savefig('../AutoViz_Plots/ConfusionMetrix_raw5_trans/ConfusionMetrix_raw5_trans.svg', format='svg', dpi=600)

plt.show()

# <center>Conclusions: Classification Problem</center>

- FWI is calculated from FFMC, DMC, ISI and BUI, and these features have a very strong correlation (reaching 0.9 or more), so I decided to go with the following strategy:

    - Selected Temperature, Rain, RH, DC, FWI and month.
    - I understand that standardization and some transformation, such as a log(x+1) transformation, is needed for FWI and DC.
    - I trained the models with standardization & transformation, with standardization only, and without standardization or transformation.
    - The original features can't be accommodated because of high correlation. Since FWI is calculated from the other features, including those features and excluding FWI would only complicate the model without improving performance. I also performed feature engineering so that I could include the other features.
    - The generated features include: FWI/FFMC, (DMC/FWI)/ISI, FWI/BUI
    - I trained the models with these features following the same strategy, i.e. with and without transformation.
    - The best performing models in terms of F1, roc_auc, precision, balanced accuracy etc. are only the models without any transformation or standardization, using the features DC and FWI. However, the second best model was the one with the new features, with standardization and transformation.