#### Group Information

Group No: Climate_3

- Member 1: GWEE PER MING
- Member 2: LIM CHIN FENG
- Member 3: OOI YUE SHENG
- Member 4: OOI YONG QIN


#### Import libraries

In [4]:
# %pip install mlxtend --upgrade --user

In [5]:
%config Completer.use_jedi=False

# Import the libraries and modules that will be used in this project
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from matplotlib import pyplot as plt
from sklearn.tree import plot_tree,export_text

#### Load the dataset

In [6]:
#read the climate dataset csv file 
climate = pd.read_csv('climate.csv')
climate.sort_values('outcome', inplace=True)
print(climate)

     Study  Run  vconst_corr  vconst_2  vconst_3  vconst_4  vconst_5  \
0        1    1     0.859036  0.927825  0.252866  0.298838  0.170521   
48       1   49     0.728289  0.783255  0.959955  0.054978  0.422302   
57       1   58     0.964653  0.867315  0.436953  0.070029  0.144875   
62       1   63     0.616469  0.741596  0.781656  0.033420  0.239747   
479      3  120     0.740677  0.621174  0.961412  0.049750  0.947234   
..     ...  ...          ...       ...       ...       ...       ...   
184      2    5     0.102744  0.752122  0.669128  0.703405  0.248705   
182      2    3     0.667951  0.960458  0.375774  0.887335  0.405682   
181      2    2     0.161031  0.314261  0.027817  0.273993  0.346503   
142      1  143     0.751171  0.055183  0.876063  0.031708  0.953696   
539      3  180     0.608075  0.031556  0.598264  0.794771  0.145680   

     vconst_7   ah_corr  ah_bolus  ...  efficiency_factor  tidal_mix_max  \
0    0.735936  0.428325  0.567947  ...           0.245675  

In [7]:
climate.info()
climate.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 540 entries, 0 to 539
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Study                 540 non-null    int64  
 1   Run                   540 non-null    int64  
 2   vconst_corr           540 non-null    float64
 3   vconst_2              540 non-null    float64
 4   vconst_3              540 non-null    float64
 5   vconst_4              540 non-null    float64
 6   vconst_5              540 non-null    float64
 7   vconst_7              540 non-null    float64
 8   ah_corr               540 non-null    float64
 9   ah_bolus              540 non-null    float64
 10  slm_corr              540 non-null    float64
 11  efficiency_factor     540 non-null    float64
 12  tidal_mix_max         540 non-null    float64
 13  vertical_decay_scale  540 non-null    float64
 14  convect_corr          540 non-null    float64
 15  bckgrnd_vdc1          5

(540, 21)

In [8]:
columnType = ['int64','float64','object']
numColumns=list(climate.select_dtypes(include=columnType).columns)
data=climate[numColumns]
data.shape

(540, 21)

#### Split the dataset
Split the dataset into training, validation and test sets.

In [9]:
col = climate.drop(columns=['outcome','Study','Run']).columns
# col = climate.drop(columns='outcome').columns
y = climate['outcome']
X = climate[col]

In [10]:
seed_num = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=seed_num)
X_train,X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=seed_num)
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(345, 18)
(87, 18)
(108, 18)


#### Data preprocessing
Perform data preprocessing such as normalization, standardization, label encoding etc.
______________________________________________________________________________________
Description:

In [11]:
# from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = MinMaxScaler() # normalize data 0-1
X_train_norm = scaler.fit_transform(X_train)
X_train_norm = pd.DataFrame(X_train_norm)
# X_train_norm.rename(col)

# create a dictionary of the original and new column names
col_dict = {old_name:new_name for old_name, new_name in zip(X_train_norm.columns, col)}

# rename the columns of the DataFrame using the dictionary
X_train_norm = X_train_norm.rename(columns=col_dict)


print(X_train_norm)

     vconst_corr  vconst_2  vconst_3  vconst_4  vconst_5  vconst_7   ah_corr  \
0       0.086613  0.569960  0.814199  0.112507  0.571352  0.723524  0.854969   
1       0.083660  0.701891  0.412384  0.342099  0.319628  0.732747  0.192892   
2       0.488921  0.927233  0.755736  0.872109  0.916482  0.475445  0.384935   
3       0.206774  0.855238  0.273855  0.852532  0.717980  0.084972  0.975288   
4       0.917343  0.383566  0.582905  0.658446  0.870104  0.660281  0.986264   
..           ...       ...       ...       ...       ...       ...       ...   
340     0.160832  0.548621  0.151190  0.655261  0.139755  0.798871  0.403534   
341     0.076153  0.711244  0.884965  0.121547  0.906154  0.233794  0.364506   
342     0.266311  0.081081  0.483240  0.230765  0.825838  0.526178  0.948664   
343     0.205391  0.321230  0.615433  0.006624  0.470800  0.383245  0.883003   
344     0.864225  0.014840  0.550496  0.131010  0.025591  0.795462  0.758247   

     ah_bolus  slm_corr  efficiency_fac

#### Feature Selection
Perform feature selection to select the relevant features.
______________________________________________________________________________________
Description:

In [12]:
def correlation(dataset,threshold):
    col_corr=set()
    corr_matrix=pd.DataFrame(dataset).corr() 
    for i in range(len(corr_matrix.columns)): 
        for j in range(i):
            if abs(corr_matrix.iloc[i,j])>threshold: 
                colName=corr_matrix.columns[i] 
                col_corr.add(colName) 
    return col_corr 

col=correlation(X_train,0.8)
if (len(col) == 0):
    print('No columns that are hightly collerated with each others')
else:
    print('Correlated columns:',col)  
    
#after dopped correlated column
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

No columns that are hightly collerated with each others
(345, 18)
(87, 18)
(108, 18)


#### Feature Selection for Decision Tree

In [13]:
# from mlxtend.feature_selection import SequentialFeatureSelector as sfs
# from sklearn.ensemble import RandomForestClassifier

# Convert X_train_norm to a Pandas DataFrame
# X_train_norm_df = pd.DataFrame(X_train_norm, columns=X_train.columns)

feature_selector = sfs(RandomForestClassifier(n_jobs=-1, random_state=seed_num), k_features=(5,18), forward=True, floating=False, verbose=2, scoring='accuracy', cv=2)
sfs = feature_selector.fit(X_train_norm, y_train)
selected_features_decisiontree = X_train.columns[list(sfs.k_feature_idx_)]
selected_features_decisiontree

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    8.2s finished

[2023-05-04 18:20:28] Features: 1/18 -- score: 0.8725131066003495[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:    4.2s finished

[2023-05-04 18:20:32] Features: 2/18 -- score: 0.9130427476811399[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    3.9s finished

[2023-05-04 18:20:36] Features: 3/18 -- score: 0.918856701169512[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   

Index(['vconst_corr', 'vconst_2', 'slm_corr', 'vertical_decay_scale',
       'convect_corr', 'bckgrnd_vdc1', 'bckgrnd_vdc_eq', 'bckgrnd_vdc_psim',
       'Prandtl'],
      dtype='object')

In [14]:
selected_features_decisiontree =  ['vconst_corr', 'vconst_2', 'slm_corr', 'vertical_decay_scale',
       'convect_corr', 'bckgrnd_vdc1', 'bckgrnd_vdc_eq', 'bckgrnd_vdc_psim',
       'Prandtl']

#### Feature Selection for SVM

In [16]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
# from sklearn.svm import SVC
feature_selector_svm = sfs(SVC(kernel='poly', degree=2, gamma='scale', coef0=1, C=5, random_state=seed_num), k_features=(5,len(X_train.columns)), forward=True, floating=False, verbose=2, scoring='accuracy', cv=2)
sfs = feature_selector_svm.fit(X_train_norm, y_train)
selected_features_svm =X_train.columns[list(sfs.k_feature_idx_)]
selected_features_svm

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    0.0s finished

[2023-05-04 18:21:06] Features: 1/18 -- score: 0.9130427476811399[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:    0.0s finished

[2023-05-04 18:21:06] Features: 2/18 -- score: 0.915949724425326[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.0s finished

[2023-05-04 18:21:06] Features: 3/18 -- score: 0.9391215217099073[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   

Index(['vconst_corr', 'vconst_2', 'slm_corr', 'efficiency_factor',
       'vertical_decay_scale', 'convect_corr', 'bckgrnd_vdc1',
       'bckgrnd_vdc_eq'],
      dtype='object')

In [17]:
selected_features_svm =  ['vconst_corr', 'vconst_2', 'slm_corr', 'efficiency_factor',
       'vertical_decay_scale', 'convect_corr', 'bckgrnd_vdc1',
       'bckgrnd_vdc_eq']

#### Data modeling
Build the machine learning models. You must build atleast two (2) predictive models. One of the predictive models must be either Decision Tree or Support Vector Machine.
______________________________________________________________________________________
Description:

#### Data modeling for Decision Tree

##### Data modeling for Decision Tree (gini)

In [19]:
decisiontree_selected_features = selected_features_decisiontree
selected_features_decisiontree = list(selected_features_decisiontree)
print(selected_features_decisiontree)
X_train_norm_decisiontree = X_train_norm[selected_features_decisiontree]
X_val_decisiontree = X_val[selected_features_decisiontree]
X_test_decisiontree = X_test[selected_features_decisiontree]

['vconst_corr', 'vconst_2', 'slm_corr', 'vertical_decay_scale', 'convect_corr', 'bckgrnd_vdc1', 'bckgrnd_vdc_eq', 'bckgrnd_vdc_psim', 'Prandtl']


In [20]:
#using gini for information gain 
from sklearn.tree import DecisionTreeClassifier
model_dt_gini= DecisionTreeClassifier(criterion='gini',random_state=seed_num)
model_dt_gini.fit(X_train_norm_decisiontree, y_train)

In [21]:
#using gini to find the optimum depth of the decision tree which has the highest accuracy
dt_models = []
depths = np.arange(3,10,1)
for d in depths:
    model = DecisionTreeClassifier(criterion='gini', min_samples_split=15 ,max_depth=d, random_state=seed_num)
    model.fit(X_train_norm_decisiontree, y_train)
    dt_models.append(model)
best_acc = 0
model_best_gini = None
for m,d in zip(dt_models, depths):
    acc = m.score(X_val_decisiontree, y_val)
    print(f'Decision Tree classifier with max_depth={d} achieves a mean accuracy of {acc}')
    if acc > best_acc:
        best_acc = acc
        model_best_gini = m
        
model_best_gini

Decision Tree classifier with max_depth=3 achieves a mean accuracy of 0.896551724137931
Decision Tree classifier with max_depth=4 achieves a mean accuracy of 0.9080459770114943
Decision Tree classifier with max_depth=5 achieves a mean accuracy of 0.9195402298850575
Decision Tree classifier with max_depth=6 achieves a mean accuracy of 0.8850574712643678
Decision Tree classifier with max_depth=7 achieves a mean accuracy of 0.8850574712643678
Decision Tree classifier with max_depth=8 achieves a mean accuracy of 0.8850574712643678
Decision Tree classifier with max_depth=9 achieves a mean accuracy of 0.8850574712643678


##### Data modeling for Decision Tree (gini)

In [22]:
#using entropy 
# from sklearn.tree import DecisionTreeClassifier
model_dt_entropy= DecisionTreeClassifier(criterion='entropy',random_state=seed_num)
model_dt_entropy.fit(X_train_norm_decisiontree, y_train)

In [23]:
#using entropy to find the optimum depth of the decision tree which has the highest accuracy
dt_models = []
depths = np.arange(3,10,1)
for d in depths:
    model = DecisionTreeClassifier(criterion='entropy', min_samples_split=15 ,max_depth=d, random_state=seed_num)
    model.fit(X_train_norm_decisiontree, y_train)
    dt_models.append(model)
best_acc = 0
model_best_entropy = None
for m,d in zip(dt_models, depths):
    acc = m.score(X_val_decisiontree, y_val)
    print(f'Decision Tree classifier with max_depth={d} achieves a mean accuracy of {acc}')
    if acc > best_acc:
        best_acc = acc
        model_best_entropy = m
model_best_entropy

Decision Tree classifier with max_depth=3 achieves a mean accuracy of 0.9195402298850575
Decision Tree classifier with max_depth=4 achieves a mean accuracy of 0.9195402298850575
Decision Tree classifier with max_depth=5 achieves a mean accuracy of 0.9195402298850575
Decision Tree classifier with max_depth=6 achieves a mean accuracy of 0.896551724137931
Decision Tree classifier with max_depth=7 achieves a mean accuracy of 0.896551724137931
Decision Tree classifier with max_depth=8 achieves a mean accuracy of 0.896551724137931
Decision Tree classifier with max_depth=9 achieves a mean accuracy of 0.896551724137931


#### Data modeling for SVM

In [24]:
selected_features_svm = list(selected_features_svm)
print(selected_features_svm)
X_train_norm_svm = X_train_norm[selected_features_svm]
X_val_svm = X_val[selected_features_svm]
X_test_svm = X_test[selected_features_svm]

['vconst_corr', 'vconst_2', 'slm_corr', 'efficiency_factor', 'vertical_decay_scale', 'convect_corr', 'bckgrnd_vdc1', 'bckgrnd_vdc_eq']


In [25]:
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score,  confusion_matrix, classification_report, f1_score

kernel = ['linear', 'poly', 'rbf', 'sigmoid']
best_option_svm = {'kernel': None ,
                    'C': 0,
                    'weighted avg f1-score': 0,
                    'degree': 0
                    }

# repeatly test different kernel with different C value ranging from 1 to 10
for kernel in kernel:
    # len(X_train.columns)+2 is used as max so that the degree can reach until (no. of features + 1) dimensions
    max_degree = len(X_train_norm_svm.columns)+2 if kernel == 'poly' else 1

    # print statement below is used for debugging
    # print(kernel)
    
    for degree in range(1,max_degree):
        for c in range(1,11):
            # print statement below is used for debugging
            # print('Degree: {} \t C: {}\n'.format(degree,c))

            svm = SVC(kernel=kernel, C=c,degree=degree)
            svm.fit(X_train_norm_svm, y_train)
            yhat = svm.predict(X_val_svm)
            f1 = f1_score(y_val, yhat, average='weighted')
            # using weighted average f1-score to find the best option instead of accuracy due to imbalance of classes
            if f1 > best_option_svm['weighted avg f1-score']:
                best_option_svm['kernel'] = kernel
                best_option_svm['C'] = c
                best_option_svm['weighted avg f1-score'] = f1
                best_option_svm['degree'] = (degree if kernel == 'poly' else None)

#Display the best kernel with the best C-value
best_option_svm

NameError: name 'f1_score' is not defined

#### Evaluate the models
Perform a comparison between the predictive models. <br>
Report the accuracy, recall, precision and F1-score measures as well as the confusion matrix if it is a classification problem. <br>
Report the R2 score, mean squared error and mean absolute error if it is a regression problem.
______________________________________________________________________________________
Description:

#### Evaluate the Performance of Decision Tree (gini and entropy)

##### gini

In [None]:
#Performance after apply the optimum depth of the tree which using gini
# from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# from matplotlib import pyplot as plt

y_pred_gini = model_best_gini.predict(X_test_decisiontree)
print(model_best_gini)
print('\nAccuracy : {}'.format(accuracy_score(y_test,y_pred_gini)))
print('Confusion Matrix : \n{}\n'.format(confusion_matrix(y_test,y_pred_gini)))
print('Classification Report : ')
print(classification_report(y_test,y_pred_gini))

cm = confusion_matrix(y_test, y_pred_gini, labels=model_best_gini.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm)
disp.plot()
plt.show()

##### entropy

In [None]:
#Performance after apply the optimum depth of the tree which using entropy
y_pred_entropy= model_best_entropy.predict(X_test_decisiontree)
print(model_best_entropy)
print('\nAccuracy : {}'.format(accuracy_score(y_test,y_pred_entropy)))
print('Confusion Matrix : \n{}\n'.format(confusion_matrix(y_test,y_pred_entropy)))
print('Classification Report : ')
print(classification_report(y_test,y_pred_entropy))

cm = confusion_matrix(y_test, y_pred_entropy, labels=model_best_entropy.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm)
disp.plot()
plt.show()

In [None]:
# from sklearn.tree import plot_tree,export_text

print(export_text(model_best_gini, feature_names=list(X[decisiontree_selected_features].columns)))

In [None]:
plt.figure(figsize=(50,50),dpi=400)
plot_tree(model_best_gini, feature_names=list(X[decisiontree_selected_features].columns),fontsize=10)
plt.show()

##### SVM

In [None]:
model = SVC(kernel=best_option_svm['kernel'],C=best_option_svm['C'],degree = best_option_svm['degree'])
model.fit(X_train_norm_svm,y_train)
yhat = model.predict(X_test_svm)

print(best_option_svm)
print('\nAccuracy : {}'.format(accuracy_score(y_test, yhat)))
print('Confusion Matrix : \n{}\n'.format(confusion_matrix(y_test, yhat)))
print('Classification Report : ')
print(classification_report(y_test, yhat))

In [None]:
cm = confusion_matrix(y_test, yhat)
display = ConfusionMatrixDisplay(confusion_matrix=cm)
display.plot(include_values=True, cmap='viridis')