In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

#settings
pd.options.display.max_columns = 250
pd.options.display.max_rows = 250
sns.set(style="whitegrid")

import warnings
warnings.simplefilter('ignore')

## Save Model

In [2]:
import pickle
from sklearn import tree
# save the classifier
def save_model(model, name):
    path = r'{0}.pkl'.format(name)
    with open(path, 'wb') as fid:
        pickle.dump(model, fid) 

## Calculate accuracy and confusion matrix

In [3]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

def calculate_accuracy_confusion_matrix(model, test_features, test_labels):
    # predicting for test values
    DT_test_pred = model.predict(test_features)
    clf_DT_accuracy = accuracy_score(test_labels, DT_test_pred, normalize = True)
    print(clf_DT_accuracy)

    # Confusion Metrices for Decision Tree on train data.
    from sklearn import metrics
    from sklearn.metrics import confusion_matrix
    print("Confusion Metrices for Decision Tree")
    print("{0}".format(metrics.confusion_matrix(test_labels, DT_test_pred, labels=[0, 1])))

    print("Classification Report")
    print("{0}".format(metrics.classification_report(test_labels, DT_test_pred, labels=[0, 1])))



## Preparing data 

In [4]:
# Load Data
df = pd.read_pickle("Brain_SelectedData.pkl")
df.head()

Unnamed: 0,26,20,18,14,12,4263,4199,4192,4120,4052,3843,3842,3835,3489,3344,3145,3128,2986,2843,2790,2719,2717,2510,2497,1811,1743,1672,1663,1504,860,712,442,371,292,271,237,236,224,222,175,163,162,160,61,2,4190,3977,3915,3909,3772,3769,3767,3697,3565,3416,3348,3276,3207,3137,2999,2997,2939,2723,2722,2720,2714,2701,2519,2512,2506,2504,2499,2307,2211,1952,1948,1928,1859,1855,1785,1781,1776,1706,1671,1657,1638,1516,1428,1427,1285,1253,1228,1227,1168,1048,1019,879,866,854,791,725,713,640,553,538,523,506,438,363,361,360,340,300,295,272,270,268,267,248,245,239,238,235,234,228,214,198,197,167,166,157,154,152,144,143,91,37,34,30,28,17,15,13,3,1,Sex
0,4380.0,3720.0,3674.0,1712.0,5889.0,785.0,433.0,105.0,5054.0,0.0,0.0,1025.0,991.0,0.0,1937.0,7545.0,256.0,6005.0,123.0,4013.0,11517.0,356.0,13373.0,6326.0,19.0,8048.0,7.0,1056.0,0.0,3033.0,56.0,0.0,1846.0,1212.0,1336.0,7388.0,1206.0,0.0,706.0,5990.0,1297.0,0.0,2760.0,2178.0,1778.0,7236.0,19.0,0.0,27.0,8518.0,3582.0,44.0,5428.0,5409.0,2210.0,0.0,5752.0,518.0,53.0,173.0,62.0,0.0,21442.0,6386.0,150.0,0.0,112.0,3618.0,5115.0,77.0,244.0,3240.0,515.0,0.0,1917.0,15.0,4915.0,3033.0,4285.0,1637.0,5957.0,42.0,0.0,0.0,0.0,12578.0,4.0,3961.0,9367.0,0.0,1948.0,4.0,4.0,0.0,0.0,311.0,1295.0,4123.0,5223.0,4580.0,9863.0,402.0,580.0,37.0,0.0,45.0,2930.0,0.0,66.0,1036.0,80.0,1722.0,3259.0,43.0,1320.0,218.0,10636.0,154.0,10152.0,11962.0,1344.0,22052.0,1412.0,16102.0,7769.0,19141.0,154.0,0.0,6739.0,1484.0,0.0,0.0,211.0,10918.0,6851.0,12.0,1973.0,5906.0,9631.0,7089.0,3449.0,1937.0,4.0,18057.0,5619.0,0
1,3080.0,2153.0,3079.0,358.0,2616.0,2127.0,4.0,243.0,5171.0,0.0,245.0,1984.0,527.0,0.0,1646.0,5456.0,657.0,7469.0,479.0,2984.0,12044.0,1777.0,15696.0,2690.0,30.0,3704.0,31.0,35.0,0.0,2080.0,3.0,83.0,1136.0,117.0,25.0,6579.0,108.0,0.0,0.0,2893.0,1048.0,0.0,1662.0,2229.0,1705.0,9966.0,8.0,1.0,181.0,3089.0,4459.0,839.0,6953.0,3802.0,2946.0,28.0,6569.0,1873.0,81.0,0.0,68.0,0.0,16045.0,6776.0,1274.0,0.0,323.0,1004.0,2841.0,39.0,1.0,975.0,133.0,0.0,862.0,0.0,831.0,977.0,2124.0,805.0,3542.0,37.0,0.0,14.0,6.0,9829.0,0.0,3363.0,6350.0,242.0,2719.0,0.0,0.0,131.0,0.0,43.0,139.0,2804.0,2141.0,3106.0,4419.0,450.0,267.0,0.0,0.0,2.0,4226.0,251.0,145.0,1332.0,0.0,498.0,1461.0,12.0,1407.0,0.0,4032.0,0.0,3146.0,5047.0,1686.0,20407.0,1947.0,9516.0,7778.0,8406.0,1305.0,0.0,2866.0,390.0,1.0,0.0,17.0,7668.0,5688.0,535.0,1781.0,2705.0,6107.0,7427.0,7689.0,1881.0,41.0,9629.0,3804.0,0
2,5546.0,1797.0,6354.0,987.0,6232.0,1531.0,76.0,2658.0,1229.0,183.0,144.0,1268.0,716.0,310.0,3143.0,6608.0,358.0,8672.0,0.0,5388.0,16113.0,1556.0,17360.0,8406.0,45.0,3579.0,0.0,0.0,0.0,2988.0,112.0,0.0,2647.0,1189.0,1.0,11733.0,1937.0,0.0,1939.0,7999.0,1911.0,0.0,1644.0,2629.0,2732.0,12414.0,0.0,0.0,811.0,7253.0,3323.0,843.0,2887.0,5145.0,3379.0,41.0,8359.0,6.0,0.0,0.0,0.0,0.0,16959.0,12752.0,5977.0,185.0,2004.0,2053.0,9796.0,49.0,681.0,3160.0,18.0,2.0,2558.0,17.0,3131.0,1625.0,3638.0,886.0,6067.0,45.0,0.0,0.0,0.0,15510.0,4.0,3311.0,5433.0,0.0,3188.0,0.0,0.0,3.0,0.0,251.0,1181.0,7939.0,4560.0,6657.0,11797.0,451.0,724.0,0.0,0.0,135.0,3087.0,0.0,11.0,2441.0,19.0,1617.0,1567.0,175.0,2986.0,0.0,4508.0,0.0,3606.0,8672.0,6747.0,35072.0,7208.0,21943.0,11941.0,10099.0,1789.0,0.0,6180.0,937.0,27.0,0.0,618.0,12061.0,6157.0,0.0,2142.0,4294.0,11535.0,23912.0,10656.0,2385.0,845.0,29416.0,6218.0,1
3,2006.0,752.0,4465.0,811.0,3725.0,1308.0,5.0,0.0,2542.0,0.0,13.0,1853.0,568.0,0.0,1670.0,5533.0,857.0,4789.0,0.0,2404.0,12614.0,641.0,7575.0,2883.0,0.0,259.0,0.0,0.0,0.0,3022.0,50.0,0.0,542.0,516.0,1.0,3684.0,499.0,420.0,1353.0,4442.0,2020.0,0.0,680.0,519.0,1655.0,6370.0,0.0,0.0,18.0,1434.0,1515.0,537.0,2106.0,4752.0,3726.0,0.0,6002.0,4.0,0.0,1.0,0.0,0.0,14276.0,4852.0,3059.0,0.0,1573.0,2059.0,1816.0,86.0,0.0,790.0,30.0,2.0,956.0,0.0,1000.0,522.0,1022.0,849.0,3659.0,0.0,0.0,0.0,0.0,8200.0,1.0,2368.0,6364.0,0.0,44.0,6.0,0.0,46.0,10.0,7.0,358.0,2558.0,3916.0,3239.0,9135.0,406.0,889.0,0.0,31.0,26.0,2816.0,385.0,42.0,254.0,0.0,1550.0,401.0,481.0,846.0,1025.0,5966.0,0.0,5824.0,7995.0,1748.0,16124.0,4747.0,11414.0,5030.0,11686.0,1763.0,0.0,2181.0,709.0,0.0,69.0,580.0,8753.0,3144.0,0.0,1286.0,3461.0,6608.0,5638.0,505.0,2239.0,243.0,22824.0,3427.0,0
4,3686.0,2004.0,3557.0,1109.0,3994.0,2056.0,145.0,1045.0,2293.0,0.0,34.0,1406.0,640.0,0.0,2147.0,4889.0,345.0,5322.0,0.0,2490.0,10799.0,269.0,13924.0,4687.0,0.0,1178.0,19.0,112.0,0.0,2970.0,30.0,0.0,835.0,833.0,0.0,3418.0,882.0,195.0,929.0,6486.0,2230.0,0.0,2610.0,2325.0,2334.0,10013.0,0.0,0.0,1511.0,4493.0,2289.0,282.0,1797.0,3840.0,2053.0,4.0,3638.0,0.0,0.0,1.0,0.0,0.0,15418.0,6914.0,1699.0,25.0,397.0,2336.0,4546.0,47.0,216.0,1051.0,321.0,0.0,1397.0,1.0,2551.0,843.0,2499.0,1016.0,1734.0,0.0,0.0,0.0,0.0,10350.0,2.0,2800.0,6000.0,0.0,1181.0,502.0,0.0,84.0,3.0,66.0,575.0,4174.0,4912.0,2806.0,6836.0,319.0,66.0,0.0,0.0,80.0,2046.0,0.0,179.0,1150.0,132.0,748.0,3078.0,230.0,309.0,199.0,4429.0,0.0,1937.0,10870.0,3240.0,15743.0,3980.0,17134.0,6159.0,14095.0,793.0,0.0,4208.0,319.0,0.0,8.0,138.0,10705.0,3236.0,0.0,1765.0,3578.0,8464.0,7307.0,3870.0,4121.0,26.0,19707.0,5201.0,0


In [5]:
df2 = df.copy()

# target value
Y = df2.Sex.values
# features
X = df2.drop('Sex', axis = 1).values

In [6]:
from sklearn.model_selection import train_test_split
#Splitting the data.
split_test_size = 0.30

# Split the data into a training set and a test set
train_features, test_features, train_labels, test_labels = train_test_split(X, Y, test_size =  split_test_size, random_state = 42)

# Check whether the correct % data split (70/30 %)Train vs Test data.
print("{0:0.2f}% in training".format(  len(train_features)/len(df2.index) *100))
print("{0:0.2f}% in testing".format(  len(test_features)/len(df2.index) *100))


69.30% in training
30.70% in testing


## Training Model

### Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score

#train the model.
model = LogisticRegression(C=0.01, solver='liblinear')  #.fit(X_train,y_train)
model.fit(train_features, train_labels.ravel())

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# performance on test set
calculate_accuracy_confusion_matrix(model, test_features, test_labels)

0.7428571428571429
Confusion Metrices for Decision Tree
[[11  3]
 [ 6 15]]
Classification Report
              precision    recall  f1-score   support

           0       0.65      0.79      0.71        14
           1       0.83      0.71      0.77        21

    accuracy                           0.74        35
   macro avg       0.74      0.75      0.74        35
weighted avg       0.76      0.74      0.75        35



In [9]:
# save the model
save_model(model, 'LR')

### Decision Boundary

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score

#train the model.
model = DecisionTreeClassifier(max_depth=9, random_state = 42, criterion='gini') # entropy
model.fit(train_features, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [11]:
# performace on test set
calculate_accuracy_confusion_matrix(model, test_features, test_labels)

0.5428571428571428
Confusion Metrices for Decision Tree
[[ 6  8]
 [ 8 13]]
Classification Report
              precision    recall  f1-score   support

           0       0.43      0.43      0.43        14
           1       0.62      0.62      0.62        21

    accuracy                           0.54        35
   macro avg       0.52      0.52      0.52        35
weighted avg       0.54      0.54      0.54        35



In [12]:
# save the model
save_model(model, 'DT')

### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(max_depth=8, n_estimators=25,min_samples_split=10, random_state=0)

model.fit(train_features, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=25,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [14]:
# performance on test set
calculate_accuracy_confusion_matrix(model, test_features, test_labels)

0.7714285714285715
Confusion Metrices for Decision Tree
[[ 8  6]
 [ 2 19]]
Classification Report
              precision    recall  f1-score   support

           0       0.80      0.57      0.67        14
           1       0.76      0.90      0.83        21

    accuracy                           0.77        35
   macro avg       0.78      0.74      0.75        35
weighted avg       0.78      0.77      0.76        35



In [15]:
# save the model
save_model(model, 'RF')

### LightGBM

In [16]:
from lightgbm import LGBMClassifier

model=LGBMClassifier(n_estimators=300, learning_rate=0.01, num_leaves=80, max_depth = 9, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

model.fit(train_features, train_labels.ravel())

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.2,
               importance_type='split', learning_rate=0.01, max_depth=9,
               min_child_samples=20, min_child_weight=40, min_split_gain=0.01,
               n_estimators=300, n_jobs=-1, num_leaves=80, objective=None,
               random_state=None, reg_alpha=3, reg_lambda=1, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [17]:
# performance of the model on test data
calculate_accuracy_confusion_matrix(model, test_features, test_labels)

0.6
Confusion Metrices for Decision Tree
[[ 0 14]
 [ 0 21]]
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        14
           1       0.60      1.00      0.75        21

    accuracy                           0.60        35
   macro avg       0.30      0.50      0.37        35
weighted avg       0.36      0.60      0.45        35



In [18]:
# save the model
save_model(model, 'LightGBM')

### XGBoost

In [19]:
import xgboost as xgb

model = xgb.XGBClassifier(booster="gbtree", max_depth=7, objective="binary:logistic", random_state=42, nthread=4, n_estimators = 20, eta = 0.05)

model.fit(train_features, train_labels.ravel())

ModuleNotFoundError: No module named 'xgboost'