In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
feat = pd.read_csv("Features.csv")
data = pd.read_excel('Labels.xlsx')
data.shape, feat.shape

((1533, 16), (1533, 7))

In [4]:
data.columns

Index(['Baseline', 'Letter Size', 'Line Spacing', 'Word Spacing', 'Top Margin',
       'Pen Pressure', 'Slant of Letters', 'Emotional Stability',
       'Mental Energy and Will Power', 'Modesty',
       'Personal Harmony and Flexibility', 'Lack of Discipline',
       'Poor Concentration', 'Non Communicativeness', 'Social Isolation',
       'Image No'],
      dtype='object')

In [5]:
X = feat[['Baseline', 'Letter Size', 'Line Spacing', 'Word Spacing', 'Top Margin', 'Pen Pressure', 'Slant of Letters']]


In [6]:
y_cols = data[['Emotional Stability',
       'Mental Energy and Will Power', 'Modesty',
       'Personal Harmony and Flexibility', 'Lack of Discipline',
       'Poor Concentration', 'Non Communicativeness', 'Social Isolation']]

In [7]:
X

Unnamed: 0,Baseline,Letter Size,Line Spacing,Word Spacing,Top Margin,Pen Pressure,Slant of Letters
0,-0.39,5.70,12.29,4.37,2.46,171.38,-15
1,0.05,3.07,17.29,2.89,1.74,194.71,-15
2,-1.10,2.02,16.36,3.14,1.65,170.29,-15
3,-0.01,1.91,15.73,3.32,1.60,165.56,180
4,0.00,2.10,15.70,3.25,1.69,171.55,-15
...,...,...,...,...,...,...,...
1528,0.21,2.13,12.18,4.81,1.66,151.69,0
1529,0.21,2.47,12.55,4.59,1.45,148.78,180
1530,-0.02,3.95,12.40,4.70,1.59,148.31,180
1531,0.41,4.07,12.30,4.59,1.38,153.32,0


In [8]:
y_dict = {}
for i, k in enumerate(['Emotional Stability',
       'Mental Energy and Will Power', 'Modesty',
       'Personal Harmony and Flexibility', 'Lack of Discipline',
       'Poor Concentration', 'Non Communicativeness', 'Social Isolation']):
    print(k)
    y = y_cols[k]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    rf = RandomForestClassifier()
    rf.fit(X=X_train, y=y_train)
    y_pred = rf.predict(X_test)
    print("Accuracy score for ", end = " ")
    print(accuracy_score(y_true=y_test, y_pred=y_pred))
    print("Feature Importance", end = "")
    print(rf.feature_importances_)
    print("----------------------------------------")

Emotional Stability
Accuracy score for  0.996742671009772
Feature Importance[0.53723937 0.02057098 0.02076818 0.01791399 0.02185592 0.02366551
 0.35798605]
----------------------------------------
Mental Energy and Will Power
Accuracy score for  1.0
Feature Importance[0.02433474 0.04417044 0.24813378 0.17977016 0.04330592 0.44051604
 0.01976891]
----------------------------------------
Modesty
Accuracy score for  0.993485342019544
Feature Importance[0.01056982 0.78830954 0.0838893  0.06876473 0.02353931 0.01474742
 0.01017987]
----------------------------------------
Personal Harmony and Flexibility
Accuracy score for  1.0
Feature Importance[0.01555316 0.03672835 0.17945257 0.24568549 0.49011194 0.0229319
 0.00953658]
----------------------------------------
Lack of Discipline
Accuracy score for  1.0
Feature Importance[0.03918023 0.35461723 0.0490214  0.04253491 0.03650469 0.04762542
 0.43051612]
----------------------------------------
Poor Concentration
Accuracy score for  0.99674267

# Hyper-paramter tuning

In [35]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 100, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 50, num = 5)]
# Minimum number of samples required to split a node
min_samples_split = [2]
# Minimum number of samples required at each leaf node
min_samples_leaf = [x/10 for x in range(2, 10)]
# Method of selecting samples for training each tree
#bootstrap = [True]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'ccp_alpha': min_samples_leaf}
print(random_grid)

{'n_estimators': [50, 62, 75, 87, 100], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50], 'min_samples_split': [2], 'ccp_alpha': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}


In [36]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, cv = 5, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'ccp_alpha': [0.2, 0.3, 0.4, 0.5, 0.6,
                                                      0.7, 0.8, 0.9],
                                        'max_depth': [10, 20, 30, 40, 50],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_split': [2],
                                        'n_estimators': [50, 62, 75, 87, 100]},
                   random_state=42, verbose=2)

In [37]:
y_pred = rf_random.predict(X_test)
rf_random.score(X_test,y_test)

0.9706840390879479

In [None]:
grid_best_params = {}
for i, k in enumerate(['Emotional Stability',
       'Mental Energy and Will Power', 'Modesty',
       'Personal Harmony and Flexibility', 'Lack of Discipline',
       'Poor Concentration', 'Non Communicativeness', 'Social Isolation']):
    print(k)
    y = y_cols[k]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
    rf_random.fit(X_train, y_train)
    
    print(rf_random.best_params_)
    grid_best_params[k] = rf_random.best_params_
    
    print("----------------------------------------")

In [12]:
grid_best_params = {'Emotional Stability': {'n_estimators': 200,
  'min_samples_split': 10,
  'min_samples_leaf': 2,
  'max_features': 'sqrt',
  'max_depth': 50,
  'bootstrap': True},
 'Mental Energy and Will Power': {'n_estimators': 400,
  'min_samples_split': 10,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': 60,
  'bootstrap': False},
 'Modesty': {'n_estimators': 600,
  'min_samples_split': 2,
  'min_samples_leaf': 2,
  'max_features': 'auto',
  'max_depth': 60,
  'bootstrap': False},
 'Personal Harmony and Flexibility': {'n_estimators': 600,
  'min_samples_split': 2,
  'min_samples_leaf': 2,
  'max_features': 'auto',
  'max_depth': 60,
  'bootstrap': False},
 'Lack of Discipline': {'n_estimators': 400,
  'min_samples_split': 10,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': 60,
  'bootstrap': False},
 'Poor Concentration': {'n_estimators': 200,
  'min_samples_split': 10,
  'min_samples_leaf': 2,
  'max_features': 'sqrt',
  'max_depth': 50,
  'bootstrap': True},
 'Non Communicativeness': {'n_estimators': 200,
  'min_samples_split': 10,
  'min_samples_leaf': 2,
  'max_features': 'sqrt',
  'max_depth': 50,
  'bootstrap': True},
 'Social Isolation': {'n_estimators': 600,
  'min_samples_split': 2,
  'min_samples_leaf': 2,
  'max_features': 'auto',
  'max_depth': 60,
  'bootstrap': False}}

In [13]:
for i, k in enumerate(['Emotional Stability',
       'Mental Energy and Will Power', 'Modesty',
       'Personal Harmony and Flexibility', 'Lack of Discipline',
       'Poor Concentration', 'Non Communicativeness', 'Social Isolation']):
    print(k)
    y = y_cols[k]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print(grid_best_params[k])
    rf_random = RandomForestClassifier(n_estimators =grid_best_params[k]['n_estimators'],
                                      min_samples_split  = grid_best_params[k]['min_samples_split'],
                                      min_samples_leaf = grid_best_params[k]['min_samples_leaf'],
                                      max_features = grid_best_params[k]['max_features'],
                                      max_depth = grid_best_params[k]['max_depth'],
                                      bootstrap = grid_best_params[k]['bootstrap'])
#     rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
    rf_random.fit(X_train, y_train)
    y_pred = rf_random.predict(X_test)
    
    print(accuracy_score(y_true=y_test, y_pred=y_pred))
    print("----------------------------------------")

Emotional Stability
{'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
1.0
----------------------------------------
Mental Energy and Will Power
{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}
1.0
----------------------------------------
Modesty
{'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 60, 'bootstrap': False}
0.993485342019544
----------------------------------------
Personal Harmony and Flexibility
{'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 60, 'bootstrap': False}
1.0
----------------------------------------
Lack of Discipline
{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}
1.0
----------------------------------------


In [None]:
grid_best_params

In [42]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score

In [47]:
confusion_matrix(y_true = y_test, y_pred = y_pred)

array([1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,

In [44]:
recall_score(y_true = y_test, y_pred = y_pred)

0.9370629370629371

In [46]:
precision_score(y_true = y_test, y_pred = y_pred)

1.0