In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
feat = pd.read_csv("Features.csv")
data = pd.read_excel('Labels.xlsx')
data.shape, feat.shape

((1533, 16), (1533, 7))

In [3]:
data.columns

Index(['Baseline', 'Letter Size', 'Line Spacing', 'Word Spacing', 'Top Margin',
       'Pen Pressure', 'Slant of Letters', 'Emotional Stability',
       'Mental Energy and Will Power', 'Modesty',
       'Personal Harmony and Flexibility', 'Lack of Discipline',
       'Poor Concentration', 'Non Communicativeness', 'Social Isolation',
       'Image No'],
      dtype='object')

In [4]:
X = feat[['Baseline', 'Letter Size', 'Line Spacing', 'Word Spacing', 'Top Margin', 'Pen Pressure', 'Slant of Letters']]


In [5]:
y_cols = data[['Emotional Stability',
       'Mental Energy and Will Power', 'Modesty',
       'Personal Harmony and Flexibility', 'Lack of Discipline',
       'Poor Concentration', 'Non Communicativeness', 'Social Isolation']]

In [6]:
X

Unnamed: 0,Baseline,Letter Size,Line Spacing,Word Spacing,Top Margin,Pen Pressure,Slant of Letters
0,-0.39,5.70,12.29,4.37,2.46,171.38,-15
1,0.05,3.07,17.29,2.89,1.74,194.71,-15
2,-1.10,2.02,16.36,3.14,1.65,170.29,-15
3,-0.01,1.91,15.73,3.32,1.60,165.56,180
4,0.00,2.10,15.70,3.25,1.69,171.55,-15
...,...,...,...,...,...,...,...
1528,0.21,2.13,12.18,4.81,1.66,151.69,0
1529,0.21,2.47,12.55,4.59,1.45,148.78,180
1530,-0.02,3.95,12.40,4.70,1.59,148.31,180
1531,0.41,4.07,12.30,4.59,1.38,153.32,0


In [7]:
y_dict = {}
for i, k in enumerate(['Emotional Stability',
       'Mental Energy and Will Power', 'Modesty',
       'Personal Harmony and Flexibility', 'Lack of Discipline',
       'Poor Concentration', 'Non Communicativeness', 'Social Isolation']):
    print(k)
    y = y_cols[k]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    rf = RandomForestClassifier()
    rf.fit(X=X_train, y=y_train)
    y_pred = rf.predict(X_test)
    print("Accuracy score for ", end = " ")
    print(accuracy_score(y_true=y_test, y_pred=y_pred))
    print("Feature Importance", end = "")
    print(rf.feature_importances_)
    print("----------------------------------------")

Emotional Stability
Accuracy score for  1.0
Feature Importance[0.52481575 0.02059175 0.01728708 0.01979695 0.01990064 0.02174625
 0.37586158]
----------------------------------------
Mental Energy and Will Power
Accuracy score for  1.0
Feature Importance[0.02088829 0.039459   0.2340701  0.1441106  0.0463338  0.49553071
 0.0196075 ]
----------------------------------------
Modesty
Accuracy score for  1.0
Feature Importance[0.02011285 0.75191489 0.09984167 0.06830703 0.02970351 0.02007646
 0.0100436 ]
----------------------------------------
Personal Harmony and Flexibility
Accuracy score for  0.990228013029316
Feature Importance[0.01658886 0.03539093 0.13907379 0.27485186 0.4946191  0.02902158
 0.01045389]
----------------------------------------
Lack of Discipline
Accuracy score for  0.996742671009772
Feature Importance[0.04420535 0.32854521 0.05272999 0.03921223 0.03227403 0.04145696
 0.46157624]
----------------------------------------
Poor Concentration
Accuracy score for  1.0
Featu

# Hyper-paramter tuning

In [8]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
#bootstrap = [True]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}


In [10]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [11]:
y_pred = rf_random.predict(X_test)
rf_random.score(y_pred,y_test)

ValueError: Expected 2D array, got 1D array instead:
array=[1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1.
 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0.
 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0.
 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0.
 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0.
 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1.
 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0.
 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1.
 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [23]:
grid_best_params = {}
for i, k in enumerate(['Emotional Stability',
       'Mental Energy and Will Power', 'Modesty',
       'Personal Harmony and Flexibility', 'Lack of Discipline',
       'Poor Concentration', 'Non Communicativeness', 'Social Isolation']):
    print(k)
    y = y_cols[k]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
    rf_random.fit(X_train, y_train)
    
    print(rf_random.best_params_)
    grid_best_params[k] = rf_random.best_params_
    
    print("----------------------------------------")

Emotional Stability
Fitting 3 folds for each of 10 candidates, totalling 30 fits
{'n_estimators': 1200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 70}
----------------------------------------
Mental Energy and Will Power
Fitting 3 folds for each of 10 candidates, totalling 30 fits
{'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 70}
----------------------------------------
Modesty
Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
grid_best_params = {'Emotional Stability': {'n_estimators': 200,
  'min_samples_split': 10,
  'min_samples_leaf': 2,
  'max_features': 'sqrt',
  'max_depth': 50,
  'bootstrap': True},
 'Mental Energy and Will Power': {'n_estimators': 400,
  'min_samples_split': 10,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': 60,
  'bootstrap': False},
 'Modesty': {'n_estimators': 600,
  'min_samples_split': 2,
  'min_samples_leaf': 2,
  'max_features': 'auto',
  'max_depth': 60,
  'bootstrap': False},
 'Personal Harmony and Flexibility': {'n_estimators': 600,
  'min_samples_split': 2,
  'min_samples_leaf': 2,
  'max_features': 'auto',
  'max_depth': 60,
  'bootstrap': False},
 'Lack of Discipline': {'n_estimators': 400,
  'min_samples_split': 10,
  'min_samples_leaf': 1,
  'max_features': 'sqrt',
  'max_depth': 60,
  'bootstrap': False},
 'Poor Concentration': {'n_estimators': 200,
  'min_samples_split': 10,
  'min_samples_leaf': 2,
  'max_features': 'sqrt',
  'max_depth': 50,
  'bootstrap': True},
 'Non Communicativeness': {'n_estimators': 200,
  'min_samples_split': 10,
  'min_samples_leaf': 2,
  'max_features': 'sqrt',
  'max_depth': 50,
  'bootstrap': True},
 'Social Isolation': {'n_estimators': 600,
  'min_samples_split': 2,
  'min_samples_leaf': 2,
  'max_features': 'auto',
  'max_depth': 60,
  'bootstrap': False}}

In [None]:
for i, k in enumerate(['Emotional Stability',
       'Mental Energy and Will Power', 'Modesty',
       'Personal Harmony and Flexibility', 'Lack of Discipline',
       'Poor Concentration', 'Non Communicativeness', 'Social Isolation']):
    print(k)
    y = y_cols[k]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print(grid_best_params[k])
    rf_random = RandomForestClassifier(n_estimators =grid_best_params[k]['n_estimators'],
                                      min_samples_split  = grid_best_params[k]['min_samples_split'],
                                      min_samples_leaf = grid_best_params[k]['min_samples_leaf'],
                                      max_features = grid_best_params[k]['max_features'],
                                      max_depth = grid_best_params[k]['max_depth'],
                                      bootstrap = grid_best_params[k]['bootstrap'])
#     rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
    rf_random.fit(X_train, y_train)
    y_pred = rf_random.predict(X_test)
    
    print(accuracy_score(y_true=y_test, y_pred=y_pred))
    print("----------------------------------------")

In [57]:
grid_best_params

{}