In [43]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import metrics
import joblib
import pickle

In [77]:
df_train = pd.read_csv('Train.csv')

In [78]:
train_length = len(df_train)
df_test = pd.read_csv("Test.csv")
df_test

Unnamed: 0,Gender,Age,openness,neuroticism,conscientiousness,agreeableness,extraversion,Personality (class label)
0,Female,20,7,9,9,5,5,dependable
1,Male,17,5,4,5,2,4,serious
2,Female,25,5,5,7,2,4,serious
3,Female,18,6,2,7,4,7,serious
4,Female,19,2,4,7,1,3,responsible
...,...,...,...,...,...,...,...,...
310,Female,19,6,5,6,4,3,extraverted
311,Male,18,2,5,8,3,7,dependable
312,Male,18,7,5,6,2,7,serious
313,Male,23,6,7,5,4,3,extraverted


In [83]:
test_length = len(df_test)
df_train.rename(columns = {'Personality (Class label)':'Personality'}, inplace = True) 
df_test.rename(columns = {'Personality (class label)':'Personality'}, inplace = True)
#inpace = True indicates that the dataframe is updated but it will not return anything
# df.rename(columns = {'Personality (Class label)':'Personality'}, inplace = False)
df_train

Unnamed: 0,Gender,Age,openness,neuroticism,conscientiousness,agreeableness,extraversion,Personality
0,Male,17,7,4,7,3,2,extraverted
1,Male,19,4,5,4,6,6,serious
2,Female,18,7,6,4,5,5,dependable
3,Female,22,5,6,7,4,3,extraverted
4,Female,19,7,4,6,5,4,lively
...,...,...,...,...,...,...,...,...
704,Female,20,4,5,6,6,1,responsible
705,Male,18,6,3,1,5,5,dependable
706,Male,22,5,2,3,6,1,serious
707,Male,19,5,6,5,7,5,extraverted


In [84]:
df = pd.concat([df_train,df_test])
df

Unnamed: 0,Gender,Age,openness,neuroticism,conscientiousness,agreeableness,extraversion,Personality
0,Male,17,7,4,7,3,2,extraverted
1,Male,19,4,5,4,6,6,serious
2,Female,18,7,6,4,5,5,dependable
3,Female,22,5,6,7,4,3,extraverted
4,Female,19,7,4,6,5,4,lively
...,...,...,...,...,...,...,...,...
310,Female,19,6,5,6,4,3,extraverted
311,Male,18,2,5,8,3,7,dependable
312,Male,18,7,5,6,2,7,serious
313,Male,23,6,7,5,4,3,extraverted


In [85]:
df.head(10)

Unnamed: 0,Gender,Age,openness,neuroticism,conscientiousness,agreeableness,extraversion,Personality
0,Male,17,7,4,7,3,2,extraverted
1,Male,19,4,5,4,6,6,serious
2,Female,18,7,6,4,5,5,dependable
3,Female,22,5,6,7,4,3,extraverted
4,Female,19,7,4,6,5,4,lively
5,Male,18,5,7,7,6,4,lively
6,Female,17,5,6,5,7,4,extraverted
7,Female,19,6,6,7,5,4,extraverted
8,Male,18,5,7,5,6,7,dependable
9,Female,19,5,5,7,4,5,lively


In [86]:
# Label Encoding converts labels into numeric form so as to convert them into machine readable format.
label = LabelEncoder() # Targets values between 0 to n_classes-1
df['Gender'] = label.fit_transform(df['Gender'])
df.head(10) 

Unnamed: 0,Gender,Age,openness,neuroticism,conscientiousness,agreeableness,extraversion,Personality
0,2,17,7,4,7,3,2,extraverted
1,2,19,4,5,4,6,6,serious
2,1,18,7,6,4,5,5,dependable
3,1,22,5,6,7,4,3,extraverted
4,1,19,7,4,6,5,4,lively
5,2,18,5,7,7,6,4,lively
6,1,17,5,6,5,7,4,extraverted
7,1,19,6,6,7,5,4,extraverted
8,2,18,5,7,5,6,7,dependable
9,1,19,5,5,7,4,5,lively


In [87]:
# Standard Scaler standaradizes the a feature by subtracting the mean and then scaling to unit variance
# Unit variance means dividing all values by standard deviation
scaler = StandardScaler(with_std=1)
#default it takes std = 1 and mean = 0 when with_std and with_mean are False
input_columns = ['Gender', 'Age', 'openness', 'neuroticism','conscientiousness', 'agreeableness','extraversion']
output_columns = ['Personality']
df[input_columns] = scaler.fit_transform(df[input_columns])

In [88]:
df.head()

Unnamed: 0,Gender,Age,openness,neuroticism,conscientiousness,agreeableness,extraversion,Personality
0,1.048042,-1.118659,1.332469,-0.367643,1.332579,-1.105651,-1.652627,extraverted
1,1.048042,-0.335674,-0.634539,0.217841,-0.402032,0.703917,0.921198,serious
2,-0.946744,-0.727166,1.332469,0.803325,-0.402032,0.100728,0.277742,dependable
3,-0.946744,0.838803,0.02113,0.803325,1.332579,-0.502461,-1.009171,extraverted
4,-0.946744,-0.335674,1.332469,-0.367643,0.754375,0.100728,-0.365714,lively


In [89]:
df_train = df[:train_length]
df_test = df[train_length:]
X = df_train[input_columns]
Y = df_train[output_columns]

In [90]:
X

Unnamed: 0,Gender,Age,openness,neuroticism,conscientiousness,agreeableness,extraversion
0,1.048042,-1.118659,1.332469,-0.367643,1.332579,-1.105651,-1.652627
1,1.048042,-0.335674,-0.634539,0.217841,-0.402032,0.703917,0.921198
2,-0.946744,-0.727166,1.332469,0.803325,-0.402032,0.100728,0.277742
3,-0.946744,0.838803,0.021130,0.803325,1.332579,-0.502461,-1.009171
4,-0.946744,-0.335674,1.332469,-0.367643,0.754375,0.100728,-0.365714
...,...,...,...,...,...,...,...
704,-0.946744,0.055818,-0.634539,0.217841,0.754375,0.703917,-2.296083
705,1.048042,-0.727166,0.676799,-0.953126,-2.136644,0.100728,0.277742
706,1.048042,0.838803,0.021130,-1.538610,-0.980236,0.703917,-2.296083
707,1.048042,-0.335674,0.021130,0.803325,0.176171,1.307107,0.277742


In [91]:
Y

Unnamed: 0,Personality
0,extraverted
1,serious
2,dependable
3,extraverted
4,lively
...,...
704,responsible
705,dependable
706,serious
707,extraverted


In [108]:
# Assumption : We assume that data is identically distributed [Check]
df_train['Personality'].value_counts()

serious        161
extraverted    150
dependable     138
lively         134
responsible    126
Name: Personality, dtype: int64

In [109]:
df_test['Personality'].value_counts()

serious        153
extraverted     77
responsible     40
lively          24
dependable      21
Name: Personality, dtype: int64

In [100]:
model = LogisticRegression(multi_class='multinomial',solver='newton-cg',max_iter=10000)
model.fit(X,Y)

  y = column_or_1d(y, warn=True)


LogisticRegression(max_iter=10000, multi_class='multinomial',
                   solver='newton-cg')

In [101]:
X_test = df_test[input_columns]
Y_test = df_test[output_columns]

In [102]:
X_test

Unnamed: 0,Gender,Age,openness,neuroticism,conscientiousness,agreeableness,extraversion
0,-0.946744,0.055818,1.332469,2.559776,2.488986,0.100728,0.277742
1,1.048042,-1.118659,0.021130,-0.367643,0.176171,-1.708840,-0.365714
2,-0.946744,2.013280,0.021130,0.217841,1.332579,-1.708840,-0.365714
3,-0.946744,-0.727166,0.676799,-1.538610,1.332579,-0.502461,1.564654
4,-0.946744,-0.335674,-1.945878,-0.367643,1.332579,-2.312029,-1.009171
...,...,...,...,...,...,...,...
310,-0.946744,-0.335674,0.676799,0.217841,0.754375,-0.502461,-1.009171
311,1.048042,-0.727166,-1.945878,0.217841,1.910783,-1.105651,1.564654
312,1.048042,-0.727166,1.332469,0.217841,0.754375,-1.708840,1.564654
313,1.048042,1.230295,0.676799,1.388809,0.176171,-0.502461,-1.009171


In [103]:
Y_test

Unnamed: 0,Personality
0,dependable
1,serious
2,serious
3,serious
4,responsible
...,...
310,extraverted
311,dependable
312,serious
313,extraverted


In [104]:
Predicted_class = model.predict(X_test)
len(Predicted_class)

315

In [105]:
Predicted_class

array(['dependable', 'responsible', 'serious', 'serious', 'responsible',
       'serious', 'serious', 'serious', 'serious', 'serious', 'serious',
       'responsible', 'responsible', 'serious', 'lively', 'extraverted',
       'serious', 'serious', 'serious', 'extraverted', 'serious',
       'extraverted', 'serious', 'extraverted', 'serious', 'serious',
       'extraverted', 'extraverted', 'serious', 'responsible', 'serious',
       'lively', 'extraverted', 'lively', 'serious', 'serious', 'serious',
       'serious', 'extraverted', 'serious', 'responsible', 'responsible',
       'serious', 'lively', 'serious', 'serious', 'responsible',
       'dependable', 'serious', 'extraverted', 'serious', 'serious',
       'serious', 'lively', 'extraverted', 'serious', 'extraverted',
       'serious', 'serious', 'serious', 'responsible', 'serious',
       'dependable', 'serious', 'serious', 'responsible', 'serious',
       'serious', 'serious', 'serious', 'serious', 'serious', 'serious',
       'ser

In [106]:
print("Accuracy: ",metrics.accuracy_score(Y_test, Predicted_class)*100)
print("Confusion Matrix: ")
print(metrics.confusion_matrix(Y_test, Predicted_class))
print("Classification Report: ")
print(metrics.classification_report(Y_test,Predicted_class))
# print("Precision: ",metrics.precision_score(Y_test, Predicted_class))
# print("Recall: ",metrics.recall_score(Y_test, Predicted_class))
# print("F - Measure: ",metrics.f1_score(Y_test,Predicted_class))

Accuracy:  84.12698412698413
Confusion Matrix: 
[[  7   4   1   0   9]
 [  0  70   1   0   6]
 [  0   3  18   1   2]
 [  2   4   0  26   8]
 [  1   6   1   1 144]]
Classification Report: 
              precision    recall  f1-score   support

  dependable       0.70      0.33      0.45        21
 extraverted       0.80      0.91      0.85        77
      lively       0.86      0.75      0.80        24
 responsible       0.93      0.65      0.76        40
     serious       0.85      0.94      0.89       153

    accuracy                           0.84       315
   macro avg       0.83      0.72      0.75       315
weighted avg       0.84      0.84      0.83       315



In [107]:
# Linking model to application
joblib.dump(model, 'Personality Prediction Model.pkl')

['Personality Prediction Model.pkl']