In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [3]:
# Load data
df = pd.read_csv("personality_datasert.csv")
df

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert
...,...,...,...,...,...,...,...,...
2895,3.0,No,7.0,6.0,No,6.0,6.0,Extrovert
2896,3.0,No,8.0,3.0,No,14.0,9.0,Extrovert
2897,4.0,Yes,1.0,1.0,Yes,4.0,0.0,Introvert
2898,11.0,Yes,1.0,3.0,Yes,2.0,0.0,Introvert


In [4]:
# encoding
df['Stage_fear'] = df['Stage_fear'].map({"Yes":1, "No":0})
df['Drained_after_socializing'] = df['Drained_after_socializing'].map({"Yes":1, "No":0})

In [5]:
#
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time_spent_Alone,2900.0,4.505816,3.44118,0.0,2.0,4.0,7.0,11.0
Stage_fear,2900.0,0.486207,0.499896,0.0,0.0,0.0,1.0,1.0
Social_event_attendance,2900.0,3.963354,2.872608,0.0,2.0,3.963354,6.0,10.0
Going_outside,2900.0,3.0,2.221597,0.0,1.0,3.0,5.0,7.0
Drained_after_socializing,2900.0,0.485172,0.499866,0.0,0.0,0.0,1.0,1.0
Friends_circle_size,2900.0,6.268863,4.23234,0.0,3.0,5.0,10.0,15.0
Post_frequency,2900.0,3.564727,2.893587,0.0,1.0,3.0,6.0,10.0


In [6]:
# Check for missing values
df.isnull().value_counts()

Time_spent_Alone  Stage_fear  Social_event_attendance  Going_outside  Drained_after_socializing  Friends_circle_size  Post_frequency  Personality
False             False       False                    False          False                      False                False           False          2900
Name: count, dtype: int64

In [7]:
# Setting X and Y, X without "Drained_after_sociolizing", and "Post_frequency"
X_1 =df[["Time_spent_Alone","Stage_fear", "Social_event_attendance", "Going_outside", "Drained_after_socializing", "Friends_circle_size", "Post_frequency"]]
Y = df["Personality"]

In [8]:
# Split the data
X_train, X_test, Y_train, Y_test = train_test_split(X_1, Y, test_size=.3, random_state=42)

In [9]:
X_train

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
654,1.0,0,5.0,7.0,0,11.0,8.0
1999,5.0,1,1.0,0.0,1,5.0,1.0
1990,11.0,1,1.0,0.0,1,4.0,0.0
1187,9.0,1,2.0,0.0,1,4.0,1.0
821,3.0,0,4.0,7.0,0,8.0,5.0
...,...,...,...,...,...,...,...
1638,2.0,0,7.0,4.0,0,12.0,8.0
1095,0.0,0,7.0,4.0,0,14.0,6.0
1130,11.0,1,1.0,1.0,1,4.0,2.0
1294,10.0,1,3.0,1.0,1,5.0,0.0


In [10]:
josue =[
    [6.0, 0, 2.0, 3.0, 0, 4.0, 1.0]]
Josue = pd.DataFrame(josue, columns=["Time_spent_Alone","Stage_fear", "Social_event_attendance", "Going_outside", "Drained_after_socializing", "Friends_circle_size", "Post_frequency"])
Josue

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,6.0,0,2.0,3.0,0,4.0,1.0


In [11]:
# Fit First model
dtc = tree.DecisionTreeClassifier()
dtc = dtc.fit(X_train, Y_train)

In [12]:
# Fit Second Model

etc = tree.ExtraTreeClassifier()
etc = etc.fit(X_train, Y_train)

In [13]:
# models prediction and accurcy

dtc_pred = dtc.predict(X_test)
dtc_acc = accuracy_score(dtc_pred, Y_test)

etc_pred = etc.predict(X_test)
etc_acc = accuracy_score(etc_pred, Y_test)
print(f"Accuracy Scores:\nDecision Tree Classifier: {dtc_acc *100}\nExtra Tree Classifier: {etc_acc *100}")

Accuracy Scores:
Decision Tree Classifier: 85.74712643678161
Extra Tree Classifier: 87.93103448275862


In [14]:
# Extra Tree Classifier's Confusion Matrix
etc_con = confusion_matrix(etc_pred, Y_test)
print(etc_con)

[[410  52]
 [ 53 355]]


In [15]:
# Extra Tree Classifier's Classification Report
etc_cla = classification_report(etc_pred, Y_test)
print(etc_cla)

              precision    recall  f1-score   support

   Extrovert       0.89      0.89      0.89       462
   Introvert       0.87      0.87      0.87       408

    accuracy                           0.88       870
   macro avg       0.88      0.88      0.88       870
weighted avg       0.88      0.88      0.88       870



In [16]:
# with GradientBoostingClassifier, RandomForrest, ExtraTrees
gbc = GradientBoostingClassifier(n_estimators=50,max_depth=4, learning_rate=0.1, random_state=42)
gbc = gbc.fit(X_train, Y_train)
gbc_pred = gbc.predict(X_test)
gbc_acc = accuracy_score(Y_test, gbc_pred)

rfc = RandomForestClassifier(n_estimators=50,max_depth=4, random_state=42)
rfc = rfc.fit(X_train, Y_train)
rfc_pred = rfc.predict(X_test)
rfc_acc = accuracy_score(Y_test, rfc_pred)

eTsc = ExtraTreesClassifier(n_estimators=50,max_depth=4, random_state=42)
eTsc = eTsc.fit(X_train, Y_train)
eTsc_pred = eTsc.predict(X_test)
eTsc_acc = accuracy_score(Y_test, eTsc_pred)

hgbc = HistGradientBoostingClassifier(max_depth=4, random_state=42)
hgbc = hgbc.fit(X_train, Y_train)
hgbc_pred = hgbc.predict(X_test)
hgbc_acc = accuracy_score(Y_test, hgbc_pred)
print(f"Accuracy Scores:\nGradient Boost Classifier: {gbc_acc*100}\nRandom Forrest Classifier: {rfc_acc*100}\nExtra Trees Classifier: {eTsc_acc*100}\nHistGradientBoost Tree: {hgbc_acc*100}")

Accuracy Scores:
Gradient Boost Classifier: 92.64367816091954
Random Forrest Classifier: 92.64367816091954
Extra Trees Classifier: 92.64367816091954
HistGradientBoost Tree: 92.52873563218391


In [17]:
# Classification report for Gradient Boost Classifier

gbc_cla = classification_report(Y_test, gbc_pred)
print(gbc_cla)

              precision    recall  f1-score   support

   Extrovert       0.94      0.92      0.93       463
   Introvert       0.91      0.93      0.92       407

    accuracy                           0.93       870
   macro avg       0.93      0.93      0.93       870
weighted avg       0.93      0.93      0.93       870



In [18]:
josue_syvelsaint = gbc.predict(Josue)
print(f"Josue is a/an {josue_syvelsaint[0]}")

Josue is a/an Introvert
