In [11]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [12]:
# Load the personality survey CSV and preview its first five rows as plain text.
dataset = pd.read_csv(filepath_or_buffer='personality_dataset.csv')
print(dataset.head(n=5))

   Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0               4.0         No                      4.0            6.0   
1               9.0        Yes                      0.0            0.0   
2               9.0        Yes                      1.0            2.0   
3               0.0         No                      6.0            7.0   
4               3.0         No                      9.0            4.0   

  Drained_after_socializing  Friends_circle_size  Post_frequency Personality  
0                        No                 13.0             5.0   Extrovert  
1                       Yes                  0.0             3.0   Introvert  
2                       Yes                  5.0             2.0   Introvert  
3                        No                 14.0             8.0   Extrovert  
4                        No                  8.0             5.0   Extrovert  


In [13]:
# Print a structural summary of the frame: index range, per-column non-null
# counts and dtypes, and memory usage. Used here to spot columns with gaps.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           2837 non-null   float64
 1   Stage_fear                 2827 non-null   object 
 2   Social_event_attendance    2838 non-null   float64
 3   Going_outside              2834 non-null   float64
 4   Drained_after_socializing  2848 non-null   object 
 5   Friends_circle_size        2823 non-null   float64
 6   Post_frequency             2835 non-null   float64
 7   Personality                2900 non-null   object 
dtypes: float64(5), object(3)
memory usage: 181.4+ KB


In [14]:
# Sanity check that the CSV actually contained data (expected to display False).
dataset.empty

False

In [15]:
# Show the number of missing values in each column.
dataset.isnull().sum(axis=0)

Time_spent_Alone             63
Stage_fear                   73
Social_event_attendance      62
Going_outside                66
Drained_after_socializing    52
Friends_circle_size          77
Post_frequency               65
Personality                   0
dtype: int64

In [16]:
# Handle missing values: discard every row that has at least one NaN.
# Note that this rebinds `dataset`, so the raw frame is no longer available
# after this cell runs.
dataset = dataset.dropna(axis=0, how='any')

In [17]:
# Re-count missing values per column; every count should now be zero.
dataset.isnull().sum(axis=0)

Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
Personality                  0
dtype: int64

In [18]:
# Encode the target column as integers (in the displayed rows, Extrovert
# becomes 0 and Introvert becomes 1).
#
# The fitted encoder is kept under its own name because `en` is rebound by
# the feature-encoding cells that follow, which would otherwise discard the
# class-name <-> integer mapping needed to translate predictions back to
# labels (personality_encoder.inverse_transform / .classes_).
personality_encoder = LabelEncoder()
en = personality_encoder  # legacy alias, preserved for backward compatibility

dataset['Personality'] = personality_encoder.fit_transform(dataset['Personality'])
dataset

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,0
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,1
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,1
3,0.0,No,6.0,7.0,No,14.0,8.0,0
4,3.0,No,9.0,4.0,No,8.0,5.0,0
...,...,...,...,...,...,...,...,...
2892,9.0,Yes,2.0,0.0,Yes,1.0,2.0,1
2895,3.0,No,7.0,6.0,No,6.0,6.0,0
2896,3.0,No,8.0,3.0,No,14.0,9.0,0
2897,4.0,Yes,1.0,1.0,Yes,4.0,0.0,1


In [19]:
# Convert the Stage_fear column to integer codes
# (in the displayed rows, No becomes 0 and Yes becomes 1).
en = LabelEncoder()
en.fit(dataset['Stage_fear'])

dataset['Stage_fear'] = en.transform(dataset['Stage_fear'])
dataset

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,0,4.0,6.0,No,13.0,5.0,0
1,9.0,1,0.0,0.0,Yes,0.0,3.0,1
2,9.0,1,1.0,2.0,Yes,5.0,2.0,1
3,0.0,0,6.0,7.0,No,14.0,8.0,0
4,3.0,0,9.0,4.0,No,8.0,5.0,0
...,...,...,...,...,...,...,...,...
2892,9.0,1,2.0,0.0,Yes,1.0,2.0,1
2895,3.0,0,7.0,6.0,No,6.0,6.0,0
2896,3.0,0,8.0,3.0,No,14.0,9.0,0
2897,4.0,1,1.0,1.0,Yes,4.0,0.0,1


In [20]:
# Convert the Drained_after_socializing column to integer codes
# (in the displayed rows, No becomes 0 and Yes becomes 1).
en = LabelEncoder()

dataset['Drained_after_socializing'] = (
    en.fit(dataset['Drained_after_socializing'])
      .transform(dataset['Drained_after_socializing'])
)
dataset

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,0,4.0,6.0,0,13.0,5.0,0
1,9.0,1,0.0,0.0,1,0.0,3.0,1
2,9.0,1,1.0,2.0,1,5.0,2.0,1
3,0.0,0,6.0,7.0,0,14.0,8.0,0
4,3.0,0,9.0,4.0,0,8.0,5.0,0
...,...,...,...,...,...,...,...,...
2892,9.0,1,2.0,0.0,1,1.0,2.0,1
2895,3.0,0,7.0,6.0,0,6.0,6.0,0
2896,3.0,0,8.0,3.0,0,14.0,9.0,0
2897,4.0,1,1.0,1.0,1,4.0,0.0,1


In [21]:
# Preview the first five rows now that every column is numeric.
dataset.head(n=5)

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,0,4.0,6.0,0,13.0,5.0,0
1,9.0,1,0.0,0.0,1,0.0,3.0,1
2,9.0,1,1.0,2.0,1,5.0,2.0,1
3,0.0,0,6.0,7.0,0,14.0,8.0,0
4,3.0,0,9.0,4.0,0,8.0,5.0,0


In [22]:
# Feature matrix: every column except the last one.
x = dataset[dataset.columns[:-1]].values
# Target vector: the last column (the encoded Personality label).
y = dataset[dataset.columns[-1]].values

In [23]:
# Display the feature matrix (one row per sample, one column per feature).
x

array([[ 4.,  0.,  4., ...,  0., 13.,  5.],
       [ 9.,  1.,  0., ...,  1.,  0.,  3.],
       [ 9.,  1.,  1., ...,  1.,  5.,  2.],
       ...,
       [ 3.,  0.,  8., ...,  0., 14.,  9.],
       [ 4.,  1.,  1., ...,  1.,  4.,  0.],
       [ 3.,  0.,  6., ...,  0.,  6.,  9.]], shape=(2477, 7))

In [24]:
# Display the target vector of encoded Personality labels.
y

array([0, 1, 1, ..., 0, 1, 0], shape=(2477,))

In [25]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=125,
)

# Report the size of each partition.
for caption, subset in (
    ("x_train = ", x_train),
    ("x_test = ", x_test),
    ("y_train = ", y_train),
    ("y_test = ", y_test),
):
    print(caption, len(subset))

x_train =  1981
x_test =  496
y_train =  1981
y_test =  496


In [26]:
# Train a Gaussian Naive Bayes model on the training partition.
classifier = GaussianNB()
classifier.fit(X=x_train, y=y_train)

In [27]:
# Predict the encoded class label for every held-out sample and display them.
y_pred = classifier.predict(X=x_test)
y_pred

array([1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,

In [28]:
# Display the model's class-probability estimates for the test set:
# one row per sample, one column per class.
classifier.predict_proba(x_test)

array([[1.04687385e-08, 9.99999990e-01],
       [1.00000000e+00, 3.84132640e-17],
       [1.00000000e+00, 5.81503967e-16],
       [1.84020787e-11, 1.00000000e+00],
       [1.00000000e+00, 4.17095736e-15],
       [5.48983317e-10, 9.99999999e-01],
       [2.09403713e-11, 1.00000000e+00],
       [1.00000000e+00, 9.67345602e-18],
       [1.00000000e+00, 8.27105238e-16],
       [1.00000000e+00, 2.26323904e-14],
       [1.00000000e+00, 1.19197392e-12],
       [1.00000000e+00, 2.56084581e-12],
       [6.04865656e-09, 9.99999994e-01],
       [2.43860747e-09, 9.99999998e-01],
       [1.00000000e+00, 4.91367273e-16],
       [1.00000000e+00, 1.85430243e-10],
       [1.00000000e+00, 1.43541199e-17],
       [1.00000000e+00, 1.26455877e-12],
       [6.83934427e-10, 9.99999999e-01],
       [9.48718278e-11, 1.00000000e+00],
       [1.00000000e+00, 8.88762684e-18],
       [3.23105973e-10, 1.00000000e+00],
       [1.00000000e+00, 3.63689812e-20],
       [2.39450938e-10, 1.00000000e+00],
       [3.746095

In [29]:
# Cross-tabulate true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[232  19]
 [ 21 224]]


In [30]:
# Per-class precision, recall, F1 and support for the test predictions.
# The text report gets its own name instead of `akurasi`, which the accuracy
# cell uses for a float; sharing the name meant re-running this cell silently
# replaced the numeric accuracy with a string.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       251
           1       0.92      0.91      0.92       245

    accuracy                           0.92       496
   macro avg       0.92      0.92      0.92       496
weighted avg       0.92      0.92      0.92       496



In [31]:
# Overall fraction of test samples classified correctly.
akurasi = accuracy_score(y_test, y_pred)
# Use a float conversion: '%d' truncates toward zero, so 91.9% was reported
# as "91" even though the classification report rounds it to 0.92.
print('Tingkat AKURASI : %.2f persen' % (akurasi * 100))

Tingkat AKURASI : 91 persen


In [32]:
# Side-by-side table of true vs. predicted labels for the test set.
# Built in a single constructor call from the two arrays rather than by
# assigning one-column DataFrames into an empty frame, which relied on
# implicit index alignment to produce the same result.
ydata = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
ydata

Unnamed: 0,y_test,y_pred
0,1,1
1,0,0
2,0,0
3,1,1
4,0,0
...,...,...
491,0,0
492,1,1
493,1,1
494,0,0
