In [53]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [60]:
# Load the personality-traits CSV and build `dataset` without its first column.
# NOTE(review): the dropped column is presumably a row index written out by an
# earlier to_csv call — confirm against the file.
raw_dataset = pd.read_csv("improved_personality_traits_datection_dataset.csv")
dataset = raw_dataset.drop(columns=raw_dataset.columns[0])
dataset

Unnamed: 0,Review,Extroversion,Neuroticism,Agreeableness,Conscientiousness,Openness
0,"""Well, right now I just woke up from a mid-day...",0,1,1,0,1
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,"""An open keyboard and buttons to push. The thi...",0,1,0,1,1
3,I can't believe it! It's really happening! M...,1,0,1,1,0
4,"""Well, here I go with the good old stream of c...",1,0,1,0,1
...,...,...,...,...,...,...
2447,I'm home. wanted to go to bed but remembe...,0,1,0,1,0
2448,Stream of consiousnesssskdj. How do you s...,1,1,0,0,1
2449,"It is Wednesday, December 8th and a lot has be...",0,0,1,0,0
2450,"""Man this week has been hellish. Anyways, now ...",0,1,0,0,1


In [61]:
# Peek at the raw free-text column that the cleaning loop below consumes.
dataset['Review']

0       "Well, right now I just woke up from a mid-day...
1       Well, here we go with the stream of consciousn...
2       "An open keyboard and buttons to push. The thi...
3       I can't believe it!  It's really happening!  M...
4       "Well, here I go with the good old stream of c...
                              ...                        
2447         I'm home. wanted to go to bed but remembe...
2448         Stream of consiousnesssskdj. How do you s...
2449    It is Wednesday, December 8th and a lot has be...
2450    "Man this week has been hellish. Anyways, now ...
2451    I have just gotten off the phone with brady. I...
Name: Review, Length: 2452, dtype: object

In [62]:
# Dependencies for cleaning the texts: `re` to strip non-letter characters,
# NLTK for the English stop-word list and the Porter stemmer.
import re
import nltk
nltk.download('stopwords')  # fetches the stop-word corpus; a no-op when it is already up to date
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /home/hritik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
# Build `corpus`: one cleaned, stemmed, space-joined string per review.
#
# The stemmer, the stop-word set and the compiled pattern do not change
# between reviews, so they are created once up front. Rebuilding the
# stop-word set for every token is needlessly slow.
ps = PorterStemmer()
english_stopwords = frozenset(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = [] #cleaned reviews

# Iterate over the column values directly instead of dataset['Review'][i]:
# that form is a label lookup and only works while the index is exactly
# 0..len(dataset)-1.
for raw_review in dataset['Review']:
    # Replace every non-letter with a space, lowercase, then split on whitespace.
    words = non_letters.sub(' ', raw_review).lower().split()
    # Drop stop words and reduce each remaining word to its Porter stem.
    review = ' '.join(ps.stem(word) for word in words if word not in english_stopwords)
    corpus.append(review)

In [64]:
# Number of cleaned reviews; the cleaning loop appends exactly one entry per row of `dataset`.
len(corpus)

2452

In [65]:
# Inspect the first cleaned review: lowercase Porter stems joined by single spaces.
corpus[0]

'english second languag probabl grammar mistak phrase make sens watch realli good movi one movi realli keep think anyway want talk know talk lot read chose lot read subject abl handl matter organ correctli time busi better academ result alway like lot thing organ time better sit front tv practic noth love though prefer busi lot respons semest take hour per week also student organ belli danc class intramur soccer besid adapt new univers freshman idea like task seem pretti pointless useless like write whatev french would best defin n import quoi love franc realli miss countri think ever live know strang feel stay us realli want know end find tempt work marri guy well know sixth sens life give mani turn life like box chocol never know get heheheh forrest gump wonder tom hank right last movi coupl year ago new one retir dedic profit career left mother might think lot know miss time realli happi know follow dream florent abl dump someth extrem difficult use weekli call although love like so

In [66]:
# Bag-of-words features: one column per vocabulary term, one row per review.
from sklearn.feature_extraction.text import CountVectorizer

# Restrict the vocabulary to the 1500 most frequent terms across the corpus.
cv = CountVectorizer(max_features=1500)

# fit_transform yields a SciPy sparse matrix; toarray() turns it into a dense ndarray.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Target vector: the column at position 1 of `dataset`, cast to int so that
# scikit-learn sees a numeric label type rather than an object-dtype array.
# NOTE(review): position 1 is presumably the 'Extroversion' column — confirm.
y = dataset.iloc[:, 1].values.astype('int')

# No dimensionality reduction is applied to X before modelling.

In [67]:
# X is a dense 2-D NumPy array of term counts (2452 rows, up to 1500 columns);
# it was densified with .toarray(), so it is no longer a SciPy sparse matrix.
print(X)
# one column per vocabulary word
# one row per review

[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 3 0 ... 0 0 0]]


In [68]:
# Integer label vector; entry k is the label for row k of X.
print(y)

[0 0 0 ... 0 0 0]


In [69]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# The dataset is small, so high accuracy is not expected.

In [70]:
# Fit a Gaussian Naive Bayes classifier on the training split.
# NOTE(review): GaussianNB models each feature as normally distributed; for
# word-count features scikit-learn's MultinomialNB is the usual choice, so it
# is worth comparing the two.
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [71]:
# Predict a label for every row of the held-out test features.
y_pred = classifier.predict(X_test)
print(y_pred)

[1 1 0 1 0 1 0 1 1 0 0 0 0 1 0 1 0 1 1 1 1 1 1 1 1 0 0 0 1 0 1 1 0 1 0 1 0
 1 1 0 0 1 0 0 1 0 1 0 0 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 1 0 1 0 0 1 0 1 1 0
 0 1 0 0 0 0 1 1 1 0 1 1 0 1 0 1 1 1 0 1 1 0 0 1 1 0 1 1 1 0 1 1 1 1 0 0 0
 1 1 0 0 1 0 1 1 0 1 0 1 1 0 1 0 0 0 1 1 1 0 1 1 0 0 0 1 0 1 0 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0
 0 1 0 1 1 0 0 1 1 1 1 1 1 1 0 1 0 1 0 1 1 0 0 1 1 0 1 0 1 0 1 1 0 0 1 0 1
 0 0 1 0 1 1 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1 0 1 0 1
 0 1 1 0 0 1 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 1 1 0 0 1 1 0 1 1
 1 1 1 1 0 0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 0 0 1 1 0 1 0 0 1 1 0 1
 0 0 0 1 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 1 0 0 1 0 0 0 1
 1 0 1 0 1 1 0 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 0 1 0 1 0 0 0 1 0 0 0 1 1 1 1
 1 1 1 1 1 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 1 1 1 0 0 0 1 0 1 1 1 1 1
 1 1 1 0 1 0 0 0 1 1 1 0 1 1 0 1 0 0 1 1 0 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 0
 1 0 0 0 1 1 0 0 0 0]


In [72]:
# One prediction per test row, so this equals the size of the 20% test split.
len(y_pred)

491

In [73]:
# Confusion matrix of true vs. predicted labels on the test split:
# rows are the true classes, columns are the predicted classes.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print(cm)

[[ 99 128]
 [115 149]]


In [74]:
# Accuracy on the held-out test split: the fraction of test labels predicted correctly.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

# test-set accuracy (not training accuracy)

0.505091649694501

In [78]:
# Confirm which estimator class `classifier` currently holds.
type(classifier)

sklearn.naive_bayes.GaussianNB