In [50]:
!pip install liac-arff
!pip install scikit-plot



In [51]:
# import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import requests
import arff

In [52]:
from sklearn.feature_extraction.text import CountVectorizer

# Load the ARFF file. The context manager guarantees the file handle is closed
# as soon as parsing finishes (the original left the handle open).
with open('6-Movie_reviews-sentiments.arff') as arff_file:
    movie_arff = arff.load(arff_file)

# The first element of each attribute entry is used as the column name
col_val = [attribute[0] for attribute in movie_arff['attributes']]

# Create a pandas dataframe based on data and attribute names
movie_df = pd.DataFrame(movie_arff['data'], columns=col_val)

# Separate each of the reviews into a list
text_data = movie_df['text'].tolist()

# CountVectorizer converts a collection of text documents to a matrix of token counts.
# binary=True records presence/absence (1/0) rather than the number of occurrences;
# max_features=1000 caps the vocabulary at 1000 terms.
vectorizer = CountVectorizer(binary=True, max_features=1000)

# Learn the vocabulary and build the bag-of-words matrix in a single step
bag_of_words = vectorizer.fit_transform(text_data)

# Print the bag of words representation.
# Each row is a review and each column is a word: the value is 1 if the word
# occurs in the review, otherwise 0.
print(bag_of_words.toarray())
# Show an array of all the words (one per column of the matrix above)
vectorizer.get_feature_names_out()

[[1 0 0 ... 1 0 1]
 [0 0 0 ... 1 0 1]
 [0 0 0 ... 1 1 0]
 ...
 [1 1 0 ... 1 1 1]
 [1 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 1]]


array(['10', 'ability', 'able', 'about', 'above', 'absolutely', 'across',
       'act', 'acting', 'action', 'actor', 'actors', 'actress', 'actual',
       'actually', 'add', 'after', 'again', 'against', 'age', 'agent',
       'ago', 'air', 'alien', 'all', 'almost', 'alone', 'along',
       'already', 'also', 'although', 'always', 'am', 'amazing',
       'america', 'american', 'among', 'amount', 'amusing', 'an', 'and',
       'annoying', 'another', 'any', 'anyone', 'anything', 'anyway',
       'apparently', 'appear', 'appearance', 'appears', 'are', 'aren',
       'around', 'art', 'as', 'ask', 'asks', 'aspect', 'at', 'atmosphere',
       'attempt', 'attempts', 'attention', 'audience', 'audiences',
       'away', 'awful', 'back', 'background', 'bad', 'based', 'basically',
       'battle', 'be', 'beautiful', 'because', 'become', 'becomes',
       'been', 'before', 'begin', 'beginning', 'begins', 'behind',
       'being', 'believable', 'believe', 'best', 'better', 'between',
       'beyond'

Feature (word) ranking

In [53]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# Feature matrix: each row is a review and each column is a word
# (1 if the word is in the review, otherwise 0)
X = bag_of_words
# The output column (positive or negative)
y = movie_df.iloc[:,1]

# Use mutual information to score every word against the output column.
# SelectKBest is left at its default k; the ranking below uses the scores of
# all features, not just the selected ones.
selector = SelectKBest(mutual_info_classif)
selector.fit(X, y)
scores = selector.scores_

# Get the indices of the selected features
selected_features_indices = selector.get_support(indices=True)

# Get an array of all the words
feature_names = vectorizer.get_feature_names_out()

# Create a dictionary that maps feature names to their scores
score_dict = dict(zip(feature_names, scores))

# Rank the (word, score) pairs by score in descending order.
# NOTE: despite its name, `sorted_dict` is a list of tuples, not a dict.
sorted_dict = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)

# Print the top-ranked words as "rank word score"
# (39 entries, matching what the original counter loop printed).
N_TOP_WORDS = 39
for rank, (word, score) in enumerate(sorted_dict[:N_TOP_WORDS], start=1):
    print (rank, word, score)

1 bad 0.03477498144284996
2 worst 0.028714979914038874
3 stupid 0.020421896579034572
4 boring 0.019927310871732898
5 ridiculous 0.017064387259379935
6 waste 0.017064387259379935
7 awful 0.016837018054957242
8 mess 0.014520313687811828
9 perfect 0.013126721531831503
10 life 0.012934632927101675
11 supposed 0.012921570094156176
12 memorable 0.012120360451981053
13 dull 0.011714396975496873
14 excellent 0.010943278387290645
15 perfectly 0.010564821442847538
16 both 0.010555800671642135
17 script 0.010544679447932359
18 plot 0.01023763627065799
19 subtle 0.010237251853571668
20 performances 0.010157821863555544
21 terrible 0.010002403823255318
22 effective 0.009912354410119187
23 wonderful 0.00989379981573578
24 also 0.009462542466734059
25 true 0.009219236722203428
26 world 0.008872061750723137
27 hilarious 0.008776276815859171
28 great 0.008637316079904839
29 brilliant 0.008415205149675434
30 nothing 0.00839483924287232
31 others 0.00837362669978342
32 worse 0.008343364360560462
33 unfor

Classification Tree and Naive Bayes Classifiers

In [54]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Entropy-based classification tree limited to depth 5.
# random_state is fixed so the fitted tree (and therefore the scores below) are
# reproducible: features are randomly permuted at each split, so ties between
# equally good splits would otherwise be broken differently from run to run.
model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5, random_state = 42).fit(X, y)

# In-sample check: the model is scored on the same data it was trained on,
# so this accuracy is an optimistic estimate.
predicted = model.predict(X)
accuracy = accuracy_score(y, predicted)  # documented argument order is (y_true, y_pred)
print('Accuracy: ', accuracy)

# Out-of-sample estimate: mean ROC AUC over 39 cross-validation folds
print('CV AUC with CT classifier: ', cross_val_score(model, X, y, cv=39,scoring='roc_auc').mean())

Accuracy:  0.683
CV AUC with CT classifier:  0.6808026096191776


In [55]:
from sklearn.naive_bayes import GaussianNB

# The model is fed a dense array, so convert the feature matrix once and reuse it
# (the original repeated the conversion for fit, predict and cross-validation).
X_dense = X.toarray()

# Build a Gaussian Naive Bayes classifier
NBmodel = GaussianNB()

# Model training
NBmodel.fit(X_dense, y)

# In-sample check: accuracy on the same data the model was trained on
predicted = NBmodel.predict(X_dense)
accuracy = accuracy_score(y, predicted)  # documented argument order is (y_true, y_pred)
print('Accuracy: ', accuracy)

# Evaluation using cross validation AUC (mean ROC AUC over 39 folds)
cv_auc = cross_val_score(NBmodel, X_dense, y, cv=39,scoring='roc_auc').mean()
print('CV AUC with NB classifier', cv_auc)

Accuracy:  0.8085
CV AUC with NB classifier 0.858550295857988
