In [1]:
!pip install liac-arff




[notice] A new release of pip is available: 23.0 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import requests
import arff

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# get data from online
training_arff = requests.get('https://utexas.box.com/shared/static/uxrdafsfgdxwin16hw2e2x5f3x4frj8h.arff')
# read as arff file
movie_arff = arff.load(training_arff.text)

# get the attribute names
col_val = [attribute[0] for attribute in movie_arff['attributes']]

# create a pandas dataframe based on data and attribute names
movie_df = pd.DataFrame(movie_arff['data'], columns = col_val)

# Separate each of the reviews into a list
text_data = movie_df['text'].tolist()

# Create the CountVectorizer object
# CountVectorizer is used to convert a collection of text documents to a vector of term/token counts.
vectorizer = CountVectorizer(binary=True,max_features=1000)

# Fit the vectorizer to the text data
vectorizer.fit(text_data)

# Transform the text data to a bag of words representation
bag_of_words = vectorizer.transform(text_data)

# Print the bag of words representation
# Each row is a word, and each column is a review, if the word is in the review, the value is 1, otherwise 0
print(bag_of_words.toarray())
# Print an array of all the words
vectorizer.get_feature_names_out()

[[1 0 0 ... 1 0 1]
 [0 0 0 ... 1 0 1]
 [0 0 0 ... 1 1 0]
 ...
 [1 1 0 ... 1 1 1]
 [1 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 1]]


array(['10', 'ability', 'able', 'about', 'above', 'absolutely', 'across',
       'act', 'acting', 'action', 'actor', 'actors', 'actress', 'actual',
       'actually', 'add', 'after', 'again', 'against', 'age', 'agent',
       'ago', 'air', 'alien', 'all', 'almost', 'alone', 'along',
       'already', 'also', 'although', 'always', 'am', 'amazing',
       'america', 'american', 'among', 'amount', 'amusing', 'an', 'and',
       'annoying', 'another', 'any', 'anyone', 'anything', 'anyway',
       'apparently', 'appear', 'appearance', 'appears', 'are', 'aren',
       'around', 'art', 'as', 'ask', 'asks', 'aspect', 'at', 'atmosphere',
       'attempt', 'attempts', 'attention', 'audience', 'audiences',
       'away', 'awful', 'back', 'background', 'bad', 'based', 'basically',
       'battle', 'be', 'beautiful', 'because', 'become', 'becomes',
       'been', 'before', 'begin', 'beginning', 'begins', 'behind',
       'being', 'believable', 'believe', 'best', 'better', 'between',
       'beyond'

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# print(bag_of_words.toarray())
X = bag_of_words
y = movie_df.iloc[:,1]
model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5).fit(X, y)

# Calculate the AUC score using cross-validation
auc_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')

# Print the AUC scores for each fold of cross-validation
print("AUC score Classification Tree:", auc_scores.mean())

AUC score Classification Tree: 0.655905


In [5]:
# Need MultinomialNB and not GaussianNB because the data is not continuous
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(X, y)
auc_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print("AUC scores for MultinomialNB 5 fold of cross-validation:", auc_scores.mean())

AUC scores for MultinomialNB 5 fold of cross-validation: 0.8937200000000001


Naive Bayes with top 250 words using Chi2

In [6]:
# Naive Bayes with top 250 words using Chi2
from sklearn.feature_selection import SelectKBest, chi2
X_new = SelectKBest(chi2, k=250).fit_transform(X, y)

model = MultinomialNB(force_alpha=True).fit(X_new, y)
auc_scores = cross_val_score(model, X_new, y, cv=5, scoring='roc_auc')
print("AUC scores for MultinomialNB with top k words:", auc_scores.mean())

AUC scores for MultinomialNB with top k words: 0.921095


Naive Bayes with top 250 words using info gain

In [7]:
# Naive Bayes with top 250 words using info gain
from sklearn.feature_selection import SelectKBest, mutual_info_classif
X_new = SelectKBest(mutual_info_classif, k=250).fit_transform(X, y)

model = MultinomialNB(force_alpha=True).fit(X_new, y)
auc_scores = cross_val_score(model, X_new, y, cv=5, scoring='roc_auc')
print("AUC scores for MultinomialNB 5 fold of cross-validation:", auc_scores.mean())

AUC scores for MultinomialNB 5 fold of cross-validation: 0.92344


Naive bayes with wrapper method (can take long time)

Wrapping without specifying n_features_to_select

In [8]:
# Naive Bayes with top 250 words using info gain
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

folds=5
X_new = SelectKBest(mutual_info_classif, k=250).fit_transform(X, y)
model = MultinomialNB(force_alpha=True).fit(X_new, y)
# Calculate the AUC score using cross-validation
a=roc_auc_score(y, cross_val_predict(model, X, y,method='predict_proba', cv=folds)[:, 1])


# SFS is a wrapper method that uses a base model to find the best combination of features
from sklearn.feature_selection import SequentialFeatureSelector as SFS

#Create the wrapping model using MultinomialNB as the base model
#n_jobs is the number of cores to use, -1 means all cores
sfs = SFS(model,direction='forward',scoring='roc_auc',cv=folds,n_jobs=-1)

# Fit the model and find the best combination of features
sfs1 = sfs.fit(X, y)
# Change X to include only the selected features from the wrapping model 
X1=sfs1.transform(X)

# Fit a naive bayes model again but using only the selected features after wrapping
model = MultinomialNB(force_alpha=True).fit(X1, y)
# Calculate the AUC score using cross-validation
print(cross_val_score(model, X1, y, cv=folds,scoring='roc_auc').mean())
a1=roc_auc_score(y, cross_val_predict(model, X1, y,method='predict_proba', cv=folds)[:, 1])



0.9562900000000001


In [53]:
print('AUC score for MultinomialNB with top 250 features:', a)
print('AUC score for wrapping with MultinomialNB:', a1)

AUC score for MultinomialNB with top 250 features: 0.8936250000000001
AUC score for wrapping with MultinomialNB: 0.954351


In [55]:
# Print the names of the features selected by the wrapping model
vectorizer.get_feature_names_out()[sfs1.get_support()]

array(['10', 'ability', 'about', 'above', 'across', 'act', 'actor',
       'actual', 'after', 'against', 'ago', 'alien', 'all', 'also',
       'amazing', 'america', 'american', 'amount', 'an', 'and',
       'annoying', 'another', 'any', 'anyway', 'appear', 'appearance',
       'appears', 'are', 'around', 'as', 'at', 'attempt', 'attempts',
       'attention', 'audiences', 'away', 'awful', 'back', 'bad', 'based',
       'battle', 'be', 'because', 'become', 'becomes', 'before', 'begin',
       'beginning', 'being', 'believable', 'believe', 'bill', 'bit',
       'body', 'book', 'boring', 'box', 'boy', 'brilliant', 'bring',
       'brother', 'brought', 'by', 'came', 'camera', 'care', 'career',
       'casting', 'change', 'character', 'characters', 'cheap', 'cinema',
       'cinematic', 'city', 'class', 'clever', 'close', 'cold', 'come',
       'comic', 'coming', 'common', 'company', 'complete', 'could',
       'couldn', 'couple', 'credit', 'crew', 'crime', 'daughter', 'days',
       'de', '

Wrapping with n_features_to_select = 50

In [62]:
# Naive Bayes with top 250 words using info gain
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

folds=5
X_new = SelectKBest(mutual_info_classif, k=250).fit_transform(X, y)
model = MultinomialNB(force_alpha=True).fit(X_new, y)
# Calculate the AUC score using cross-validation
a=roc_auc_score(y, cross_val_predict(model, X, y,method='predict_proba', cv=folds)[:, 1])


# SFS is a wrapper method that uses a base model to find the best combination of features
from sklearn.feature_selection import SequentialFeatureSelector as SFS

#Create the wrapping model using MultinomialNB as the base model
#n_jobs is the number of cores to use, -1 means all cores
#we select the top 50 features
sfs = SFS(model,n_features_to_select=50,direction='forward',scoring='roc_auc',cv=folds,n_jobs=-1)

# Fit the model and find the best combination of features
sfs1 = sfs.fit(X, y)
# Change X to include only the selected features from the wrapping model 
X1=sfs1.transform(X)

# Fit a naive bayes model again but using only the selected features after wrapping
model = MultinomialNB(force_alpha=True).fit(X1, y)
# Calculate the AUC score using cross-validation
print(cross_val_score(model, X1, y, cv=folds,scoring='roc_auc').mean())
a1=roc_auc_score(y, cross_val_predict(model, X1, y,method='predict_proba', cv=folds)[:, 1])

0.9045


In [63]:
print('AUC score for MultinomialNB with top 250 features:', a)
print('AUC score for wrapping with MultinomialNB (n_features_to_select=50):', a1)

AUC score for MultinomialNB with top 250 features: 0.8936250000000001
AUC score for wrapping with MultinomialNB (n_features_to_select=50): 0.9034559999999999


In [64]:
# Print the names of the features selected by the wrapping model
vectorizer.get_feature_names_out()[sfs1.get_support()]

array(['10', 'america', 'attempt', 'awful', 'bad', 'boring', 'definitely',
       'delivers', 'dull', 'easily', 'especially', 'extremely', 'fails',
       'follow', 'great', 'have', 'hilarious', 'history', 'life', 'many',
       'material', 'maybe', 'memorable', 'mess', 'modern', 'none',
       'nothing', 'oscar', 'others', 'overall', 'perfect', 'perfectly',
       'performances', 'powerful', 'quite', 'ridiculous', 'script',
       'sequel', 'shows', 'sometimes', 'stupid', 'subtle', 'supposed',
       'today', 'tries', 'unfortunately', 'very', 'waste', 'wonderful',
       'worst'], dtype=object)

Wrapping with n_features_to_select = 'auto' and tol = 0.0005

In [69]:
# Naive Bayes with top 250 words using info gain
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

folds=5
X_new = SelectKBest(mutual_info_classif, k=250).fit_transform(X, y)
model = MultinomialNB(force_alpha=True).fit(X_new, y)
# Calculate the AUC score using cross-validation
a=roc_auc_score(y, cross_val_predict(model, X, y,method='predict_proba', cv=folds)[:, 1])


# SFS is a wrapper method that uses a base model to find the best combination of features
from sklearn.feature_selection import SequentialFeatureSelector as SFS

#Create the wrapping model using MultinomialNB as the base model
#n_jobs is the number of cores to use, -1 means all cores
#tol is the tolerance for the stopping criteria, we set it to 0.0005 here
sfs = SFS(model,n_features_to_select='auto',tol = 0.0005,direction='forward',scoring='roc_auc',cv=folds,n_jobs=-1)

# Fit the model and find the best combination of features
sfs1 = sfs.fit(X, y)
# Change X to include only the selected features from the wrapping model 
X1=sfs1.transform(X)

# Fit a naive bayes model again but using only the selected features after wrapping
model = MultinomialNB(force_alpha=True).fit(X1, y)
# Calculate the AUC score using cross-validation
print(cross_val_score(model, X1, y, cv=folds,scoring='roc_auc').mean())
a1=roc_auc_score(y, cross_val_predict(model, X1, y,method='predict_proba', cv=folds)[:, 1])

0.92978


In [70]:
print('AUC score for MultinomialNB with top 250 features:', a)
print('AUC score for wrapping with MultinomialNB (tol = 0.0005):', a1)

AUC score for MultinomialNB with top 250 features: 0.8936250000000001
AUC score for wrapping with MultinomialNB (tol = 0.0005): 0.9286179999999999


In [71]:
# Print the names of the features selected by the wrapping model
vectorizer.get_feature_names_out()[sfs1.get_support()]

array(['10', 'above', 'also', 'america', 'american', 'attempt',
       'attention', 'awful', 'bad', 'become', 'boring', 'change', 'cheap',
       'class', 'could', 'definitely', 'delivers', 'different', 'dull',
       'easily', 'ending', 'entertaining', 'especially', 'excellent',
       'expected', 'extremely', 'fails', 'female', 'flat', 'follow',
       'great', 'have', 'hilarious', 'history', 'hour', 'known', 'life',
       'many', 'material', 'may', 'maybe', 'memorable', 'mess', 'modern',
       'most', 'neither', 'none', 'nothing', 'only', 'oscar', 'others',
       'overall', 'perfect', 'perfectly', 'performances', 'poor',
       'potential', 'powerful', 'quite', 'reason', 'ridiculous', 'script',
       'sequel', 'should', 'shows', 'simple', 'sometimes', 'stupid',
       'subtle', 'supposed', 'today', 'tom', 'town', 'tries',
       'unfortunately', 'very', 'visual', 'waste', 'watching',
       'wonderful', 'worst', 'yet'], dtype=object)