In [2]:
import pandas as pd

df = pd.read_csv(r"/Users/mattmeredith/Downloads/IMDB Dataset.csv")


In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df_positive = df[df['sentiment']=='positive'][:9000]
df_negative = df[df['sentiment']=='negative'][:1000]
df_imb = pd.concat([df_positive, df_negative])

df_imb.value_counts(['sentiment'])

sentiment
positive     9000
negative     1000
dtype: int64

In [5]:
#rus = RandomUnderSampler(random_state=0)
#df_bal, df_bal['sentiment'] = rus.fit_resample(df_imb[['review']], df_imb['sentiment'])
#df_bal

# option 2
length_negative = len(df_imb[df_imb['sentiment']=='negative'])
df_review_positive = df_imb[df_imb['sentiment']=='positive'].sample(n=length_negative)
df_review_non_positive = df_imb[~(df_imb['sentiment']=='positive')]

df_bal = pd.concat([
     df_review_positive, df_review_non_positive
 ])

df_bal.reset_index(drop=True, inplace=True)
df_bal['sentiment'].value_counts()

positive    1000
negative    1000
Name: sentiment, dtype: int64

In [6]:
df_bal.shape

(2000, 2)

In [7]:
df_bal.iloc[999]

review       This film stands head and shoulders above the ...
sentiment                                             positive
Name: 999, dtype: object

In [8]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_bal, test_size=0.33, random_state=42)
# Splits dataframe randomly into seperate train set and test set

# Below we isolate x and y as input and output to later train algorithms
train_x, train_y = train['review'], train['sentiment']
test_x, test_y = test['review'], test['sentiment']

In [9]:
# Term frequency TF-IDF, turns text data into numerical vectors
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# applies a weight to each proper word in proportion to its use across dataframe
tfidf = TfidfVectorizer(stop_words='english')
train_x_vector = tfidf.fit_transform(train_x)
train_x_vector
# output => 1340 instances containing 20011 discretely weighted words

<1340x20167 sparse matrix of type '<class 'numpy.float64'>'
	with 115626 stored elements in Compressed Sparse Row format>

In [None]:
test_x_vector = tfidf.transform(test_x)
test_x_vector

# Algorithms

In [12]:
# Support Vector Machines 
from sklearn.svm import SVC 
svc = SVC(kernel='linear')
svc.fit(train_x_vector, train_y) 
# takes input vectors and output as arguments and predicts output with .predict method

print(svc.predict(tfidf.transform(['A great watch overall'])))
print(svc.predict(tfidf.transform(['What an appalling film'])))
print(svc.predict(tfidf.transform(['A bunch of nonsense - do not recommend'])))
print(svc.predict(tfidf.transform(['I absolutely loved it!'])))

['positive']
['negative']
['negative']
['positive']


In [13]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

dectree = DecisionTreeClassifier()
dectree.fit(train_x_vector, train_y)

In [14]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_x_vector.toarray(), train_y)

In [15]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(train_x_vector, train_y)

# Evaluation

In [22]:
# Mean accuracy of each algorithm with .score()
print(svc.score(test_x_vector, test_y))
print(dectree.score(test_x_vector, test_y))
print(gnb.score(test_x_vector.toarray(), test_y))
print(log_reg.score(test_x_vector, test_y))

0.8272727272727273
0.6712121212121213
0.6212121212121212
0.8181818181818182


In [29]:
from sklearn.metrics import f1_score

f1_score(test_y, svc.predict(test_x_vector),
         labels=['positive', 'negative'], 
         average=None)

# f1_score(y_true, y_pred, average=None)

array([0.82934132, 0.82515337])

In [31]:
from sklearn.metrics import classification_report

print(classification_report(test_y,
                           svc.predict(test_x_vector),
                           labels=['positive', 'negative']))

              precision    recall  f1-score   support

    positive       0.81      0.85      0.83       325
    negative       0.85      0.80      0.83       335

    accuracy                           0.83       660
   macro avg       0.83      0.83      0.83       660
weighted avg       0.83      0.83      0.83       660



In [33]:
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(test_y,
                           svc.predict(test_x_vector),
                           labels=['positive', 'negative'])
conf_mat

array([[277,  48],
       [ 66, 269]])