In [170]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import nltk
import re
import string
import time
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score

In [171]:
data = pd.read_csv('data/IMDB Dataset.csv')

In [172]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [173]:
data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [174]:
review = data['review'][1]

In [175]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

In [176]:
def clean_text_tokenized(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove punctuation
    text = ''.join([word.lower() for word in text if word not in string.punctuation])
    # Tokenize
    tokens = re.split('\W+', text)
    # Stem
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

## Train/Test Split

In [177]:
#data = data[0::10]

In [178]:
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2)

## Inverse docuement frquency weighting (TF IDF)

In [179]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text_tokenized)

tfidf_vect_fit = tfidf_vect.fit(X_train)

X_train_vect = tfidf_vect_fit.transform(X_train)
X_test_vect = tfidf_vect_fit.transform(X_test)

## LinearSVC

In [180]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

linear_svc = CalibratedClassifierCV(LinearSVC(C=0.5, random_state=42))
linear_svc_model = linear_svc.fit(X_train_vect, y_train)

y_pred = linear_svc_model.predict(X_test_vect)

precision, recall, fscore, support = score(y_test, y_pred, pos_label='positive', average='binary')
accuracy = (y_pred==y_test).sum() / len(y_pred)

print('Fit time: {} / Predict time: {}'.format(round(fit_time, 3), round(pred_time, 3)))
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))

Fit time: 2.065 / Predict time: 0.446
Precision: 0.891 / Recall: 0.906 / Accuracy: 0.899


In [181]:
x = tfidf_vect.transform(['awesome movie, but somehwere I got lost'])
x = pd.DataFrame(x.toarray())

print(linear_svc_model.predict(x))
print(linear_svc_model.predict_proba(x)[0][1])

['positive']
0.8856260140187444


## Pipeline

In [182]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer=clean_text_tokenized)), 
                           ('model', CalibratedClassifierCV(LinearSVC(C=0.5, random_state=42)))])
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer=<function clean_text_tokenized at 0x0000015F1CDEFD90>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_acc...
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 CalibratedClassifierCV(base_estimator=LinearSVC(C=0.5,
                                                                 class_weight=None,
  

In [183]:
pipeline.predict_proba(['awesome movie, but somehwere I got lost'])

array([[0.11437399, 0.88562601]])

In [184]:
from joblib import dump
dump(pipeline, filename="svc_pipeline.joblib")

['svc_pipeline.joblib']

In [185]:
from joblib import load
pipe = load('svc_pipeline.joblib')

In [186]:
pred = pipe.predict_proba(['awesome movie, but somehwere I got lost'])[0][1]
pred

0.8856260140187446