In [17]:
import numpy as np
import pandas as pd
from collections import Counter
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
data = pd.read_csv('data.tsv', delimiter='\t')

In [3]:
data.head(10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come..."
7,10633_1,0,I watched this video at a friend's house. I'm ...
8,319_1,0,"A friend of mine bought this film for £1, and ..."
9,8713_10,1,<br /><br />This movie is full of references. ...


# Cleaning data

#### Remove html tag, stopword

In [24]:
def cleanHTML(text):
    return BeautifulSoup(text).get_text()

In [32]:
data['review'] = data['review'].apply(cleanHTML)

In [11]:
english_stopwords = set(stopwords.words('english'))

In [12]:
def removeStopword(text):
    res = []
    for word in text.split(' '):
        if word not in english_stopwords:
            res.append(word)
    return ' '.join(res).strip()

In [13]:
data['review'] = data['review'].apply(removeStopword)

In [20]:
def tokenize_porter(text):
    porter = PorterStemmer()
    res = []
    for word in text.split():
        res.append(porter.stem(word))
    return ' '.join(res).strip()

In [23]:
data['review'] = data['review'].apply(tokenize_porter)

# Machine learning

### Train - test split

In [25]:
from sklearn.model_selection import train_test_split

X = data['review']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### tf-idf

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

### training

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [29]:
model = Pipeline([('vectorize', tfidf), ('regression', LogisticRegression(random_state=0))])

In [30]:
model.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vectorize',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('regression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, interc

In [31]:
y_predict = model.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Now apply those above metrics to evaluate your model
accuracy_score(y_test, y_predict)
confusion_matrix(y_test, y_predict)
classification_report(y_test, y_predict).split('\n')

['              precision    recall  f1-score   support',
 '',
 '           0       0.90      0.86      0.87      2478',
 '           1       0.86      0.90      0.88      2522',
 '',
 '    accuracy                           0.88      5000',
 '   macro avg       0.88      0.88      0.88      5000',
 'weighted avg       0.88      0.88      0.88      5000',
 '']