In [1]:
import pandas as pd
import numpy as np
train_data = pd.read_csv('Data\\labeledTrainData.tsv',header=0,delimiter="\t", quoting=3)
test_data = pd.read_csv('Data\\testData.tsv',header=0,delimiter="\t", quoting=3)

In [2]:
X = train_data['review'].values
y = train_data['sentiment'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=101)

In [3]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lam.Huynh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import re

def preprocessor(text):
    """ Return a cleaned version of text
    """
    # Remove HTML markup
    text = re.sub('<[^>]*>', '', text)
    # Save emoticons for later appending
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove any non-word character and append the emoticons,
    # removing the nose character for standarization. Convert to lower case
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    
    return text

from nltk.stem import PorterStemmer

porter = PorterStemmer()

# write a function called `tokenizer()` that split a text into list of words
# Your code here
def tokenizer(text):
    res = text.split()
    return res

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words=stop,
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)

clf = Pipeline([('vect', tfidf),
                ('clf', LogisticRegression(random_state=0))])
clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function preproc...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [6]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

prediction = clf.predict(X_test)
accuracy_score = accuracy_score(y_test,prediction)
confusion_matrix = confusion_matrix(y_test,prediction)
classification_report = classification_report(y_test,prediction)
print(accuracy_score)
print(confusion_matrix)
print(classification_report)

0.8853333333333333
[[3293  503]
 [ 357 3347]]
             precision    recall  f1-score   support

          0       0.90      0.87      0.88      3796
          1       0.87      0.90      0.89      3704

avg / total       0.89      0.89      0.89      7500



In [7]:
reviews = test_data['review'].values
#preds = clf.predict_proba(reviews)
preds = clf.predict(reviews)

#for i in range(len(reviews)):
#    print(f'{test_data.id[i]} --> Negative, Positive  = {preds[i]}')

In [8]:
#import pickle
#import os
#
#pickle.dump(clf, open(os.path.join('data', 'MoviesReviews.pkl'), 'wb'), protocol=4)

In [12]:
test_data['sentiment'] = preds

Unnamed: 0,id,review,sentiment
0,"""12311_10""","""Naturally in a film who's main themes are of ...",1
1,"""8348_2""","""This movie is a disaster within a disaster fi...",0
2,"""5828_4""","""All in all, this is a movie for kids. We saw ...",1
3,"""7186_2""","""Afraid of the Dark left me with the impressio...",1
4,"""12128_7""","""A very accurate depiction of small time mob l...",1


In [10]:
#test_data.to_csv('Output\\Result.csv',index=False)

In [21]:
test_data['movie_id'] = test_data.id.str.split('_').str[0]
test_data['movie_id'].value_counts()

"1566     2
"4619     2
"3603     2
"1225     2
"3844     2
"8012     2
"4455     2
"2110     2
"3239     2
"1252     2
"8196     2
"5652     2
"2428     2
"10435    2
"11225    2
"736      2
"11254    2
"10810    2
"8853     2
"11371    2
"9709     2
"11953    2
"3722     2
"2412     2
"5591     2
"2765     2
"2836     2
"4590     2
"1538     2
"10056    2
         ..
"7645     2
"2312     2
"7489     2
"4306     2
"1741     2
"4122     2
"9493     2
"9561     2
"108      2
"12067    2
"12439    2
"11740    2
"7751     2
"2082     2
"8582     2
"7529     2
"2255     2
"608      2
"6338     2
"7448     2
"9893     2
"6799     2
"12033    2
"4684     2
"5918     2
"11068    2
"953      2
"7334     2
"7733     2
"2607     2
Name: movie_id, Length: 12500, dtype: int64