In [1]:
#CSE, 2016118274 HyunWook, Hong
import pandas as pd
import os
import numpy as np

In [2]:
#1. Data Collection
path = './aclImdb/'
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()
for s in ('test','train'):
    for name in ('pos','neg'):
        subpath = '%s/%s' %(s,name)
        dirpath = path+subpath
        for file in os.listdir(dirpath):
            with open(os.path.join(dirpath, file), 'r') as f:
                txt = f.read()
            df = df.append([[txt, labels[name]]], ignore_index = True)

df.columns = ['review', 'sentiment']

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
#output
df.to_csv('movie_review.csv',index=False)



In [4]:
df = pd.read_csv('movie_review.csv')
df.head()


Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [5]:
df.tail()

Unnamed: 0,review,sentiment
49995,the people who came up with this are SICK AND ...,0
49996,"The script is so so laughable... this in turn,...",0
49997,"""So there's this bride, you see, and she gets ...",0
49998,Your mind will not be satisfied by this nobud...,0
49999,The chaser's war on everything is a weekly sho...,1


In [18]:
#2. Subjectivity Detection
import re

def preprocessor(text):
    text = re.sub('<[^>]*>','', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)|\^.?\^', text)
    text = re.sub('[\₩]+', ' ', text.lower()) + ' '.join(emoticons).replace('-','')

    return text

df = pd.read_csv('movie_review.csv')
print('start preprocessing')
df['review']= df['review'].apply(preprocessor)
print('complete')

df.to_csv('./refined_movie_review.csv',index=False)

start preprocessing
complete


In [6]:
df = pd.read_csv('refined_movie_review.csv')
df.head()

Unnamed: 0,review,sentiment
0,my family and i normally do not watch local mo...,1
1,"believe it or not, this was at one time the wo...",0
2,"after some internet surfing, i found the ""home...",0
3,one of the most unheralded great works of anim...,1
4,"it was the sixties, and anyone with long hair ...",0


In [7]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

porter = PorterStemmer()
stop = stopwords.words('english')

def tokenizer(text):
    return text.split()

def tokeinzer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [31]:
text = 'runners like running and thus they run'
print("Original text")
print(text)
print("blank split")
print(tokenizer(text))
print("stemming split" )
print(tokeinzer_porter(text))

Original text
runners like running and thus they run
blank split
['runners', 'like', 'running', 'and', 'thus', 'they', 'run']
stemming split
['runner', 'like', 'run', 'and', 'thu', 'they', 'run']


In [9]:
#Polarity Detection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import os
import pickle
from time import time

df = pd.read_csv('refined_movie_review.csv')

x_train = df.loc[:35000,'review'].values
y_train = df.loc[:35000,'sentiment'].values
x_test = df.loc[15000:,'review'].values
y_test = df.loc[15000:,'sentiment'].values

tfidf = TfidfVectorizer(lowercase=False,tokenizer=tokenizer)
#Pipeline : Run vect(tfidf), clf(logisticRegression) sequentially
lr_tfidf = Pipeline([('vect',tfidf), ('clf', LogisticRegression(C=10.0, penalty='l2',random_state=0))])

print("ML start")
#train
lr_tfidf.fit(x_train,y_train)
print("ML complete")

y_pred = lr_tfidf.predict(x_test)
print('Accuracy : %.3f' %accuracy_score(y_test, y_pred))

curDir = os.getcwd()
dest = os.path.join(curDir, 'data', 'pklObject')
if not os.path.exists(dest):
    os.makedirs(dest)

pickle.dump(lr_tfidf, open(os.path.join(dest,'classifier.pkl'), 'wb'), protocol=4)
print('pickle save')


ML start
ML complete
Accuracy : 0.954
pickle save


In [11]:
df = pd.read_csv('refined_movie_review.csv')

x_train = df.loc[:35000,'review'].values
y_train = df.loc[:35000,'sentiment'].values
x_test = df.loc[15000:,'review'].values
y_test = df.loc[15000:,'sentiment'].values

curDir = os.getcwd()
clf = pickle.load(open(os.path.join(curDir,'data','pklObject','classifier.pkl'),'rb'))

y_pred = clf.predict(x_test)
print('Test accuracy: %.3f' %accuracy_score(y_test, y_pred))

label = {0:'negative opinion', 1:'postivie opinion'}

while True:
    txt = input('write review in english')
    if txt =='':
        break
    example = [txt]
    print('writed txt : '+txt)
    print('predict : %s \nStatics: %.3f%%' %(label[clf.predict(example)[0]], np.max(clf.predict_proba(example))*100))
    print('\n')

Test accuracy: 0.954
writed txt : i love this movie
predict : postivie opinion 
Statics: 97.536%


writed txt : i like it
predict : postivie opinion 
Statics: 86.165%


writed txt : i hate this contents
predict : negative opinion 
Statics: 54.279%


writed txt : i love it but scenario is too bad
predict : negative opinion 
Statics: 85.417%


writed txt : hmm.. not good!
predict : postivie opinion 
Statics: 68.064%


writed txt : hmm.. not good
predict : postivie opinion 
Statics: 55.343%


writed txt : i will recommand this movie to my friends
predict : postivie opinion 
Statics: 79.363%


writed txt : thakn you sir
predict : postivie opinion 
Statics: 92.385%


