In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('./word2vec-nlp-tutorial/labeledTrainData.tsv', header=0,
                   delimiter='\t', quoting=3)

In [3]:
split_index = train_test_split(data.index, test_size=0.20)
train = data.iloc[split_index[0],:]
validation = data.iloc[split_index[1],:]

In [4]:
print(train.shape)
print(validation.shape)

(20000, 3)
(5000, 3)


In [5]:
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [6]:
train.iloc[0,:].review

'"I gave it an 8 only because it had received such low votes... this is definitely really about a 5.5..... Ummm.. it was kind of bloody, had likeable, shallow characters, and it had some really hot babes in it. I like the eclectic killer, because he didn\'t kill people the same way everytime... that sometimes gets old."'

# Data cleaning and text preprocessing

In [7]:
#import nltk
#nltk.download() #May be done once per computer

In [8]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [9]:
def remove_html(raw_text):
    return BeautifulSoup(raw_text).get_text()


def remove_non_english_letters(text):
    return re.sub('[^A-Za-z]', ' ', text)


def convert_to_lower_case(text):
    return text.lower()


def split_text_into_individual_words(text):
    return text.split()


def remove_stopwords(words):
    def set_of_stopwords():
        return set(stopwords.words('english'))
    
    english_stopwords = set_of_stopwords()
    return [w for w in words if not w in english_stopwords]


def words_to_one_string(words):
    return ' '.join(words)


def clean_and_process_text(raw_text):
    text = remove_html(raw_text)
    only_letters = remove_non_english_letters(text)
    lower_case = convert_to_lower_case(only_letters)
    words = split_text_into_individual_words(lower_case)
    important_words = remove_stopwords(words)
    return words_to_one_string(important_words)

In [10]:
train['clean_review'] = train.review.apply(clean_and_process_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [11]:
train.head()

Unnamed: 0,id,sentiment,review,clean_review
5276,"""342_10""",1,"""I gave it an 8 only because it had received s...",gave received low votes definitely really ummm...
2224,"""1842_1""",0,"""This is my first movie review on IMDb. I was ...",first movie review imdb forced register watchi...
21359,"""2722_8""",1,"""This recreation of the infamous 1959 murders ...",recreation infamous murders kansas based capot...
10775,"""1515_1""",0,"""I watched the movie \""The Flock\"" because of ...",watched movie flock casting gere danes story s...
23244,"""7486_4""",0,"""In an apparent attempt to avoid remaking the ...",apparent attempt avoid remaking original movie...


# Create Features from Bag of Words

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
vectorizer = CountVectorizer(analyzer='word',
                            tokenizer=None,
                            preprocessor=None,
                            stop_words=None,
                            max_features=5000)
train_features = vectorizer.fit_transform(train['clean_review'])
train_features = pd.DataFrame(train_features.toarray())
train_features.columns = vectorizer.get_feature_names()

In [14]:
train_features.head()

Unnamed: 0,abandoned,abc,abilities,ability,able,abraham,absence,absent,absolute,absolutely,...,yet,york,young,younger,youth,zero,zizek,zombie,zombies,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
train_features.sum(axis=0)

abandoned        153
abc               99
abilities         90
ability          374
able            1022
abraham           69
absence           94
absent            65
absolute         284
absolutely      1202
absurd           249
absurdity         63
abuse            147
abusive           74
abysmal           79
academy          227
accent           402
accents          162
accept           237
acceptable       105
accepted         119
access            76
accident         252
accidentally     166
accompanied       66
accomplish        64
accomplished     107
according        240
account          149
accurate         225
                ... 
wretched          69
write            530
writer           926
writers          540
writes            74
writing         1036
written         1264
wrong           1459
wrote            459
ww                70
wwii             124
ya                78
yard              64
yeah             361
year            1911
years           3588
yelling      

# Train a random forest model

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import r2_score, confusion_matrix

In [17]:
forest = RandomForestClassifier(n_estimators=100)
forest_model = forest.fit(train_features, train['sentiment'])

In [18]:
svm_classifier = SVC()
svc_model = svm_classifier.fit(train_features, train['sentiment'])



In [22]:
mNB = MultinomialNB()
mNB_model = mNB.fit(train_features, train['sentiment'])

# Evaluate model

In [19]:
validation.shape

(5000, 3)

In [23]:
validation['clean_review'] = validation.review.apply(clean_and_process_text)
validation_features = vectorizer.transform(validation['clean_review'])
validation_features = pd.DataFrame(validation_features.toarray())
validation_features.columns = vectorizer.get_feature_names()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [24]:
validation_features.head()

Unnamed: 0,abandoned,abc,abilities,ability,able,abraham,absence,absent,absolute,absolutely,...,yet,york,young,younger,youth,zero,zizek,zombie,zombies,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [25]:
forest_result = forest_model.predict(validation_features)

In [26]:
svc_result = svc_model.predict(validation_features)

In [27]:
mNB_result = mNB_model.predict(validation_features)

In [28]:
for result in [forest_result, svc_result, mNB_result]:
    print(r2_score(validation.sentiment, result))
    print(confusion_matrix(validation.sentiment, result))

0.3751879036378146
[[2150  361]
 [ 420 2069]]
0.3455873305707201
[[1971  540]
 [ 278 2211]]
0.3959883063336108
[[2172  339]
 [ 416 2073]]
