In [1]:
import sys
sys.executable

'/anaconda3/envs/kaggle_nlp/bin/python'

In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [70]:
data = pd.read_csv('./word2vec-nlp-tutorial/labeledTrainData.tsv', header=0,
                   delimiter='\t', quoting=3)

In [72]:
split_index = train_test_split(data.index, test_size=0.20)
train = data.iloc[split_index[0],:]
validation = data.iloc[split_index[1],:]

In [74]:
print(train.shape)
print(validation.shape)

(20000, 3)
(5000, 3)


In [75]:
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [88]:
train.iloc[0,:].review

'"This is a case of a bunch of people thinking they are so clever they have a story that fits the time. Remember the all-around political conspiracies caught on camera in the years leading to the Watergate and a little later? Most movies trying to cash in on made-for-TV \'o so powerful, o so mind numbing\' conspiracies were in fact caught in their own navel-gazing attitude.<br /><br />I was never a fan of The Conversation which I find as much dated as others conspiracy stints of the time but Coppola was true to his main character and Hackman was a pretty engaging actor to observe. I mean these conspiracy movies are mostly drowning in the character pool of noir heroes. Lots of questions unanswered, lots of dis-communication... Well this takes at least Bergman to build a movie about such un-visual bases.<br /><br />The Next Man is a perfect example of its time: one political soup served with an idealistic character and an horrendous conspiracy tightening its web around him. Neither part 

# Data cleaning and text preprocessing

In [None]:
#import nltk
#nltk.download() #May be done once per computer

In [7]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [30]:
def remove_html(raw_text):
    return BeautifulSoup(raw_text).get_text()


def remove_non_english_letters(text):
    return re.sub('[^A-Za-z]', ' ', text)


def convert_to_lower_case(text):
    return text.lower()


def split_text_into_individual_words(text):
    return text.split()


def remove_stopwords(words):
    def set_of_stopwords():
        return set(stopwords.words('english'))
    
    english_stopwords = set_of_stopwords()
    return [w for w in words if not w in english_stopwords]


def words_to_one_string(words):
    return ' '.join(words)


def clean_and_process_text(raw_text):
    text = remove_html(raw_text)
    only_letters = remove_non_english_letters(text)
    lower_case = convert_to_lower_case(only_letters)
    words = split_text_into_individual_words(lower_case)
    important_words = remove_stopwords(words)
    return words_to_one_string(important_words)

In [90]:
train['clean_review'] = train.review.apply(clean_and_process_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [91]:
train.head()

Unnamed: 0,id,sentiment,review,clean_review
8053,"""12275_2""",0,"""This is a case of a bunch of people thinking ...",case bunch people thinking clever story fits t...
3192,"""9288_8""",1,"""I found this to be a surprisingly light-hande...",found surprisingly light handed touch culture ...
3529,"""7469_10""",1,"""This film was just absolutly brilliant. It ac...",film absolutly brilliant actually made think w...
14853,"""8591_7""",1,"""This was a pretty decent movie. This movie is...",pretty decent movie movie good sit watch enter...
16228,"""9540_8""",1,"""Most of the feedback I've heard concerning Me...",feedback heard concerning meatball machine pre...


# Create Features from Bag of Words

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [92]:
vectorizer = CountVectorizer(analyzer='word',
                            tokenizer=None,
                            preprocessor=None,
                            stop_words=None,
                            max_features=5000)
train_features = vectorizer.fit_transform(train['clean_review'])
train_features = pd.DataFrame(train_features.toarray())
train_features.columns = vectorizer.get_feature_names()

In [93]:
train_features.head()

Unnamed: 0,abandoned,abc,abilities,ability,able,abraham,absence,absent,absolute,absolutely,...,york,young,younger,youngest,youth,zero,zizek,zombie,zombies,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [94]:
train_features.sum(axis=0)

abandoned        149
abc              105
abilities         87
ability          353
able            1026
abraham           64
absence           96
absent            65
absolute         280
absolutely      1208
absurd           250
abuse            152
abusive           70
abysmal           77
academy          246
accent           392
accents          170
accept           241
acceptable        96
accepted         119
access            78
accident         271
accidentally     156
acclaimed         63
accompanied       68
accomplished      94
according        227
account          141
accuracy          64
accurate         229
                ... 
writer           931
writers          533
writes            73
writing         1024
written         1329
wrong           1488
wrote            469
ww                74
wwe               65
wwii             129
ya                85
yeah             355
year            1867
years           3622
yelling           63
yellow            91
yes          

# Train a random forest model

In [106]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, confusion_matrix

In [95]:
forest = RandomForestClassifier(n_estimators=100)
model = forest.fit(train_features, train['sentiment'])

# Evaluate model

In [96]:
validation.shape

(5000, 3)

In [97]:
validation['clean_review'] = validation.review.apply(clean_and_process_text)
validation_features = vectorizer.transform(validation['clean_review'])
validation_features = pd.DataFrame(validation_features.toarray())
validation_features.columns = vectorizer.get_feature_names()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [98]:
validation_features.head()

Unnamed: 0,abandoned,abc,abilities,ability,able,abraham,absence,absent,absolute,absolutely,...,york,young,younger,youngest,youth,zero,zizek,zombie,zombies,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
result = model.predict(validation_features)

In [117]:
print(r2_score(validation.sentiment, result))
print(confusion_matrix(validation.sentiment, result))

0.40798853865810847
[[2152  337]
 [ 403 2108]]
