In [16]:
import numpy as np
import pandas as pd
import nltk
import sklearn

In [17]:
# Report the version of each core library this notebook depends on.
for module in (np, pd, nltk, sklearn):
    print(module.__version__)

1.13.1
0.20.3
3.2.3
0.18.1


In [3]:
# Directory holding the input files; the loading cell appends
# '/training.txt' and '/testdata.txt' to this path.
DATASET_PATH = './dataset'

In [7]:
# Both files are headerless and tab-separated; quoting=3 (csv.QUOTE_NONE)
# makes the parser treat quote characters as ordinary text.
read_opts = dict(header=None, delimiter="\t", quoting=3)
train_path = DATASET_PATH + '/training.txt'
test_path = DATASET_PATH + '/testdata.txt'

# Labelled training data: a sentiment label followed by the sentence.
train_data_df = pd.read_csv(train_path, **read_opts)
train_data_df.columns = ["Sentiment", "Text"]

# Unlabelled test data: the sentence only.
test_data_df = pd.read_csv(test_path, **read_opts)
test_data_df.columns = ["Text"]

In [8]:
# Preview the first rows of the labelled training data.
train_data_df.head()

Unnamed: 0,Sentiment,Text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [9]:
# (rows, columns) of the training and test frames, displayed as a tuple.
train_data_df.shape, test_data_df.shape

((7086, 2), (33052, 1))

In [10]:
# Class balance: number of training rows for each Sentiment label.
train_data_df.Sentiment.value_counts()

1    3995
0    3091
Name: Sentiment, dtype: int64

Calculate the average number of words per sentence.

In [12]:
# Word count of every training sentence (split on single spaces), then the mean.
words_per_sentence = [len(sentence.split(" ")) for sentence in train_data_df.Text]
np.mean(words_per_sentence)

10.886819079875812

## Preparing a corpus

In [15]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

In [21]:
#######
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
# Shared Porter stemmer instance; tokenize() passes it to stem_tokens().
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return a list with every item of ``tokens`` passed through ``stemmer.stem``.

    tokens  -- iterable of tokens, processed in order
    stemmer -- any object exposing a ``stem(token)`` method
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Turn raw text into a list of stemmed, letters-only tokens.

    Every character outside a-z / A-Z is replaced by a space, so digits,
    punctuation and apostrophes all act as separators. The cleaned text is
    split with ``nltk.word_tokenize`` and each token is stemmed with the
    module-level ``stemmer``.
    """
    # Keep ASCII letters only; everything else becomes a separator.
    letters_only = re.sub("[^a-zA-Z]", " ", text)

    # Split into words, then reduce each word to its stem.
    words = nltk.word_tokenize(letters_only)
    return stem_tokens(words, stemmer)

In [22]:
# Bag-of-words vectorizer: text is lower-cased, split by the custom
# tokenize() function (which also stems), English stop words are dropped,
# and the vocabulary is capped at 85 terms via max_features.
# NOTE(review): the stop-word list is unstemmed while the tokens are stemmed,
# so stems such as 'thi' and 'wa' still show up in the printed vocabulary —
# confirm whether that is intended.
vectorizer = CountVectorizer(analyzer='word',
                             tokenizer=tokenize,
                             lowercase=True,
                             stop_words="english",
                             max_features=85)

In [23]:
# Learn the vocabulary from the training and test sentences together and
# encode them in one matrix: training rows first, test rows after.
all_texts = train_data_df.Text.tolist() + test_data_df.Text.tolist()
corpus_data_features = vectorizer.fit_transform(all_texts)

In [25]:
# Convert the vectorizer output to an array and display its
# (documents, vocabulary terms) shape.
corpus_data_features_nd = corpus_data_features.toarray()
corpus_data_features_nd.shape

(40138, 85)

The words in the vocabulary:

In [24]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so this cell runs on old and new
# versions. tolist() keeps `vocab` a plain Python list in either case.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(vocab)

['aaa', 'amaz', 'angelina', 'awesom', 'beauti', 'becaus', 'boston', 'brokeback', 'citi', 'code', 'cool', 'cruis', 'd', 'da', 'drive', 'francisco', 'friend', 'fuck', 'geico', 'good', 'got', 'great', 'ha', 'harri', 'harvard', 'hate', 'hi', 'hilton', 'honda', 'imposs', 'joli', 'just', 'know', 'laker', 'left', 'like', 'littl', 'london', 'look', 'lot', 'love', 'm', 'macbook', 'make', 'miss', 'mission', 'mit', 'mountain', 'movi', 'need', 'new', 'oh', 'onli', 'pari', 'peopl', 'person', 'potter', 'purdu', 'realli', 'right', 'rock', 's', 'said', 'san', 'say', 'seattl', 'shanghai', 'stori', 'stupid', 'suck', 't', 'thi', 'thing', 'think', 'time', 'tom', 'toyota', 'ucla', 've', 'vinci', 'wa', 'want', 'way', 'whi', 'work']


Print the count of each word in the vocabulary:

In [27]:
# Total occurrences of each vocabulary term across the whole corpus
# (column-wise sum of the document-term matrix).
dist = corpus_data_features_nd.sum(axis=0)

for term, total in zip(vocab, dist):
    print(total, term)

1179 aaa
485 amaz
1765 angelina
3170 awesom
2146 beauti
1694 becaus
2190 boston
2000 brokeback
423 citi
2003 code
481 cool
2031 cruis
439 d
2087 da
433 drive
1926 francisco
477 friend
452 fuck
1085 geico
773 good
571 got
1178 great
776 ha
2094 harri
2103 harvard
4492 hate
794 hi
2086 hilton
2192 honda
1098 imposs
1764 joli
1054 just
896 know
2019 laker
425 left
4080 like
507 littl
2233 london
811 look
421 lot
10334 love
1568 m
1059 macbook
631 make
1098 miss
1101 mission
1340 mit
2081 mountain
1207 movi
1220 need
459 new
551 oh
674 onli
2094 pari
1018 peopl
454 person
2093 potter
1167 purdu
2126 realli
661 right
475 rock
3914 s
495 said
2038 san
627 say
2019 seattl
1189 shanghai
467 stori
2886 stupid
4614 suck
1455 t
1705 thi
662 thing
1524 think
781 time
2117 tom
2028 toyota
2008 ucla
774 ve
2001 vinci
3703 wa
1656 want
932 way
547 whi
512 work


## A bag-of-words linear classifier

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# The first len(train_data_df) rows of the feature matrix are the labelled
# sentences. Keep 85% of them for fitting and hold out the rest for
# evaluation; the fixed random_state makes the split repeatable.
labelled_features = corpus_data_features_nd[:len(train_data_df)]
X_train, X_test, y_train, y_test = train_test_split(
    labelled_features,
    train_data_df.Sentiment,
    train_size=0.85,
    random_state=1234,
)

In [31]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings on the training split.
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
# Predict sentiment labels for the held-out split.
y_pred = log_model.predict(X_test)

In [33]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 of the predictions on the held-out split.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.98      0.99      0.98       467
          1       0.99      0.98      0.99       596

avg / total       0.98      0.98      0.98      1063



Finally, we can re-train our model with all the training data and use it for sentiment classification.

In [37]:
# Re-train on every labelled row, then label the unlabelled test sentences.
# Rows [0, n_train) of the feature matrix are the training texts; the
# remaining rows are the test texts, in the order they were vectorized.
n_train = len(train_data_df)

# Start from a fresh estimator instead of refitting the one that was trained
# on the 85% split.
log_model = LogisticRegression()
log_model.fit(X=corpus_data_features_nd[:n_train], y=train_data_df.Sentiment)

test_pred = log_model.predict(corpus_data_features_nd[n_train:])

# Show 15 randomly chosen predictions. A dedicated seeded generator keeps the
# sample reproducible on re-runs without touching the global random state.
import random
spl = random.Random(1234).sample(range(len(test_pred)), 15)

# .iloc makes the lookup explicitly positional, matching how test_pred is indexed.
for text, sentiment in zip(test_data_df.Text.iloc[spl], test_pred[spl]):
    print(sentiment, text)

1 I love MIT so much...
0 And I'll never get whad she's saying with her stupid Shanghai accent! >:(
1 Previously, I have installed Windoze just so I can show my PC faithful friends how awesome MacBook Pro is.
1 I WANT MIT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!..
1 i love our trips out to london pissed out of our faces on shitty booze...
0 Boston SUCKS.
0 Besides, we need at least one ex-boxer in Boston...
1 I liked Tom Cruise until he dumped Nicole Kidman.
0 i hate the Lakers too but this isn't a basketball blog, so i won't go into it).
0 I understand that she'll be taking off to work for Southwest Airlines, which is fantastic too.
0 Well apparently all people driving toyota 4runners are appalingly ugly, or so one would think based on him not even glancing in my direction!
0 I HATE LONDON!..
1 angelina jolie is so beautiful that i don't even have the desire to attain such exquisite beauty..
0 we have a boring as shit blue 2005 toyota carolla.
1 I'm loving Shanghai > > > ^ _ ^.
