In [1]:
import pandas as pd

# (a) Load data

In [2]:
def process_line(line):
    sentence, score = line.split('\t')
    return [sentence.strip(), int(score.strip())]

def read_data(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    return map(process_line, lines)

In [3]:
amazon_data = pd.DataFrame(read_data('sentiment_labelled_sentences/amazon_cells_labelled.txt'), columns=['sentence', 'score'])
imdb_data = pd.DataFrame(read_data('sentiment_labelled_sentences/imdb_labelled.txt'), columns=['sentence', 'score'])
yelp_data = pd.DataFrame(read_data('sentiment_labelled_sentences/yelp_labelled.txt'), columns=['sentence', 'score'])

In [4]:
print amazon_data['score'].sum()
print imdb_data['score'].sum()
print yelp_data['score'].sum()

500
500
500


In [5]:
dataset = pd.concat([amazon_data, imdb_data, yelp_data], keys=['amazon', 'imdb', 'yelp'])

# (b) Preprocessing

In [6]:
import string
import contractions
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def preprocess(sentence):
    sentence = contractions.fix(sentence)  # expand contractions
    sentence = sentence.strip().lower().translate(None, string.punctuation)  # lowercase and strip punctuations
    sentence = re.sub(r'\d+', '', sentence)  # remove numbers
    stop_words = set(stopwords.words('english')) 
    words = word_tokenize(sentence)
    wnl = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')
    words = [wnl.lemmatize(stemmer.stem(word.strip().decode('utf-8'))) for word in words if word not in stop_words] # remove stopwords, stemming and lemmatization
    return ' '.join(words)

[nltk_data] Downloading package stopwords to /Users/lifei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lifei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/lifei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
dataset['sentence'] = map(preprocess, dataset['sentence'])

# (c) Split training and testing set

In [8]:
training_set = pd.concat([dataset.loc['amazon'].loc[dataset.loc['amazon']['score'] == 0].iloc[:400],
                          dataset.loc['amazon'].loc[dataset.loc['amazon']['score'] == 1].iloc[:400],
                          dataset.loc['imdb'].loc[dataset.loc['imdb']['score'] == 0].iloc[:400],
                          dataset.loc['imdb'].loc[dataset.loc['imdb']['score'] == 1].iloc[:400],
                          dataset.loc['yelp'].loc[dataset.loc['yelp']['score'] == 0].iloc[:400],
                          dataset.loc['yelp'].loc[dataset.loc['yelp']['score'] == 1].iloc[:400]])
testing_set = pd.concat([dataset.loc['amazon'].loc[dataset.loc['amazon']['score'] == 0].iloc[400:],
                          dataset.loc['amazon'].loc[dataset.loc['amazon']['score'] == 1].iloc[400:],
                          dataset.loc['imdb'].loc[dataset.loc['imdb']['score'] == 0].iloc[400:],
                          dataset.loc['imdb'].loc[dataset.loc['imdb']['score'] == 1].iloc[400:],
                          dataset.loc['yelp'].loc[dataset.loc['yelp']['score'] == 0].iloc[400:],
                          dataset.loc['yelp'].loc[dataset.loc['yelp']['score'] == 1].iloc[400:]])

# (d) Bag of Words model

In [9]:
def generate_BoW_dict(training_set):
    BoW_dict = {}
    tot = 0
    for index, row in training_set.iterrows():
        for word in row['sentence'].split(' '):
            if word not in BoW_dict:
                BoW_dict[word] = tot
                tot += 1
    print 'BoW_dict size: %d' % tot
    return BoW_dict

BoW_dict = generate_BoW_dict(training_set)

def generate_BoW_feature_vector(sentence):
    feature_vector = [0] * len(BoW_dict)
    for word in sentence.split(' '):
        if word in BoW_dict:
            feature_vector[BoW_dict[word]] += 1
    return feature_vector

training_set['feature_vector'] = training_set['sentence'].apply(generate_BoW_feature_vector)
testing_set['feature_vector'] = testing_set['sentence'].apply(generate_BoW_feature_vector)

BoW_dict size: 3504


In [10]:
print training_set.iloc[0]
print training_set.iloc[1]

sentence                               way plug u unless go convert
score                                                             0
feature_vector    [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: 0, dtype: object
sentence              tie charger convers last minutesmajor problem
score                                                             0
feature_vector    [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...
Name: 3, dtype: object


# (e) Postprocessing

In [11]:
import numpy as np
from numpy.linalg import norm
import math
from sklearn.preprocessing import StandardScaler

def log_norm(feature_vector):
    # log normalization
    return map(lambda x: math.log(x + 1.0), feature_vector)

def l1_norm(feature_vector):
    # l1 normalization
    vector_sum = sum(feature_vector)
    if vector_sum == 0:
        return map(float, feature_vector)
    return map(lambda x: x / float(vector_sum), feature_vector)

def l2_norm(feature_vector):
    # l2 normalization
    vector_l2_norm = norm(feature_vector)
    if vector_l2_norm == 0:
        return map(float, feature_vector)
    return map(lambda x: x / float(vector_l2_norm), feature_vector)

training_set['feature_vector'] = training_set['feature_vector'].apply(log_norm)
testing_set['feature_vector'] = testing_set['feature_vector'].apply(log_norm)

In [12]:
print training_set.iloc[0]
print training_set.iloc[1]

sentence                               way plug u unless go convert
score                                                             0
feature_vector    [0.69314718056, 0.69314718056, 0.69314718056, ...
Name: 0, dtype: object
sentence              tie charger convers last minutesmajor problem
score                                                             0
feature_vector    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.69314718056, ...
Name: 3, dtype: object


# (f) Sentiment prediction

## Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
X_train = np.array(training_set['feature_vector'].values.tolist())
y_train = training_set['score'].astype(int)
X_test = np.array(testing_set['feature_vector'].values.tolist())
y_test = testing_set['score'].astype(int)
clf_log = LogisticRegression()
clf_log.fit(X_train, y_train)
acc_log = clf_log.score(X_test, y_test)
print 'Logistic Regression ACC: %.4f' % acc_log

Logistic Regression ACC: 0.8150


In [15]:
def generate_words_importance(coef, BoW_dict):
    importance_dict = {}
    for word in BoW_dict:
        importance_dict[word] = coef[BoW_dict[word]]
    return importance_dict

def print_top_20(importance_dict):
    sorted_importance_dict = sorted(importance_dict.items(), key=lambda kv: kv[1])
    print 'Negative top 20:'
    print "\n".join(map(lambda x: ', '.join(map(str, x)), sorted_importance_dict[:20]))
    print ''
    print 'Positive top 20:'
    print "\n".join(map(lambda x: ', '.join(map(str, x)), sorted_importance_dict[-1:-21:-1]))

In [16]:
print_top_20(generate_words_importance(clf_log.coef_[0], BoW_dict))

Negative top 20:
bad, -2.940010177795963
poor, -2.5000662875487514
worst, -2.1129273163378746
terribl, -1.9187343968442017
wast, -1.8419838593632043
slow, -1.69722945329617
suck, -1.652177338313711
aw, -1.6272464373236337
disappoint, -1.6269823601582507
horribl, -1.4969783123327889
stupid, -1.4604472244941435
start, -1.4338083724143165
bland, -1.4196933573475197
fail, -1.3418801400134233
piec, -1.341395709404752
plot, -1.3377730434028574
rude, -1.3156337005568535
avoid, -1.2928377131075706
hear, -1.2855568225651768
hate, -1.2530753973876092

Positive top 20:
great, 3.728217853875945
love, 3.1307304894746175
excel, 2.543477888360755
delici, 2.3579199440102383
nice, 2.2610563814106817
amaz, 2.1328363185656607
fantast, 2.0145059778780205
beauti, 1.9685936229207004
awesom, 1.9095860263657145
best, 1.887332235856134
good, 1.8841164784088518
perfect, 1.7982309244889034
comfort, 1.7010272166626892
wonder, 1.5434490656079216
well, 1.5178364156388924
happi, 1.4414300528592643
incred, 1.42822049

## Naive Bayes

In [17]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

In [18]:
gnb = GaussianNB()
bnb = BernoulliNB()
gnb.fit(X_train, y_train)
bnb.fit(X_train, y_train)
acc_gnb = gnb.score(X_test, y_test)
acc_bnb = bnb.score(X_test, y_test)
print 'Gaussian Naive Bayes ACC: %.4f' % acc_gnb
print 'Bernoulli Naive Bayes ACC: %.4f' % acc_bnb

Gaussian Naive Bayes ACC: 0.6317
Bernoulli Naive Bayes ACC: 0.8050


# (g) N-gram model

In [19]:
def generate_BoW_dict_2_gram(training_set):
    BoW_dict_2_gram = {}
    tot = 0
    for index, row in training_set.iterrows():
        words = row['sentence'].split(' ')
        for i in range(len(words) - 1):
            gram = words[i] + ' ' + words[i + 1]
            if gram not in BoW_dict_2_gram:
                BoW_dict_2_gram[gram] = tot
                tot += 1
    print 'BoW_dict_2_gram size: %d' % tot
    return BoW_dict_2_gram

BoW_dict_2_gram = generate_BoW_dict_2_gram(training_set)

def generate_BoW_2_gram_feature_vector(sentence):
    feature_vector = [0] * len(BoW_dict_2_gram)
    words = sentence.split(' ')
    for i in range(len(words) - 1):
        gram = words[i] + ' ' + words[i + 1]
        if gram in BoW_dict_2_gram:
            feature_vector[BoW_dict_2_gram[gram]] += 1
    return feature_vector

training_set['2_gram_feature_vector'] = training_set['sentence'].apply(generate_BoW_2_gram_feature_vector)
testing_set['2_gram_feature_vector'] = testing_set['sentence'].apply(generate_BoW_2_gram_feature_vector)

BoW_dict_2_gram size: 10789


In [20]:
print training_set.iloc[0]
print training_set.iloc[1]

sentence                                      way plug u unless go convert
score                                                                    0
feature_vector           [0.69314718056, 0.69314718056, 0.69314718056, ...
2_gram_feature_vector    [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: 0, dtype: object
sentence                     tie charger convers last minutesmajor problem
score                                                                    0
feature_vector           [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.69314718056, ...
2_gram_feature_vector    [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
Name: 3, dtype: object


In [21]:
training_set['2_gram_feature_vector'] = training_set['2_gram_feature_vector'].apply(log_norm)
testing_set['2_gram_feature_vector'] = testing_set['2_gram_feature_vector'].apply(log_norm)

In [22]:
print training_set.iloc[0]
print training_set.iloc[1]

sentence                                      way plug u unless go convert
score                                                                    0
feature_vector           [0.69314718056, 0.69314718056, 0.69314718056, ...
2_gram_feature_vector    [0.69314718056, 0.69314718056, 0.69314718056, ...
Name: 0, dtype: object
sentence                     tie charger convers last minutesmajor problem
score                                                                    0
feature_vector           [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.69314718056, ...
2_gram_feature_vector    [0.0, 0.0, 0.0, 0.0, 0.0, 0.69314718056, 0.693...
Name: 3, dtype: object


In [23]:
X_train_2_gram = np.array(training_set['2_gram_feature_vector'].values.tolist())
y_train_2_gram = training_set['score'].astype(int)
X_test_2_gram = np.array(testing_set['2_gram_feature_vector'].values.tolist())
y_test_2_gram = testing_set['score'].astype(int)
clf_log = LogisticRegression()
clf_log.fit(X_train_2_gram, y_train_2_gram)
acc_log = clf_log.score(X_test_2_gram, y_test_2_gram)
print 'Logistic Regression ACC: %.4f' % acc_log

Logistic Regression ACC: 0.6433


In [24]:
print_top_20(generate_words_importance(clf_log.coef_[0], BoW_dict_2_gram))

Negative top 20:
wast time, -1.6266337663157204
wast money, -1.2108755181402313
custom servic, -0.885446698216792
poor qualiti, -0.8081575997816369
stay away, -0.80332538188543
piec junk, -0.7777574898399167
worst ever, -0.7631486261640384
bad film, -0.7557103236968924
realli bad, -0.7313689428304294
wait wait, -0.7110396622789038
good way, -0.7076382278339408
make mistak, -0.7071255012510449
buy product, -0.7048544951320117
ever go, -0.6881550537396549
zero star, -0.6663238958811999
act bad, -0.6662888020728832
go back, -0.6625560941137096
anytim soon, -0.6614309848132359
look good, -0.6601001979350706
send back, -0.6530665613641474

Positive top 20:
work great, 2.054405466821827
high recommend, 1.7524930364450328
one best, 1.4582411255582608
great phone, 1.289089164994326
great product, 1.1872160667930691
food good, 1.0734850744325828
realli good, 1.0679433092586532
easi use, 1.009555329850475
great food, 0.9843961434373667
reason price, 0.9029096380166195
food delici, 0.899918138855

In [25]:
gnb = GaussianNB()
bnb = BernoulliNB()
gnb.fit(X_train_2_gram, y_train_2_gram)
bnb.fit(X_train_2_gram, y_train_2_gram)
acc_gnb = gnb.score(X_test_2_gram, y_test_2_gram)
acc_bnb = bnb.score(X_test_2_gram, y_test_2_gram)
print 'Gaussian Naive Bayes ACC: %.4f' % acc_gnb
print 'Bernoulli Naive Bayes ACC: %.4f' % acc_bnb

Gaussian Naive Bayes ACC: 0.6300
Bernoulli Naive Bayes ACC: 0.6400


# (h) PCA

In [26]:
# reference: https://en.wikipedia.org/wiki/Principal_component_analysis#Computing_PCA_using_the_covariance_method

def pca(X, d):
    # X is a matrix (n examples * p features), reduce to d = [d1, d2, ...] dimensions
    X = np.array(X)
    n, p = X.shape
    
    # Calculate emperical mean
    e_mean = X.mean(axis=0)
    
    # Subtract emperical mean
    B = X - e_mean
    
    # Calculate covariance matrix
    C = B.T.dot(B) / (n - 1)
    
    # Find eigenvectors and eigenvalues of C
    e_values, e_vectors = np.linalg.eig(C)
    
    # Rearrange the eigenvectors and eigenvalues
    idx = np.argsort(e_values)[::-1]
    e_values = e_values[idx]
    e_vectors = e_vectors[:, idx]
    
    return map(lambda dd: (B.dot(e_vectors[:, :dd]), e_mean, e_values[:dd], e_vectors[:, :dd]), d)

## 1-gram

In [27]:
X_train_pca_results = pca(X_train, [10, 50, 100])

In [28]:
X_train_pca = map(lambda x: x[0].astype(float), X_train_pca_results)
X_test_pca = map(lambda x: (X_test - x[1]).dot(x[3]).astype(float), X_train_pca_results)

  """Entry point for launching an IPython kernel.
  


### Logistic Regression

In [29]:
def LR(X_train, X_test, y_train, y_test):
    clf_log = LogisticRegression()
    clf_log.fit(X_train, y_train)
    acc_log = clf_log.score(X_test, y_test)
    print 'Logistic Regression ACC: %.4f' % acc_log
    return acc_log

import functools
acc_log_10, acc_log_50, acc_log_100 = map(functools.partial(LR, y_train=y_train, y_test=y_test), X_train_pca, X_test_pca)

Logistic Regression ACC: 0.5867
Logistic Regression ACC: 0.6983
Logistic Regression ACC: 0.7333


### Naive Bayes

In [30]:
def NB(X_train, X_test, y_train, y_test):
    gnb = GaussianNB()
    bnb = BernoulliNB()
    gnb.fit(X_train, y_train)
    bnb.fit(X_train, y_train)
    acc_gnb = gnb.score(X_test, y_test)
    acc_bnb = bnb.score(X_test, y_test)
    print 'Gaussian Naive Bayes ACC: %.4f' % acc_gnb
    print 'Bernoulli Naive Bayes ACC: %.4f' % acc_bnb
    return acc_gnb, acc_bnb

import functools
acc_NB_10, acc_NB_50, acc_NB_100 = map(functools.partial(NB, y_train=y_train, y_test=y_test), X_train_pca, X_test_pca)

Gaussian Naive Bayes ACC: 0.5867
Bernoulli Naive Bayes ACC: 0.5650
Gaussian Naive Bayes ACC: 0.6300
Bernoulli Naive Bayes ACC: 0.6233
Gaussian Naive Bayes ACC: 0.6383
Bernoulli Naive Bayes ACC: 0.6650


## 2-gram

In [31]:
X_train_2_gram_pca_results = pca(X_train_2_gram, [10, 50, 100])

In [32]:
X_train_2_gram_pca = map(lambda x: x[0].astype(float), X_train_2_gram_pca_results)
X_test_2_gram_pca = map(lambda x: (X_test_2_gram - x[1]).dot(x[3]).astype(float), X_train_2_gram_pca_results)

  """Entry point for launching an IPython kernel.
  


### Logistic Regression

In [33]:
acc_log_10, acc_log_50, acc_log_100 = map(functools.partial(LR, y_train=y_train_2_gram, y_test=y_test_2_gram), X_train_2_gram_pca, X_test_2_gram_pca)

Logistic Regression ACC: 0.5050
Logistic Regression ACC: 0.5267
Logistic Regression ACC: 0.5367


### Naive Bayes

In [34]:
acc_NB_10, acc_NB_50, acc_NB_100 = map(functools.partial(NB, y_train=y_train_2_gram, y_test=y_test_2_gram), X_train_2_gram_pca, X_test_2_gram_pca)

Gaussian Naive Bayes ACC: 0.4967
Bernoulli Naive Bayes ACC: 0.5033
Gaussian Naive Bayes ACC: 0.5067
Bernoulli Naive Bayes ACC: 0.5150
Gaussian Naive Bayes ACC: 0.5133
Bernoulli Naive Bayes ACC: 0.5283
