# Jingyuan Zhou
# Assignment 2: Text Classification
# Implementation/Experimentation with Linear Text Classifiers

In [1]:
import pandas as pd
import numpy as np
import string
from scipy.sparse import lil_matrix, coo_matrix, csr_matrix

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

In [55]:
from collections import Counter

In [2]:
import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

In [3]:
train = pd.read_table('sst3.train', header = None)
train.columns = ['text', 'label']

dev = pd.read_table('sst3.dev', header = None)
dev.columns = ['text', 'label']

devtest = pd.read_table('sst3.devtest', header = None)
devtest.columns = ['text', 'label']

In [45]:
def process(t, mode = 1):
#     l = t.lower().translate(string.maketrans("",""), string.punctuation).strip().split()
#     temp = [WordNetLemmatizer().lemmatize(x) for x in l if x != '' and x not in stopwords]
    if mode == 1:
        l = t.strip().split()
    elif mode == 2:
        l = t.lower().translate(string.maketrans("",""), string.punctuation).strip().split()
        l = [WordNetLemmatizer().lemmatize(x) for x in l if x != '' and x not in stopwords]

    return l

def process_df(df, mode = 1):

    df['tokens'] = df['text'].apply(lambda x: process(x, mode), 1)
    df.reset_index(inplace = True)


## 1.1 Building a Linear Classifier

In [5]:
process_df(train)
process_df(dev)
process_df(devtest)

In [110]:
# at first, get all the features
def populate_feature(l, features):
#     i = l['index']
    words = l['tokens']
    label = l['label']
    for w in words:
        if (w, label) not in features:
            features.add((w, label))
               
def score(word, y, f_weight_dic):
    try:
        return f_weight_dic[(word, y)]
    except:
        return 0
    
def classify_p(tokens, f_weight_dic):
    scores = []
    for y in range(3):
        s = 0
        for word in tokens:
            s += score(word, y, f_weight_dic)
        scores.append(s)
    return np.argmax(scores)
    

def classify_h(tokens, label, f_weight_dic):
    scores = []
    for y in range(3):
        s = 0
        for word in tokens:
            s += score(word, y, f_weight_dic) 
        s += (y != label)
        scores.append(s)
    return np.argmax(scores)


def test(df, f_weight_dic):
    acc = 0
    for i in range(len(df)):
        line = df.iloc[i,:]
        label = line['label']
        tokens = line['tokens']
        acc += (label == classify_p(tokens, f_weight_dic))

    r = acc*100/float(len(df))
    return r

def ssd(loss, features, data = train, score = False):
    best_score_dev = 0
    best_score_devtest = 0
    f_weight_dic = dict.fromkeys(features, 0)

    for epoch in range(1, 21):
        for i in range(len(train)):
            line = train.iloc[i,:]
            label = line['label']
            tokens = set(line['tokens'])
            
            if loss == 'perceptron':
                res = classify_p(tokens, f_weight_dic)
            else:
                res = classify_h(tokens, label, f_weight_dic)            
            
            for word in tokens:
                if (word, label) in features:
                    if score:
                        s = feature_dic[(word, label)]
                    else:
                        s = 1
                    f_weight_dic[(word, label)] += 0.01*s
                if (word, res) in features:
                    if score:
                        s = feature_dic[(word, label)]
                    else:
                        s = 1                    
                    f_weight_dic[(word, res)] -= 0.01*s
                            
            if i%20000 == 0 and i!= 0:
                res = test(dev, f_weight_dic)
                if res > best_score_dev:
                    print 'Epoch %d position %d ' % (epoch, i)
                    best_score_dev = res
                    best_w = f_weight_dic
                    
                    res_test = test(devtest, f_weight_dic)
                    if res_test > best_score_devtest:
                        best_score_devtest = res_test
                    print 'Result on devtest: '+ format(res_test, '.2f')+'%'
        
        print '-'*40
        print 'End of epoch '+ str(epoch)
        print 'Result on dev: '
        res = test(dev, f_weight_dic)
        print format(res, '.2f')+'%'
        
        if res > best_score_dev:
            best_score_dev = res
            best_w = f_weight_dic
            res_test = test(devtest, f_weight_dic)
            if res_test > best_score_devtest:
                    best_score_devtest = res_test
            print 'Result on devtest: '+ format(res_test, '.2f')+'%'
        print '='*40
    print 'Highest accuracy on dev ' + format(best_score_dev, '.2f')+'%'
    print 'Highest accuracy on devtest ' + format(best_score_devtest, '.2f')+'%'
    return best_w

In [8]:
features = set()
discard = train.apply(lambda x: populate_feature(x, features), 1)
len(features)

36964

In [9]:
best_w = ssd('perceptron', features = features)

Epoch 1 position 20000 
Result on devtest: 41.56%
Epoch 1 position 40000 
Result on devtest: 42.47%
Epoch 1 position 60000 
Result on devtest: 46.28%
Epoch 1 position 100000 
Result on devtest: 49.18%
Epoch 1 position 120000 
Result on devtest: 56.62%
----------------------------------------
End of epoch 1
Result on dev: 
56.36%
----------------------------------------
End of epoch 2
Result on dev: 
56.55%
Epoch 3 position 120000 
Result on devtest: 54.63%
----------------------------------------
End of epoch 3
Result on dev: 
59.82%
----------------------------------------
End of epoch 4
Result on dev: 
55.64%
----------------------------------------
End of epoch 5
Result on dev: 
59.64%
----------------------------------------
End of epoch 6
Result on dev: 
58.00%
----------------------------------------
End of epoch 7
Result on dev: 
59.27%
----------------------------------------
End of epoch 8
Result on dev: 
58.73%
----------------------------------------
End of epoch 9
Result on

## 1.2 Hinge Loss

In [28]:
best_w_h = ssd('h', features = features)

Epoch 1 position 20000 
Result on devtest: 40.11%
Epoch 1 position 40000 
Result on devtest: 41.38%
Epoch 1 position 60000 
Result on devtest: 43.01%
Epoch 1 position 100000 
Result on devtest: 47.01%
Epoch 1 position 120000 
Result on devtest: 53.36%
----------------------------------------
End of epoch 1
Result on dev: 
51.45%
Epoch 2 position 120000 
Result on devtest: 56.62%
----------------------------------------
End of epoch 2
Result on dev: 
57.09%
Epoch 3 position 120000 
Result on devtest: 57.35%
----------------------------------------
End of epoch 3
Result on dev: 
61.27%
----------------------------------------
End of epoch 4
Result on dev: 
61.82%
Epoch 5 position 120000 
Result on devtest: 58.26%
----------------------------------------
End of epoch 5
Result on dev: 
62.73%
----------------------------------------
End of epoch 6
Result on dev: 
63.64%
----------------------------------------
End of epoch 7
Result on dev: 
64.36%
----------------------------------------
E

## 1.3 Feature Weight Analysis

In [29]:
d = {'words': [a[0] for a in best_w_h.keys()], 'label': [a[1] for a in best_w_h.keys()],\
     'weight': best_w_h.values()}
f_w_df = pd.DataFrame(d)

In [30]:
f_w_df[f_w_df['label'] == 0].sort_values(by = 'weight', ascending = False).head(10)

Unnamed: 0,label,weight,words
26393,0,1.88,listless
1696,0,1.86,disingenuous
20318,0,1.8,elsewhere
11594,0,1.79,pity
10845,0,1.78,Lacks
19193,0,1.73,lacking
36872,0,1.7,lacks
29457,0,1.7,hardly
1226,0,1.69,tiresome
10556,0,1.68,poorly


In [31]:
f_w_df[f_w_df['label'] == 1].sort_values(by = 'weight', ascending = False).head(10)

Unnamed: 0,label,weight,words
13842,1,0.81,Warren
23804,1,0.8,Pete
30880,1,0.8,1950s
14599,1,0.78,Troopers
19445,1,0.75,Bartlett
3922,1,0.75,anarchic
35099,1,0.71,Deuces
26950,1,0.7,versus
10281,1,0.7,stake
17081,1,0.69,'90s


In [32]:
f_w_df[f_w_df['label'] == 2].sort_values(by = 'weight', ascending = False).head(10)

Unnamed: 0,label,weight,words
35030,2,2.3,vividly
25658,2,2.06,pleasant
33766,2,1.87,wonderfully
15520,2,1.86,thought-provoking
8329,2,1.67,gorgeously
25967,2,1.67,treat
16584,2,1.59,touching
10349,2,1.59,miracle
6894,2,1.58,delicious
28741,2,1.58,moved


- We could see that in the top features with highest weight for label 0, there are words of different forms. E.g. 'Lacks', 'lacking','lacks'. They should have been treated as one word.

- I think both negative and positive words are pretty accurate. 

- Another interesting pattern is that both top positive and negative words are adjectives, and the top neutral words are mostly nouns.

## 1.4 Error Analysis 

In [36]:
pred = []
devtest_pred = devtest.copy()
for i in range(len(devtest)):
    line = devtest_pred.iloc[i,:]
    label = line['label']
    tokens = line['tokens']
    pred.append(classify_p(tokens, best_w_h))
devtest_pred['pred'] = pred
tmp = devtest_pred[devtest_pred['pred'] != devtest_pred['label']]

In [130]:
cats = ['Lacks sentimental adjectives', 'ambiguous sentiment - could be either neutral and positive',\
 'negation', 'incorrect gold standard annotation', 'Lacks sentimental adjectives', \
 'negation/challenging sentence structure', 'Lacks sentimental adjectives', 'metaphor/challenging sentence structure',\
 'ambiguous', 'challenging sentence structure/negative adj used to describe characters not the movie',\
 'challenging sentence structure/adj used to describe characters not the movie', 'incorrect gold standard annotation',\
 'challenging sentence structure', 'unseen word', 'challenging sentence structure/adj used to describe characters not the movie', \
 'ambiguous comment', 'challenging sentence structure/stunned meant more positive here', 'challenging sentence structure',\
 'challenging sentence structure/negative adjectives meant well', 'negation'
]

In [132]:
for i in range(20):
    t = tmp.iloc[i, 1]
    l = tmp.iloc[i, 2]
    p = tmp.iloc[i, 4]
#     cat = cats[i]
    
    print 'Text: '+t
    print 'Gold standard label: '+ str(l)
    print 'Prediction: '+ str(p)
#     print 'Category: '+cat
    print('\n')

Text: Though only 60 minutes long , the film is packed with information and impressions .
Gold standard label: 2
Prediction: 0


Text: There 's a solid woman - finding-herself story somewhere in here , but you 'd have to dig pretty deep to uncover it .
Gold standard label: 1
Prediction: 2


Text: A great ensemble cast ca n't lift this heartfelt enterprise out of the familiar .
Gold standard label: 0
Prediction: 2


Text: There 's just no currency in deriding James Bond for being a clichéd , doddering , misogynistic boy 's club .
Gold standard label: 1
Prediction: 0


Text: Remember the kind of movie we were hoping `` Ecks vs. Sever '' or `` xXx '' was going to be ?
Gold standard label: 1
Prediction: 0


Text: Inside the film 's conflict-powered plot there is a decent moral trying to get out , but it 's not that , it 's the tension that keeps you in your seat .
Gold standard label: 2
Prediction: 0


Text: Under 15 ?
Gold standard label: 1
Prediction: 2


Text: To say this was done bette

## 1.5 Feature Engineering

### 1.5.1 New feature template 1: lowercase words, remove punctuation and lemmatize text

In [46]:
train = pd.read_table('sst3.train', header = None)
train.columns = ['text', 'label']

dev = pd.read_table('sst3.dev', header = None)
dev.columns = ['text', 'label']

devtest = pd.read_table('sst3.devtest', header = None)
devtest.columns = ['text', 'label'] 


In [47]:
process_df(train, 2)
process_df(dev, 2)
process_df(devtest, 2)

In [49]:
features_1 = set()
discard = train.apply(lambda x: populate_feature(x, features_1), 1)
len(features_1)

30933

In [80]:
w_t1 = ssd('h', features = features_1)

Epoch 1 position 20000 
Result on devtest: 41.02%
Epoch 1 position 40000 
Result on devtest: 41.92%
Epoch 1 position 60000 
Result on devtest: 44.10%
Epoch 1 position 80000 
Result on devtest: 56.81%
----------------------------------------
End of epoch 1
Result on dev: 
62.55%
Epoch 2 position 80000 
Result on devtest: 58.62%
----------------------------------------
End of epoch 2
Result on dev: 
64.73%
Epoch 3 position 80000 
Result on devtest: 59.71%
----------------------------------------
End of epoch 3
Result on dev: 
65.45%
----------------------------------------
End of epoch 4
Result on dev: 
65.45%
Epoch 5 position 100000 
Result on devtest: 60.80%
----------------------------------------
End of epoch 5
Result on dev: 
65.82%
Epoch 6 position 100000 
Result on devtest: 60.98%
----------------------------------------
End of epoch 6
Result on dev: 
66.36%
----------------------------------------
End of epoch 7
Result on dev: 
66.73%
Result on devtest: 61.16%
-------------------

This feature template improves 1% accuracy on devtest at 1%.

### 1.5.1 New feature template 2: lowercase words, remove punctuation and lemmatize text + use (number of times the feature pair shows up)/(number of times the word shows up) as feature values (instead of 1 or 0)

In [81]:
def populate_feature_count(l, d):
#     i = l['index']
    words = l['tokens']
    label = l['label']
    for w in words:
        if (w, label) not in d:
            d[(w, label)] = 1
        else:
            d[(w, label)] += 1

In [102]:
def get_score(x):
    w = x['feature'][0]
    c = x['count']
    t = sum(fc_df[fc_df['feature'].apply(lambda x: x[0]== w)]['count'])
    return float(c)/t

In [82]:
feature_count_dic = {}
discard = train.apply(lambda x: populate_feature_count(x, feature_count_dic), 1)
len(feature_count_dic)

30933

In [85]:
fc_df = pd.DataFrame({'feature': feature_count_dic.keys(), 'count': feature_count_dic.values()})
fc_df['score'] = fc_df.apply(lambda x: get_score(x), 1)
feature_dic = dict(zip(fc_df['feature'], fc_df['score']))

In [112]:
w_t2 = ssd('h', features = features, score = True)

Epoch 1 position 20000 
Result on devtest: 42.11%
Epoch 1 position 40000 
Result on devtest: 41.92%
Epoch 1 position 60000 
Result on devtest: 43.56%
Epoch 1 position 80000 
Result on devtest: 56.08%
----------------------------------------
End of epoch 1
Result on dev: 
62.55%
Epoch 2 position 80000 
Result on devtest: 58.80%
Epoch 2 position 100000 
Result on devtest: 56.99%
----------------------------------------
End of epoch 2
Result on dev: 
64.73%
Result on devtest: 57.53%
Epoch 3 position 100000 
Result on devtest: 58.44%
Epoch 3 position 120000 
Result on devtest: 57.35%
----------------------------------------
End of epoch 3
Result on dev: 
65.09%
Epoch 4 position 120000 
Result on devtest: 57.53%
----------------------------------------
End of epoch 4
Result on dev: 
65.45%
Epoch 5 position 120000 
Result on devtest: 58.26%
----------------------------------------
End of epoch 5
Result on dev: 
65.82%
Result on devtest: 58.44%
----------------------------------------
End of 

This feature template has similar results as the original one.