In [364]:
import os
import re
from string import punctuation
import numpy as np
import pandas as pd
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression

In [373]:
pos_dir = 'train/pos'
neg_dir = 'train/neg'
test_dir = 'test'

# Keep only names that END in '.txt' (a substring test would also accept
# e.g. 'x.txt.bak'), and sort them: os.listdir() returns entries in arbitrary
# order, while the per-file lists built later are aligned to these lists by
# position.
pos_fs = sorted(name for name in os.listdir(pos_dir) if name.endswith('.txt'))
neg_fs = sorted(name for name in os.listdir(neg_dir) if name.endswith('.txt'))
test_fs = sorted(name for name in os.listdir(test_dir) if name.endswith('.txt'))

pos_n = len(pos_fs)
neg_n = len(neg_fs)
test_n = len(test_fs)
print(pos_n, neg_n, test_n)

12500 12500 11000


In [308]:
trainX, trainY = None, None
max_gram = 3

# Fetch rates: the rating is the integer between the last '_' and '.txt' in
# each file name. The dot is escaped (raw string) so it only matches a
# literal '.', not any character.
rate_pattern = re.compile(r'_([0-9]+)\.txt')
rates_pos = np.array([int(rate_pattern.search(name).group(1)) for name in pos_fs])
rates_neg = np.array([int(rate_pattern.search(name).group(1)) for name in neg_fs])

# Stats: one container per n-gram order (index 0 -> 1-grams, ..., max_gram-1).
ngram_pos = [[] for _ in range(max_gram)]        # per-review n-gram lists, positive files
ngram_neg = [[] for _ in range(max_gram)]        # per-review n-gram lists, negative files
ngram_freq = [{} for _ in range(max_gram)]       # n-gram -> count over all files
ngram_pos_freq = [{} for _ in range(max_gram)]   # n-gram -> count in positive files
ngram_neg_freq = [{} for _ in range(max_gram)]   # n-gram -> count in negative files
ngram_ratio = [{} for _ in range(max_gram)]      # n-gram -> pos/neg count ratio (>= 1)
ngram_idx = [{} for _ in range(max_gram)]        # n-gram -> feature column offset
ngram_weight = [{} for _ in range(max_gram)]     # n-gram -> feature weight

In [457]:
# Filters
# English stop words from NLTK, with 'but' taken out of the set so that it is
# NOT treated as a stop word by the tokenizers below.
stop_words = set(stopwords.words('english'))
stop_words.remove('but')
# Passed to sent_to_tokens / sent_to_ngrams as `puncs_rm`.
# NOTE(review): neither function reads this argument in the code shown.
puncs_rm = '\'"#$!%&()*+,-./:;<=>@[\\]^_`{|}~'

# Filter by frequency range
# One entry per n-gram order (1-gram, 2-gram, 3-gram): n-grams whose total
# count is below `ngram_low` or above `ngram_high` are not indexed.
ngram_low = [10, 9, 8]
ngram_high = [12500, 6250, 3125]

# Filter by ratio >= #(w in pos)/#(w in neg) or #(w in neg)/#(w in pos)
# One threshold per n-gram order.
ngram_ratio_threshold = [2, 2, 3]

In [310]:
# Word preprocessing
# Single shared lemmatizer instance; process_sent() passes it to lemmatize().
word_lemmatizer = WordNetLemmatizer()


_NOUN_TAGS = ('NN', 'NNS', 'NNP', 'NNPS')


def is_noun(tag):
    """Return True when `tag` is one of the noun tags NN, NNS, NNP, NNPS."""
    return tag in _NOUN_TAGS


_VERB_TAGS = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')


def is_verb(tag):
    """Return True when `tag` is one of the verb tags VB, VBD, VBG, VBN, VBP, VBZ."""
    return tag in _VERB_TAGS


_ADVERB_TAGS = ('RB', 'RBR', 'RBS')


def is_adverb(tag):
    """Return True when `tag` is one of the adverb tags RB, RBR, RBS."""
    return tag in _ADVERB_TAGS


_ADJECTIVE_TAGS = ('JJ', 'JJR', 'JJS')


def is_adjective(tag):
    """Return True when `tag` is one of the adjective tags JJ, JJR, JJS."""
    return tag in _ADJECTIVE_TAGS


def penn_to_wn(tag):
    """Map a POS tag to the matching WordNet POS constant.

    Returns wn.ADJ, wn.NOUN, wn.ADV or wn.VERB when the tag is accepted by
    is_adjective, is_noun, is_adverb or is_verb respectively (checked in that
    order), and None for any other tag.
    """
    dispatch = (
        (is_adjective, wn.ADJ),
        (is_noun, wn.NOUN),
        (is_adverb, wn.ADV),
        (is_verb, wn.VERB),
    )
    for accepts, wn_pos in dispatch:
        if accepts(tag):
            return wn_pos
    return None


def lemmatize(word_lemmatizer, tokens):
    """Lemmatize `tokens`, using each token's POS tag when one is available.

    Tokens are tagged with pos_tag(); each tag is converted with penn_to_wn().
    When the conversion yields None the lemmatizer is called with the token
    only, otherwise with the token and the converted POS.

    Returns a list of lemmas, one per input token, in order.
    """
    tagged = pos_tag(tokens)
    lemmas = []
    for token, tag_pair in zip(tokens, tagged):
        wn_pos = penn_to_wn(tag_pair[1])
        if wn_pos is None:
            lemmas.append(word_lemmatizer.lemmatize(token))
        else:
            lemmas.append(word_lemmatizer.lemmatize(token, wn_pos))
    return lemmas

In [311]:
# Distinct ratings parsed from the file names, with how often each occurs,
# for the positive and then the negative training files.
print(np.unique(rates_pos, return_counts=True))
print(np.unique(rates_neg, return_counts=True))

(array([ 7,  8,  9, 10]), array([2496, 3009, 2263, 4732]))
(array([1, 2, 3, 4]), array([5100, 2284, 2420, 2696]))


In [312]:
def is_number(s):
    """Return True when the string `s` is a numeric literal.

    A token counts as a number when float() can parse it AND it contains at
    least one digit. The digit requirement matters because float() also
    accepts the words 'nan', 'inf' and 'infinity' (any case, optional sign),
    which are ordinary word tokens in review text and must not be discarded
    as numbers.
    """
    if not any(ch.isdigit() for ch in s):
        return False
    try:
        float(s)
    except ValueError:
        return False
    return True

    
def is_punc(s):
    """Return True when every character of `s` is in string.punctuation.

    An empty string yields True (there is no non-punctuation character).
    """
    return all(c in punctuation for c in s)


def sent_to_tokens(sent, stop_words, puncs_rm, tokens=None):
    """Return the unigram tokens of a sentence after filtering.

    Parameters:
        sent: raw sentence; only tokenized (stripped, lower-cased) when
            `tokens` is None.
        stop_words: collection of tokens to drop.
        puncs_rm: accepted for interface compatibility; not used here.
        tokens: optional pre-computed token list. An explicitly passed empty
            list is respected and yields an empty result.

    Dropped tokens: numbers (is_number or str.isdigit), stop words, and
    tokens made only of punctuation.
    """
    if tokens is None:
        tokens = word_tokenize(sent.strip().lower())
    return [
        t for t in tokens
        if not (is_number(t) or t.isdigit() or t in stop_words or is_punc(t))
    ]


def sent_to_ngrams(sent, n, stop_words, puncs_rm, tokens=None):
    """Return the space-joined n-grams of a sentence after filtering.

    Parameters:
        sent: raw sentence; only tokenized (stripped, lower-cased) when
            `tokens` is None.
        n: n-gram length.
        stop_words: collection of stop-word tokens.
        puncs_rm: accepted for interface compatibility; not used here.
        tokens: optional pre-computed token list. An explicitly passed empty
            list is respected and yields an empty result.

    An n-gram is kept only when none of its tokens is punctuation or a number
    and at least one of its tokens is not a stop word.
    """
    if tokens is None:
        tokens = word_tokenize(sent.strip().lower())
    ngrams = []
    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        # Any punctuation/number token disqualifies the whole window.
        if any(is_punc(g) or is_number(g) or g.isdigit() for g in window):
            continue
        # Windows made only of stop words (or empty windows) carry no signal.
        if all(g in stop_words for g in window):
            continue
        ngrams.append(" ".join(window))
    return ngrams


def process_sent(sent, max_gram):
    """Turn one review into its filtered 1-gram .. max_gram-gram lists.

    Tags of the form `<.../>` (letters and whitespace only) are replaced by a
    space, the text is stripped, lower-cased, tokenized and lemmatized, and
    the same lemma list feeds every n-gram extractor.

    Returns a list whose element k holds the (k+1)-grams of the review.
    """
    # Remove html tags
    cleaned = re.sub(r'<[a-zA-Z\s]*/>', ' ', sent)
    lemmas = lemmatize(word_lemmatizer, word_tokenize(cleaned.strip().lower()))
    unigrams = sent_to_tokens(cleaned, stop_words, puncs_rm, tokens=lemmas)
    higher_orders = [
        sent_to_ngrams(cleaned, n, stop_words, puncs_rm, tokens=lemmas)
        for n in range(2, max_gram+1)
    ]
    return [unigrams] + higher_orders

In [458]:
# Quick check of lemmatize() on a short sentence (the input is not lower-cased here).
tokens = word_tokenize('We are coming!')
lemmatize(word_lemmatizer, tokens)

['We', 'be', 'come', '!']

In [314]:
# Quick check of process_sent() on one sample review; max_gram=1 means only
# the filtered unigram list is returned.
process_sent("""Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, 
such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much 
closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right 
through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and
their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately 
recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. 
STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. 
What a pity that it isn't!""", 1)

[['bromwell',
  'high',
  'cartoon',
  'comedy',
  'run',
  'time',
  'program',
  'school',
  'life',
  'teacher',
  'year',
  'teaching',
  'profession',
  'lead',
  'believe',
  'bromwell',
  'high',
  "'s",
  'satire',
  'much',
  'closer',
  'reality',
  'teacher',
  'scramble',
  'survive',
  'financially',
  'insightful',
  'student',
  'see',
  'right',
  'pathetic',
  'teacher',
  'pomp',
  'pettiness',
  'whole',
  'situation',
  'remind',
  'school',
  'knew',
  'student',
  'saw',
  'episode',
  'student',
  'repeatedly',
  'try',
  'burn',
  'school',
  'immediately',
  'recall',
  'high',
  'classic',
  'line',
  'inspector',
  "'m",
  'sack',
  'one',
  'teacher',
  'student',
  'welcome',
  'bromwell',
  'high',
  'expect',
  'many',
  'adult',
  'age',
  'think',
  'bromwell',
  'high',
  'far',
  'fetch',
  'pity',
  "n't"]]

In [315]:
def add_freq(grams, n, is_pos):
    """Count the n-grams of one review in the module-level frequency tables.

    Parameters:
        grams: iterable of n-gram strings from one review.
        n: n-gram order (1-based); selects index n-1 of the tables.
        is_pos: True to count into ngram_pos_freq, False for ngram_neg_freq.

    Every n-gram is always counted in ngram_freq as well.
    """
    totals = ngram_freq[n-1]
    pos_counts = ngram_pos_freq[n-1]
    neg_counts = ngram_neg_freq[n-1]
    class_counts = pos_counts if is_pos else neg_counts
    for g in grams:
        totals[g] = totals.get(g, 0) + 1
        class_counts[g] = class_counts.get(g, 0) + 1
    
    
def process_files(files, dir, is_pos):
    """Read the training reviews in `files` and accumulate their n-grams.

    Parameters:
        files: file names inside `dir`.
        dir: directory holding the files.
        is_pos: True when the files are positive reviews, False for negative.

    For each file the first line is read and passed to process_sent(); its
    n-gram lists are appended to ngram_pos / ngram_neg and counted through
    add_freq(). Because the results are appended to module-level lists,
    calling this twice for the same files duplicates their entries.
    """
    per_review = ngram_pos if is_pos else ngram_neg
    for file in files:
        # Explicit encoding: without it open() falls back to the platform's
        # locale encoding, which changes how the text decodes across machines.
        with open(os.path.join(dir, file), 'r', encoding='utf-8') as f:
            review = f.readline()
        res = process_sent(review, max_gram)
        for i in range(max_gram):
            per_review[i].append(res[i])
            add_freq(res[i], i+1, is_pos)

In [316]:
# Tokenize every training review and fill the n-gram lists / frequency tables
# (positive files first, then negative files).
process_files(pos_fs, pos_dir, True)
process_files(neg_fs, neg_dir, False)

In [406]:
def sort(dict, max):
    """Return up to `max` (key, value) pairs of `dict`, largest value first.

    Pairs with equal values keep the dictionary's iteration order.
    """
    ranked = sorted(dict.items(), key=lambda pair: -pair[1])
    return ranked[:max]

# Ten most frequent 2-grams (index 1) in the positive, then the negative reviews.
print(sort(ngram_pos_freq[1], 10))
print(sort(ngram_neg_freq[1], 10))

[("it 's", 8503), ('the film', 7362), ("do n't", 6889), ('this movie', 6680), ('this film', 5364), ('the movie', 5361), ('one of', 4789), ('film be', 3199), ('movie be', 3161), ('the story', 3125)]
[("do n't", 10364), ('this movie', 8904), ("it 's", 8509), ('the movie', 6769), ('the film', 6334), ('this film', 5462), ('movie be', 4442), ("be n't", 4189), ('one of', 3276), ('film be', 3098)]


In [339]:
def compute_ratio(freq, pos_freq, neg_freq):
    """Score each n-gram in `freq` by how one-sided its class counts are.

    For every key the positive count is divided by the negative count, with
    a missing count taken as 1. Ratios below 1 are inverted, so the stored
    value is always >= 1 regardless of which class dominates.

    Returns a dict mapping n-gram -> ratio.
    """
    gram_ratio = {}
    for w in freq:
        ratio = pos_freq.get(w, 1) / neg_freq.get(w, 1)
        gram_ratio[w] = 1/ratio if ratio < 1 else ratio
    return gram_ratio


def contains_digit(s):
    """Return True when at least one character of `s` is a digit."""
    return any(c.isdigit() for c in s)


def process_ngram(freq_dict, pos_dict, neg_dict):
    """Remove, in place, every n-gram that has a digit in any of its tokens.

    The n-gram is split on whitespace; if any token contains a digit the key
    is dropped from `freq_dict` and, when present, from `pos_dict` and
    `neg_dict`.
    """
    to_remove = {
        g for g in freq_dict
        if any(contains_digit(t) for t in g.split())
    }
    for g in to_remove:
        freq_dict.pop(g)
        pos_dict.pop(g, None)
        neg_dict.pop(g, None)
        
    
# For each n-gram order: drop digit-bearing n-grams from the three frequency
# tables, then compute the class ratio of the n-grams that remain.
for i in range(max_gram):
    process_ngram(ngram_freq[i], ngram_pos_freq[i], ngram_neg_freq[i])
    ngram_ratio[i] = compute_ratio(ngram_freq[i], ngram_pos_freq[i], ngram_neg_freq[i])

In [412]:
# Rank all n-grams of each order by ratio (highest first) and show the top ten.
sorted_ngram_ratio = []
for i in range(max_gram):
    sorted_ngram_ratio.append(sort(ngram_ratio[i], len(ngram_ratio[i])))
    print(sorted_ngram_ratio[i][:10])

[('boll', 144.0), ('paulie', 118.0), ('edie', 109.0), ('uwe', 101.0), ('antwone', 84.0), ('thunderbird', 71.0), ('goldsworthy', 65.0), ('beowulf', 60.0), ('gunga', 60.0), ('gypo', 60.0)]
[('uwe boll', 88.0), ('rob roy', 86.0), ('i waste', 83.0), ('terrible movie', 79.0), ('this garbage', 74.0), ('this crap', 71.5), ('prom night', 65.0), ('just awful', 61.0), ('be atrocious', 61.0), ('even worth', 60.0)]
[('bad film i', 144.0), ('be bad than', 91.0), ('bad movie i', 75.5), ('how bad this', 72.0), ('money on this', 69.0), ('skip this one', 63.0), ('the bad film', 61.49999999999999), ('do not waste', 60.0), ('save your money', 60.0), ('your time on', 59.0)]


In [413]:
def gram_to_idx(gram_freq, gram_ratio, low, high, ratio_threshold):
    """Assign consecutive feature indices to the n-grams that pass the filters.

    An n-gram is kept when its frequency is within [low, high] and its ratio
    is not below `ratio_threshold`. Indices start at 0 and follow the
    iteration order of `gram_freq`.

    Returns a dict mapping n-gram -> index.
    """
    gram_idx = {}
    for gram, count in gram_freq.items():
        out_of_range = count < low or count > high
        # The ratio is only looked up for n-grams inside the frequency range.
        if out_of_range or gram_ratio[gram] < ratio_threshold:
            continue
        gram_idx[gram] = len(gram_idx)
    return gram_idx

In [414]:
# Build the feature index of each n-gram order with that order's own
# frequency bounds and ratio threshold.
for i in range(max_gram):
    ngram_idx[i] = gram_to_idx(ngram_freq[i], ngram_ratio[i], ngram_low[i], ngram_high[i], ngram_ratio_threshold[i])

In [415]:
# Total feature dimension = number of indexed n-grams summed over all orders.
d = 0
for i in range(max_gram):
    print(len(ngram_idx[i]))
    d += len(ngram_idx[i])

# Builtin float/int are used as dtypes: the np.float / np.int aliases were
# deprecated and later removed from NumPy, where they raise AttributeError.
# Rows are laid out as all positive reviews followed by all negative reviews.
trainX = np.zeros((pos_n+neg_n, d), dtype=float)
trainY = np.append(np.ones(pos_n, dtype=int), np.zeros(neg_n, dtype=int))

7153
13511
8978


In [442]:
def compute_weight(gram_idx, gram_ratio, n):
    """Compute the feature weight of every indexed n-gram.

    The weight is the n-gram's ratio raised to the power 0.05 * (n + 2), so
    a larger `n` gives a larger exponent.

    Returns a dict mapping n-gram -> weight for the keys of `gram_idx`.
    """
    exponent = 0.05*(n+2)
    return {g: gram_ratio[g]**exponent for g in gram_idx}

In [443]:
# Weight the indexed n-grams of each order. The 0-based order index is passed
# as `n`, so the exponents are 0.05*(i+2) for i = 0 .. max_gram-1.
for i in range(max_gram):
    ngram_weight[i] = compute_weight(ngram_idx[i], ngram_ratio[i], i)

In [444]:
# The 100 highest-weighted 3-grams (index 2).
print(sort(ngram_weight[2], 100))

[('bad film i', 2.701920077041227), ('be bad than', 2.4649509317268694), ('bad movie i', 2.3745941323740714), ('how bad this', 2.352158045049347), ('money on this', 2.332221626160242), ('skip this one', 2.2901720489235826), ('the bad film', 2.279161096122349), ('do not waste', 2.2679331552660544), ('save your money', 2.2679331552660544), ('your time on', 2.2603224696268156), ('hour of my', 2.2603224696268156), ('my money back', 2.2288073840335185), ('total waste of', 2.2288073840335185), ('not waste your', 2.2123568222761167), ('waste your money', 2.2039445754429603), ('bill and ted', 2.19540189742749), ('minute of my', 2.1867241478865562), ('a total waste', 2.17790642448278), ('the only redeeming', 2.17790642448278), ('excuse for a', 2.173443463961896), ('your time with', 2.159830011764466), ('the bad movie', 2.151494226896409), ('acting be horrible', 2.141127368338324), ('well worth watch', 2.1315255132709487), ('piece of crap', 2.1315255132709487), ('possibly the bad', 2.13152551327

In [445]:
def encode(ngram, i, ngram_idx, ngram_weight):
    """Encode review `i` as a weighted n-gram feature vector of length `d`.

    Parameters:
        ngram: per-order lists of per-review n-gram lists (ngram[n][i]).
        i: position of the review inside each ngram[n].
        ngram_idx: per-order dicts mapping n-gram -> column offset.
        ngram_weight: per-order dicts mapping n-gram -> weight.

    The columns of each order are placed after those of the previous orders.
    Within one order, the k-th occurrence of the same n-gram in the review
    contributes weight * 0.5**(k-1); n-grams without an index are skipped.

    Returns a 1-D float array of length `d`.
    """
    # Builtin float: the np.float alias no longer exists in current NumPy.
    encoded = np.zeros(d, dtype=float)
    decay = 0.5
    st = 0  # column where the current order's features start
    for n in range(max_gram):
        idx_dict = ngram_idx[n]
        weight_dict = ngram_weight[n]
        discount = {}  # n-gram -> multiplier for its next occurrence
        for g in ngram[n][i]:
            if g not in idx_dict:
                continue
            disc = discount.get(g, 1)
            encoded[idx_dict[g]+st] += disc*weight_dict[g]
            discount[g] = disc*decay
        st += len(idx_dict)
    return encoded

In [446]:
def batch_encode(trainX):
    """Fill `trainX` in place with the encoded training reviews.

    Rows 0 .. pos_n-1 receive the positive reviews, rows pos_n ..
    pos_n+neg_n-1 the negative ones.
    """
    for row in range(pos_n):
        trainX[row] = encode(ngram_pos, row, ngram_idx, ngram_weight)
    for offset in range(neg_n):
        trainX[pos_n + offset] = encode(ngram_neg, offset, ngram_idx, ngram_weight)

In [447]:
# Convert trainset to encoded vectors
# (trainX is written in place: positive reviews first, then negative ones).
batch_encode(trainX)

In [448]:
# Shuffle the trainset
# A fixed seed makes the permutation identical on every run. The same index
# array is applied to features and labels so each row keeps its label.
SHUFFLE_SEED = 42
indices = np.random.RandomState(SHUFFLE_SEED).permutation(pos_n+neg_n)
trainX = trainX[indices]
trainY = trainY[indices]

In [449]:
# Distinct feature values present in the first training row.
np.unique(trainX[0])

array([ 0.        ,  1.07516943,  1.09149343,  1.09199281,  1.09202729,
        1.09337374,  1.09450312,  1.09724081,  1.09808844,  1.10553592,
        1.11612317,  1.12349136,  1.12981083,  1.13087857,  1.14282139,
        1.14869835,  1.16275263,  1.18022078,  1.19680497,  1.20097687,
        1.20673027,  1.21087631,  1.24103837,  1.25308888,  1.26724697,
        1.27305012,  1.29138103,  1.29400731,  1.32131546,  1.33784144,
        1.33895107,  1.35096004,  1.36082211,  1.36604026,  1.37416605,
        1.39038917,  1.43096908,  1.45225261,  1.51571657,  1.55184557,
        1.6140618 ,  1.62053674,  1.62222457,  1.86358668,  1.87560356])

In [434]:
# NOTE(review): the output recorded under this cell reports C=100, while the
# code passes C=1000 — the cell looks edited after its last run; confirm which
# value is intended. No random_state is set for the 'sag' solver.
lr_clf = LogisticRegression(C=1000, solver='sag', max_iter=100)

lr_clf.fit(trainX, trainY)



LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

In [450]:
# Second classifier; this is the one used for the test predictions.
# NOTE(review): its constructor arguments are the same as lr_clf's in the
# code, so any difference between the two comes from the contents of trainX
# at the time each cell was run — confirm this is intended.
lr_clf_2 = LogisticRegression(C=1000, solver='sag', max_iter=100)

lr_clf_2.fit(trainX, trainY)



LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

In [435]:
# Builtin float as dtype: the np.float alias no longer exists in current NumPy.
testX = np.zeros((test_n, d), dtype=float)
# Per-order lists of per-review n-gram lists for the test files.
ngram_test = [[] for _ in range(max_gram)]


def process_test_files(files, dir):
    """Read the test reviews in `files` and store their n-gram lists.

    Parameters:
        files: file names inside `dir`.
        dir: directory holding the files.

    For each file the first line is read and passed to process_sent(); the
    resulting lists are appended to ngram_test, one entry per file and order.
    Calling this twice for the same files duplicates their entries.
    """
    for file in files:
        # Explicit encoding: without it open() falls back to the platform's
        # locale encoding, which changes how the text decodes across machines.
        with open(os.path.join(dir, file), 'r', encoding='utf-8') as f:
            review = f.readline()
        res = process_sent(review, max_gram)
        for i in range(max_gram):
            ngram_test[i].append(res[i])
                

# Tokenize every test review; results are appended to ngram_test.
process_test_files(test_fs, test_dir)

In [436]:
# Accuracy of lr_clf on trainX/trainY — the training arrays, not held-out data.
lr_clf.score(trainX, trainY)

0.99924000000000002

In [451]:
# Accuracy of lr_clf_2 on trainX/trainY — the training arrays, not held-out data.
lr_clf_2.score(trainX, trainY)

0.97855999999999999

In [452]:
def batch_encode_test(testX):
    """Fill `testX` in place: row k receives the encoding of test review k."""
    n_rows = len(testX)
    for row in range(n_rows):
        testX[row] = encode(ngram_test, row, ngram_idx, ngram_weight)

# Encode all test reviews into testX (written in place).
batch_encode_test(testX)

In [453]:
# Distinct feature values present in the third test row.
np.unique(testX[2])

array([ 0.        ,  1.07683846,  1.08360799,  1.08398133,  1.09622524,
        1.10527855,  1.1320792 ,  1.14021705,  1.16275263,  1.21271526,
        1.22529527,  1.23114441,  1.24186849,  1.24504869,  1.26527867,
        1.28056168,  1.34832695,  1.41884047,  1.43096908,  1.43287697,
        1.46061751,  1.46186644,  1.47577316,  1.48956783,  1.50536863,
        1.57312444,  1.70911529,  1.83841629,  1.96600624,  1.98734075,
        2.18941675])

In [454]:
# Predict a label for every encoded test review with lr_clf_2.
pred = lr_clf_2.predict(testX)

In [455]:
# How many test reviews were assigned to each predicted label.
np.unique(pred, return_counts=True)

(array([0, 1]), array([5385, 5615]))

In [456]:
# One row per prediction. The id column is sized from `pred` rather than a
# hard-coded count, so the frame stays valid if the number of test files changes.
# NOTE(review): ids are positions in test_fs, not values parsed from the file
# names — confirm this matches the id scheme expected by the consumer of the CSV.
data = {'id': np.arange(len(pred)), 'labels': pred}
df = pd.DataFrame(data=data)
df.to_csv('res_2.csv', index=False)