# 1. Word Embeddings

In [1]:
# Load packages
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import WordPunctTokenizer

# Load data
trn_texts = open("trn-reviews.txt").read().strip().lower().split("\n")
trn_labels = open("trn-labels.txt").read().strip().lower().split("\n")
print("Training data ...")
print("%d, %d" % (len(trn_texts), len(trn_labels)))
print(trn_texts[:2])
print(trn_labels[:5])

dev_texts = open("dev-reviews.txt").read().strip().split("\n")
dev_labels = open("dev-labels.txt").read().strip().split("\n")
print("Development data ...")
print("%d, %d" % (len(dev_texts), len(dev_labels)))
dev_tokens = WordPunctTokenizer().tokenize_sents(dev_texts)
trn_tokens = WordPunctTokenizer().tokenize_sents(trn_texts)

Training data ...
40000, 40000
["love the staff, love the meat, love the place. prepare for a long line around lunch or dinner hours. they ask you how you want you meat, lean or something maybe, i can't remember. just say you don't want it too fatty. get a half sour pickle and a hot pepper. hand cut french fries too.", "super simple place but amazing nonetheless. it's been around since the 30's and they still serve the same thing they started with: a bologna and salami sandwich with mustard. staff was very helpful and friendly."]
['5', '5', '5', '5', '4']
Development data ...
5000, 5000


In [2]:
import numpy as np
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r', encoding='utf-8')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model
glove6B = loadGloveModel('glove.6B/glove.6B.50d.txt')

Loading Glove Model
Done. 400000  words loaded!


In [3]:
def sentenceTokensToVectors(sentenceTokens, model):
    trn_sentence_vectors = []
    for tokens in sentenceTokens:
        sentenceVector = np.zeros(50)
        tokenCount = 0
        for token in tokens:
            tokenCount += 1
            if token in glove6B:
                sentenceVector += glove6B[token]
            else:
                sentenceVector += glove6B['unk']
        if tokenCount > 0:
            sentenceVector /= tokenCount
        trn_sentence_vectors.append(sentenceVector)
    return np.array(trn_sentence_vectors)
trn_sentence_vectors = sentenceTokensToVectors(trn_tokens, glove6B)
dev_sentence_vectors = sentenceTokensToVectors(dev_tokens, glove6B)

## 1.1 Logistic Regression with embedding

In [4]:
from sklearn.linear_model import LogisticRegression

# Define a LR classifier
classifier = LogisticRegression()
classifier.fit(trn_sentence_vectors, trn_labels)

# Measure the performance on training and dev data
print("Training accuracy = %f" % classifier.score(trn_sentence_vectors, trn_labels))
print("Dev accuracy = %f", classifier.score(dev_sentence_vectors, dev_labels))



Training accuracy = 0.515975
Dev accuracy = %f 0.5322


## 1.2 CountVectorizer with embedding

In [5]:
choice = 3

if choice == 1:
    print("Preprocessing without any feature selection")
    vectorizer = CountVectorizer(lowercase=False)
    # vocab size 77166
elif choice == 2:
    print("Lowercasing all the tokens")
    vectorizer = CountVectorizer(lowercase=True)
    # vocab size 60610
elif choice == 3:
    vectorizer = CountVectorizer(lowercase=True, min_df=0.017, max_df=0.95)
elif choice == 4:
    vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2), min_df=0.017, max_df=0.95)
else: 
    raise ValueError("Unrecognized value: choice = %d" % choice)

trn_data = vectorizer.fit_transform(trn_texts).toarray()
dev_data = vectorizer.transform(dev_texts).toarray()
print(dev_data.shape)

(5000, 686)


In [6]:
#augment data
combined_trn_data = np.concatenate((trn_data, trn_sentence_vectors), axis=1)
combined_dev_data = np.concatenate((dev_data, dev_sentence_vectors), axis=1)

In [7]:
# Define a LR classifier default
classifier = LogisticRegression()
classifier.fit(combined_trn_data, trn_labels)

# Measure the performance on training and dev data
print("Training accuracy = %f" % classifier.score(combined_trn_data, trn_labels))
print("Dev accuracy = %f", classifier.score(combined_dev_data, dev_labels))



Training accuracy = 0.641750
Dev accuracy = %f 0.6314


## 1.3 Best Parameters

In [96]:
from sklearn.linear_model import LogisticRegression

# Define a LR classifier
classifier = LogisticRegression(penalty='l1', C=0.2)
classifier.fit(combined_trn_data, trn_labels)

# Measure the performance on training and dev data
print("Training accuracy = %f" % classifier.score(combined_trn_data, trn_labels))
print("Dev accuracy = %f", classifier.score(combined_dev_data, dev_labels))



Training accuracy = 0.638350
Dev accuracy = %f 0.6328
