# 0. Package Dependency

- [nltk](https://www.nltk.org)
- [sklearn](http://scikit-learn.org/stable/)

# 1. Data Preprocessing

In [12]:
# Load packages
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import WordPunctTokenizer

# Load data
trn_texts = open("trn-reviews.txt").read().strip().lower().split("\n")
trn_labels = open("trn-labels.txt").read().strip().lower().split("\n")
print("Training data ...")
print("%d, %d" % (len(trn_texts), len(trn_labels)))
print(trn_texts[:2])
print(trn_labels[:5])

dev_texts = open("dev-reviews.txt").read().strip().split("\n")
dev_labels = open("dev-labels.txt").read().strip().split("\n")
print("Development data ...")
print("%d, %d" % (len(dev_texts), len(dev_labels)))
dev_tokens = WordPunctTokenizer().tokenize_sents(dev_texts)
trn_tokens = WordPunctTokenizer().tokenize_sents(trn_texts)

Training data ...
40000, 40000
["love the staff, love the meat, love the place. prepare for a long line around lunch or dinner hours. they ask you how you want you meat, lean or something maybe, i can't remember. just say you don't want it too fatty. get a half sour pickle and a hot pepper. hand cut french fries too.", "super simple place but amazing nonetheless. it's been around since the 30's and they still serve the same thing they started with: a bologna and salami sandwich with mustard. staff was very helpful and friendly."]
['5', '5', '5', '5', '4']
Development data ...
5000, 5000


In [18]:
import numpy as np
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r', encoding='utf-8')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model
glove6B = loadGloveModel('glove.6B/glove.6B.50d.txt')

Loading Glove Model
Done. 400000  words loaded!


In [None]:
print(glove6B['the'])

In [None]:
print(123)

In [15]:
trn_sentence_vectors = []
for tokens in trn_tokens:
    sentenceVector = np.zeros(50)
    tokenCount = 0
    for token in token:
        tokenVectors.append(glove6B[token])
        tokenCount += 1
    trn_sentence_vectors.append(np.array(to))

['Great', 'Great', 'Great', 'place', 'to', 'eat', '!', 'I', 'have', 'a', 'friend', 'in', 'the', 'casino', 'industry', 'that', 'always', 'gets', 'the', 'hook', 'up', 'here', '.', 'I', "'", 'm', 'not', 'a', 'seafood', 'person', 'but', 'its', 'unlimited', 'crab', 'legs', 'first', 'of', 'all', 'so', 'theres', 'that', '.', 'I', 'always', 'see', 'my', 'friends', 'grab', 'like', '20', '+', 'of', 'those', 'things', '.', 'I', 'usually', 'get', 'the', 'prime', 'rib', 'and', 'other', 'things', 'that', 'go', 'with', 'it', '.', 'Nothing', 'besides', 'the', 'prime', 'rib', ',', 'crab', 'legs', ',', 'and', 'lamb', 'really', 'stick', 'out', 'as', 'far', 'as', 'meat', '.', 'There', 'is', 'a', 'asian', 'food', 'section', ',', 'dessert', ',', 'and', 'from', 'what', 'I', 'can', 'tell', 'a', 'cross', 'between', 'greek', 'and', 'indian', 'section', '.', 'The', 'food', 'was', 'great', '.', 'You', 'can', 'order', 'booze', 'of', 'course', 'but', 'it', 'cost', 'extra', '.', 'This', 'is', 'the', 'pricing', ';', 

40000

# 2. Feature Extraction

Please refer to the document of [_CountVectorizer_](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) for the parameters of this function. 

In [78]:
choice = 3

if choice == 1:
    print("Preprocessing without any feature selection")
    vectorizer = CountVectorizer(lowercase=False)
    # vocab size 77166
elif choice == 2:
    print("Lowercasing all the tokens")
    vectorizer = CountVectorizer(lowercase=True)
    # vocab size 60610
elif choice == 3:
    vectorizer = CountVectorizer(lowercase=True, min_df=0.017, max_df=0.95)
elif choice == 4:
    vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2), min_df=0.017, max_df=0.95)
else: 
    raise ValueError("Unrecognized value: choice = %d" % choice)

trn_data = vectorizer.fit_transform(trn_texts)
print(trn_data.shape)
# print(trn_data[0])
dev_data = vectorizer.transform(dev_texts)
print(dev_data.shape)

(40000, 686)
(5000, 686)


# 3. Logistic Regression

Please refer to the document of [_LogisticRegression_](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) for the parameters of this function. 

In [96]:
from sklearn.linear_model import LogisticRegression

# Define a LR classifier
classifier = LogisticRegression(solver="liblinear", multi_class="ovr", penalty='l1')
classifier.fit(trn_data, trn_labels)

# Measure the performance on training and dev data
print("Training accuracy = %f" % classifier.score(trn_data, trn_labels))
print("Dev accuracy = %f", classifier.score(dev_data, dev_labels))

Training accuracy = 0.628750
Dev accuracy = %f 0.63


# Accuracy log with dfferent parameters
lower case, stop words
- Training accuracy = 0.909475
- Dev accuracy = %f 0.6014

lower case
- Training accuracy = 0.917175
- Dev accuracy = %f 0.6146

lowercase=True, stop_words='english', max_features=20000
- Training accuracy = 0.879750
- Dev accuracy = %f 0.5976

lowercase=True, max_df=0.8
- Training accuracy = 0.917475
- Dev accuracy = %f 0.6136

lowercase=True, max_df=0.7
- Training accuracy = 0.917275
- Dev accuracy = %f 0.6132

lowercase=True, max_df=0.9
- Training accuracy = 0.917225
- Dev accuracy = %f 0.6148

lowercase=True, max_df=0.95
- Training accuracy = 0.917175
- Dev accuracy = %f 0.6146

lowercase=True, max_df=0.925
- Training accuracy = 0.917175
- Dev accuracy = %f 0.6146

lowercase=True, max_df=0.875
- Training accuracy = 0.917475
- Dev accuracy = %f 0.6136

lowercase=True, min_df=0.01
- Training accuracy = 0.652525
- Dev accuracy = %f 0.6242

lowercase=True, min_df=0.02
- Training accuracy = 0.623700
- Dev accuracy = %f 0.6246

lowercase=True, min_df=0.03
- Training accuracy = 0.604925
- Dev accuracy = %f 0.6104

lowercase=True, min_df=0.015
- Training accuracy = 0.633650
- Dev accuracy = %f 0.6264

lowercase=True, min_df=0.0175
- Training accuracy = 0.628725
- Dev accuracy = %f 0.6294

lowercase=True, min_df=0.016
- Training accuracy = 0.631175
- Dev accuracy = %f 0.6252

lowercase=True, min_df=0.017
- Training accuracy = 0.629000
- Dev accuracy = %f 0.6294

lowercase=True, min_df=0.018
- Training accuracy = 0.628525
- Dev accuracy = %f 0.6286

lowercase=True, min_df=0.017, max_df=0.95
- Training accuracy = 0.629000
- Dev accuracy = %f 0.6294