# 0. Package Dependencies

- [nltk](https://www.nltk.org)
- [sklearn](https://scikit-learn.org/stable/)

# 1. Data Preprocessing

In [1]:
# Load packages
from sklearn.feature_extraction.text import CountVectorizer

# Load data
def _read_lines(path):
    """Read the text file at `path` and return its content split on newlines.

    Leading and trailing whitespace of the whole file is stripped before
    splitting, so a trailing newline does not produce an empty last entry.
    """
    # The context manager closes the handle as soon as the content is read,
    # instead of leaving it open until garbage collection.
    # NOTE(review): relies on the platform default encoding -- pass an explicit
    # encoding here if the data files are known to use one.
    with open(path) as handle:
        return handle.read().strip().split("\n")


trn_texts = _read_lines("trn-reviews.txt")
trn_labels = _read_lines("trn-labels.txt")
print("Training data ...")
print("%d, %d" % (len(trn_texts), len(trn_labels)))
# Every review needs exactly one label; fail here rather than later in training.
assert len(trn_texts) == len(trn_labels), "training reviews and labels differ in length"

dev_texts = _read_lines("dev-reviews.txt")
dev_labels = _read_lines("dev-labels.txt")
print("Development data ...")
print("%d, %d" % (len(dev_texts), len(dev_labels)))
assert len(dev_texts) == len(dev_labels), "development reviews and labels differ in length"

Training data ...
40000, 40000
Development data ...
5000, 5000


# 2. Feature Extraction

Please refer to the documentation of [_CountVectorizer_](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) for the parameters of this class.

In [3]:
# Feature-extraction presets, keyed by `choice`:
#   choice -> (description printed for the run, CountVectorizer keyword arguments)
VECTORIZER_PRESETS = {
    # vocab size 77166
    1: ("Preprocessing without any feature selection",
        {"lowercase": False}),
    # vocab size 60610
    2: ("Lowercasing all the tokens",
        {"lowercase": True}),
    3: ("Lowercasing all the tokens, removing low frequency words, adjusting the ngram-range, limiting features",
        {"lowercase": True, "min_df": 7e-5, "ngram_range": (1, 2), "max_features": 10000}),
    4: ("Lowercasing all the tokens, removing low frequency words",
        {"lowercase": True, "min_df": 7e-5}),
}

# Select which preset to run.
choice = 3
if choice not in VECTORIZER_PRESETS:
    # %r (not %d) so that a non-numeric choice still produces this ValueError
    # instead of a TypeError raised by the string formatting itself.
    raise ValueError("Unrecognized value: choice = %r" % (choice,))

description, vectorizer_kwargs = VECTORIZER_PRESETS[choice]
print(description)
vectorizer = CountVectorizer(**vectorizer_kwargs)

# Fit the vocabulary on the training texts only, then reuse that same
# vocabulary to transform the development texts.
trn_data = vectorizer.fit_transform(trn_texts)
print(trn_data.shape)
# Debugging aids:
# print(vectorizer.get_feature_names_out())
# print(trn_data[:3,])
dev_data = vectorizer.transform(dev_texts)
print(dev_data.shape)

Lowercasing all the tokens, removing low frequency words, adjusting the ngram-range, limiting features
(40000, 10000)
(5000, 10000)


# 3. Logistic Regression

Please refer to the documentation of [_LogisticRegression_](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) for the parameters of this class.

In [4]:
from sklearn.linear_model import LogisticRegression

# Define a LR classifier
# random_state pins the data shuffling of the stochastic "sag" solver so that
# rerunning the notebook reproduces the same scores.
# NOTE(review): `multi_class` is reported as deprecated in recent scikit-learn
# releases (1.5+) -- confirm against the installed version before upgrading.
classifier = LogisticRegression(solver="sag", multi_class="multinomial", penalty="l2", random_state=0)
classifier.fit(trn_data, trn_labels)

# Measure the performance on training and dev data
print("Training accuracy = %f" % classifier.score(trn_data, trn_labels))
# Use % formatting here (a comma would print the literal "%f" followed by the raw score).
print("Dev accuracy = %f" % classifier.score(dev_data, dev_labels))

Training accuracy = 0.812050
Dev accuracy = %f 0.6504


