In [None]:
import numpy as np
import pandas as pd
import os
import sklearn.linear_model
import sklearn.pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# NOTE(review): data_dir is assigned here but the reads below use bare
# file names relative to the working directory -- confirm which is intended.
data_dir = 'data_reviews'
x_train_df = pd.read_csv('x_train.csv')
y_train_df = pd.read_csv('y_train.csv')

N, n_cols = x_train_df.shape
print("Shape of x_train_df: (%d, %d)" % (N,n_cols))
print("Shape of y_train_df: %s" % str(y_train_df.shape))

# Print out the first five rows and last five rows, separated by "..."
tr_text_list = x_train_df['text'].values.tolist()
preview_ranges = [np.arange(0, 5), np.arange(N - 5, N)]
for block_no, rows in enumerate(preview_ranges):
    if block_no > 0:
        print("...")
    for row_id in rows:
        text = tr_text_list[row_id]
        # Column 0 of y_train_df holds the label printed next to each review
        print("row %5d | y = %d | %s" % (row_id, y_train_df.values[row_id,0], text))

Shape of x_train_df: (2400, 2)
Shape of y_train_df: (2400, 1)
row     0 | y = 0 | Oh and I forgot to also mention the weird color effect it has on your phone.
row     1 | y = 0 | THAT one didn't work either.
row     2 | y = 0 | Waste of 13 bucks.
row     3 | y = 0 | Product is useless, since it does not have enough charging current to charge the 2 cellphones I was planning to use it with.
row     4 | y = 0 | None of the three sizes they sent with the headset would stay in my ears.
...
row  2395 | y = 1 | The sweet potato fries were very good and seasoned well.
row  2396 | y = 1 | I could eat their bruschetta all day it is devine.
row  2397 | y = 1 | Ambience is perfect.
row  2398 | y = 1 | We ordered the duck rare and it was pink and tender on the inside with a nice char on the outside.
row  2399 | y = 1 | Service was good and the company was better!


In [None]:
# Show the training DataFrame as the cell's rich output (last-expression display).
x_train_df

Unnamed: 0,website_name,text
0,amazon,Oh and I forgot to also mention the weird colo...
1,amazon,THAT one didn't work either.
2,amazon,Waste of 13 bucks.
3,amazon,"Product is useless, since it does not have eno..."
4,amazon,None of the three sizes they sent with the hea...
...,...,...
2395,yelp,The sweet potato fries were very good and seas...
2396,yelp,I could eat their bruschetta all day it is dev...
2397,yelp,Ambience is perfect.
2398,yelp,We ordered the duck rare and it was pink and t...


In [None]:
# Pull the raw review strings out of the 'text' column as a NumPy array;
# later cells iterate over x_train_np to build the vocabulary and features.
x_train_np = x_train_df['text'].to_numpy()
print(x_train_np)

['Oh and I forgot to also mention the weird color effect it has on your phone.'
 "THAT one didn't work either." 'Waste of 13 bucks.' ...
 'Ambience is perfect.'
 'We ordered the duck rare and it was pink and tender on the inside with a nice char on the outside.'
 'Service was good and the company was better!']


In [None]:
# Number of training reviews held in x_train_np.
len(x_train_np)

2400

In [None]:
import nltk
from nltk.corpus import stopwords
import string

# Fetch NLTK's stopword corpus so stopwords.words('english') can be read below.
nltk.download('stopwords')
# English stopwords kept as a set; the word-frequency cell filters tokens
# with `token not in stop_words`.
stop_words = set(stopwords.words('english'))

def tokenize_text(raw_text):
    ''' Transform a plain-text string into a list of cleaned tokens

    We assume that *whitespace* divides tokens. Each whitespace-separated
    chunk has all ASCII punctuation (string.punctuation) removed and is
    lower-cased. Chunks that are empty after punctuation removal, or that
    consist only of numeric characters, are dropped from the result.

    Args
    ----
    raw_text : string

    Returns
    -------
    list_of_tokens : list of strings
        Each element is one cleaned, lower-case token from the provided
        text, in order of appearance. May be empty.
    '''
    # One translation table deletes every punctuation character in a single
    # pass, instead of one str.replace call per punctuation mark per token.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    list_of_tokens = []
    for raw_token in raw_text.split():  # split() divides on whitespace by default
        cur_token = raw_token.translate(strip_punctuation)
        # Skip pure-punctuation chunks (now empty) and pure numbers, so the
        # raw, uncleaned chunk never reaches the caller.
        if cur_token and not cur_token.isnumeric():
            list_of_tokens.append(cur_token.lower())
    return list_of_tokens

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Tally how often each relevant token occurs across the training reviews.
# A token is relevant when it is longer than one character and is not a stopword.
word_freq = dict()
for text in x_train_np:
    tokens = tokenize_text(text)
    for token in tokens:
        if len(token) <= 1 or token in stop_words:
            continue
        word_freq[token] = word_freq.get(token, 0) + 1

# Words ordered from most to least frequent; show the ten most common.
sorted_word_freq = sorted(word_freq, key=word_freq.get, reverse=True)
for w in sorted_word_freq[:10]:
    print("%5d %s" % (word_freq[w], w))

  183 good
  162 great
  141 movie
  137 phone
  122 film
  112 one
  100 place
   99 food
   95 like
   85 service


In [None]:
# Vocabulary: words that occur at least 4 times in the training text,
# kept in descending-frequency order (the order of sorted_word_freq).
vocab_list = [w for w in sorted_word_freq if word_freq[w] >= 4]
# Map each vocabulary word to its column index in the count feature vector.
# A comprehension keeps the index variable out of the notebook's global
# namespace (a top-level loop variable named `id` would shadow the builtin).
vocab_dict = {word: index for index, word in enumerate(vocab_list)}

In [None]:
def transform_text_into_feature_vector(text, vocab_dict):
    ''' Build the bag-of-words count vector for one piece of text

    Args
    ----
    text : string
        Raw text of a single 'review'; it is tokenized with tokenize_text
    vocab_dict : dict with string keys
        Maps each in-vocabulary token to its integer position in the
        output vector. Tokens absent from the dict are ignored.

    Returns
    -------
    count_V : 1D numpy array, shape (V,) = (n_vocab,)
        Entry at position vocab_dict[tok] is the number of times tok
        occurs among the tokens of the provided text
    '''
    count_V = np.zeros(len(vocab_dict))
    # Positions of every in-vocabulary token, repeated once per occurrence
    known_positions = [vocab_dict[tok] for tok in tokenize_text(text) if tok in vocab_dict]
    for vv in known_positions:
        count_V[vv] += 1
    return count_V

In [None]:
# Test: transform_text_into_feature_vector("dinosaur nonsense", vocab_dict)

In [None]:
# Labels for the training reviews
y_tr_N = y_train_df['is_positive_sentiment'].to_numpy()

# One row per review, one column per vocabulary word
N, V = len(x_train_np), len(vocab_list)
x_tr_NV = np.zeros((N, V))
for idx, text in enumerate(x_train_np):
    x_tr_NV[idx, :] = transform_text_into_feature_vector(text, vocab_dict)

print(x_tr_NV.shape)

(2400, 770)


In [None]:
from sklearn.linear_model import LogisticRegression

# Weakly regularized logistic regression on the count features.
# max_iter is raised from 100 to 1000 because the recorded run stopped at the
# 100-iteration limit before the solver converged (see the warning in the
# cell output). NOTE(review): if the warning persists, scale the features or
# raise max_iter further.
clf = LogisticRegression(C=10000.0, max_iter=1000)
clf.fit(x_tr_NV, y_tr_N)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Fraction of training reviews whose predicted label matches the true label
yhat_tr_N = clf.predict(x_tr_NV)
acc = (yhat_tr_N == y_tr_N).mean()

print("Training accuracy: %.3f" % acc)

Training accuracy: 0.949


In [None]:
# Load the test reviews and print the predicted label next to each one
x_test = pd.read_csv('x_test.csv')
x_test_np = x_test['text'].to_numpy()

for line in x_test_np:
    x_V = transform_text_into_feature_vector(line, vocab_dict)
    # The classifier expects a 2D input, so present the single review as one row
    x_1V = x_V.reshape(1, V)
    print(clf.predict(x_1V), line)

In [None]:
## cross_validation

In [None]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): fold_sizes is not read by the search below, which always uses
# cv=5. None of these values is leave-one-out either (that would need one
# fold per training example).
fold_sizes = [3, 5, 10, 15, 20, 25, 30]
# Candidate inverse-regularization strengths, log-spaced from 1e11 to 1e16.
# NOTE(review): these are extremely weak regularization settings -- confirm
# the range is intended.
C_values = np.logspace(11, 16, 10)
print(C_values)

best_score = 0
best_C = None

for C in C_values:
    # Mean 5-fold cross-validated accuracy for this C.
    # The estimator is passed unfitted: cross_val_score fits its own clone on
    # each training fold, so fitting on the full data first is wasted work.
    clf_lr = LogisticRegression(C=C, max_iter=100, solver='lbfgs')
    scores = cross_val_score(clf_lr, x_tr_NV, y_tr_N, cv=5, scoring='accuracy')
    mean_score = scores.mean()

    # Keep the first C that achieves the highest mean accuracy so far
    if mean_score > best_score:
        best_score = mean_score
        best_C = C

print("C: ", best_C)
# Note from an earlier experiment (not reproducible with the current C_values,
# whose smallest value is 1e11): C:  0.4832930238571752 for 25 folds

In [None]:
# Refit on the full training set with the C chosen by the cross-validation
# search above (best_C), rather than a value pasted from an earlier run's
# output, so this cell stays in sync when the search is re-run.
clf = LogisticRegression(C=best_C, max_iter=1000)
clf.fit(x_tr_NV, y_tr_N)

# Training accuracy: fraction of training labels predicted correctly
yhat_tr_N = clf.predict(x_tr_NV)
acc = np.mean(y_tr_N == yhat_tr_N)

print("Training accuracy: %.3f" % acc)

In [None]:
# Load the test reviews and print the refit classifier's label next to each one
x_test = pd.read_csv('x_test.csv')
x_test_np = x_test['text'].to_numpy()

for line in x_test_np:
    x_V = transform_text_into_feature_vector(line, vocab_dict)
    # The classifier expects a 2D input, so present the single review as one row
    x_1V = x_V.reshape(1, V)
    print(clf.predict(x_1V), line)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
# CountVectorizer is given the stopwords as a list
stop_words = list(set(stopwords.words('english')))

# Unigram bag-of-words extractor with no document-frequency pruning
vectorizer = CountVectorizer(
    stop_words=stop_words,
    min_df=1,
    max_df=1.0,
    ngram_range=(1, 1),
)

# Fit the extractor on the raw training text and report the vocabulary size
data = pd.read_csv('x_train.csv')
dataset = data['text'].to_numpy()
X = vectorizer.fit_transform(dataset)

vocab_size = len(vectorizer.vocabulary_)
print(vocab_size)

4378


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
## Solution to Part 1

import pandas as pd
import numpy as np
import sklearn

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import PredefinedSplit, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = list(set(stopwords.words('english')))

# Raw review text and binary sentiment labels
x_train_df = pd.read_csv('x_train.csv')
y_train_df = pd.read_csv('y_train.csv')
x_N = x_train_df['text'].to_numpy()
y_N = y_train_df['is_positive_sentiment'].to_numpy()

# Hold out 20% of the labelled data for validation (fixed seed)
x_tr_N, x_va_N, y_tr_N, y_va_N = train_test_split(
    x_N, y_N, test_size=0.2, random_state=42)

# Pipeline steps: bag-of-words counts -> polynomial feature expansion -> logistic regression.
# The values set here for min_df, degree and C are placeholders: the grid
# below lists candidates for each of them.
bow_step = (
    'my_bow_feature_extractor',
    CountVectorizer(stop_words=stop_words, min_df=1, max_df=1.0, ngram_range=(1,1)),
)
poly_step = ('my_preproc', PolynomialFeatures(10, include_bias=False))
clf_step = ('my_classifier', LogisticRegression(C=1.0, max_iter=100, random_state=101))
my_bow_classifier_pipeline = Pipeline([bow_step, poly_step, clf_step])

# Hyperparameter candidates, keyed by '<step name>__<parameter>'.
# NOTE(review): degree > 1 on bag-of-words counts greatly enlarges the feature
# space -- confirm the cost is acceptable.
my_parameter_grid_by_name = {
    'my_bow_feature_extractor__min_df': [1, 2, 3],
    'my_preproc__degree': list(range(1, 4)),
    'my_classifier__C': np.logspace(0, 6, 10),
}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Run the grid search once per fold count and keep the search whose best
# configuration has the highest mean cross-validated accuracy.
fold_sizes = [5, 10, 25, 50]
data = []

best_model = None
best_score = None

# Columns shown in the summary table. mean_test_score is what
# rank_test_score is derived from; split0_test_score (first fold only) is
# kept for reference.
report_columns = [
    'param_my_bow_feature_extractor__min_df',
    'param_my_preproc__degree',
    'param_my_classifier__C',
    'mean_test_score',
    'split0_test_score',
    'rank_test_score',
]

for k in fold_sizes:
    grid_searcher = GridSearchCV(
        my_bow_classifier_pipeline,
        my_parameter_grid_by_name,
        scoring='accuracy',
        cv=k,
        refit=True)

    grid_searcher.fit(x_tr_N, y_tr_N)

    # Best-ranked configurations first
    gsearch_results_df = pd.DataFrame(grid_searcher.cv_results_).sort_values('rank_test_score')

    # Top 10 configurations for this fold count. assign() returns a new frame,
    # so no column is written into a slice of gsearch_results_df.
    df = gsearch_results_df[report_columns].head(10).assign(fold_size=k)
    data.append(df)

    # Select on the mean score over all folds (best_score_), not on a single
    # fold. `is None` keeps a legitimate score of 0.0 from looking "unset".
    # NOTE(review): mean scores from different fold counts are only roughly
    # comparable.
    if best_score is None or best_score < grid_searcher.best_score_:
        best_score = grid_searcher.best_score_
        best_model = grid_searcher

cv_data = pd.concat(data).sort_values('mean_test_score', ascending=False)
cv_data

In [None]:
# Score the selected search (best_model) on the held-out validation split.
# grid_searcher is only whichever fold count the loop tried last, not
# necessarily the one that was selected.
yhat_va_N = best_model.predict(x_va_N)
acc = np.mean(y_va_N == yhat_va_N)

print("Validation accuracy: %.3f" % acc)

Validation accuracy: 0.779


In [None]:
# Predict class probabilities for the test reviews with the selected search
# (best_model); grid_searcher is only whichever fold count the loop tried last.
x_test = pd.read_csv('x_test.csv')
x_test_np = x_test['text'].to_numpy()
predictions = best_model.predict_proba(x_test_np)

# Print one value per review: the second probability column (index 1)
for prediction in predictions:
    print(prediction[1])

In [None]:
## Solution to Part 2

import pandas as pd
import numpy as np
import sklearn

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = list(set(stopwords.words('english')))

# Raw review text and binary sentiment labels
x_train_df = pd.read_csv('x_train.csv')
y_train_df = pd.read_csv('y_train.csv')
x_N = x_train_df['text'].to_numpy()
y_N = y_train_df['is_positive_sentiment'].to_numpy()

# Hold out 20% of the labelled data for validation (fixed seed)
x_tr_N, x_va_N, y_tr_N, y_va_N = train_test_split(x_N, y_N, test_size=0.2, random_state=42)

# Pipeline: bag-of-words counts -> random forest.
# random_state is fixed so the forest, and therefore the grid-search ranking
# and the reported accuracy, are reproducible across runs.
# min_df and ngram_range set here are placeholders: the grid below lists
# candidates for both.
my_bow_classifier_pipeline = Pipeline([
    ('my_bow_feature_extractor', CountVectorizer(stop_words=stop_words, min_df=1, max_df=1.0, ngram_range=(1,2))),
    ('my_classifier', RandomForestClassifier(random_state=101)),
])

# Hyperparameter candidates, keyed by '<step name>__<parameter>'
my_parameter_grid_by_name = dict()
my_parameter_grid_by_name['my_bow_feature_extractor__min_df'] = [1, 2, 3]
my_parameter_grid_by_name['my_bow_feature_extractor__ngram_range'] = [(1,1), (1,2), (2,2)]
my_parameter_grid_by_name['my_classifier__n_estimators'] = [50, 100, 150, 200]
# Depth limit per tree; None leaves the depth unlimited
my_parameter_grid_by_name['my_classifier__max_depth'] = [2, 4, 8, None]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Run the grid search once per fold count and keep the search whose best
# configuration has the highest mean cross-validated accuracy.
fold_sizes = [5, 10, 25, 50]
data = []

best_model = None
best_score = None

# Columns shown in the summary table. mean_test_score is what
# rank_test_score is derived from; split0_test_score (first fold only) is
# kept for reference.
report_columns = [
    'param_my_bow_feature_extractor__min_df',
    'param_my_bow_feature_extractor__ngram_range',
    'param_my_classifier__n_estimators',
    'param_my_classifier__max_depth',
    'mean_test_score',
    'split0_test_score',
    'rank_test_score',
]

for k in fold_sizes:
    grid_searcher = GridSearchCV(
        my_bow_classifier_pipeline,
        my_parameter_grid_by_name,
        scoring='accuracy',
        cv=k,
        refit=True)

    grid_searcher.fit(x_tr_N, y_tr_N)

    # Best-ranked configurations first
    gsearch_results_df = pd.DataFrame(grid_searcher.cv_results_).sort_values('rank_test_score')

    # Top 10 configurations for this fold count. assign() returns a new frame,
    # so no column is written into a slice of gsearch_results_df.
    df = gsearch_results_df[report_columns].head(10).assign(fold_size=k)
    data.append(df)

    # Select on the mean score over all folds (best_score_), not on a single
    # fold. `is None` keeps a legitimate score of 0.0 from looking "unset".
    # NOTE(review): mean scores from different fold counts are only roughly
    # comparable.
    if best_score is None or best_score < grid_searcher.best_score_:
        best_score = grid_searcher.best_score_
        best_model = grid_searcher

cv_data = pd.concat(data).sort_values('mean_test_score', ascending=False)
cv_data

In [None]:
# Score the selected search (best_model) on the held-out validation split.
# grid_searcher is only whichever fold count the loop tried last, not
# necessarily the one that was selected.
yhat_va_N = best_model.predict(x_va_N)
acc = np.mean(y_va_N == yhat_va_N)

print("Validation accuracy: %.3f" % acc)

Validation accuracy: 0.752
