## Import Libraries

In [44]:
import os
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
ps = PorterStemmer()
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from sklearn.metrics import accuracy_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stopwords = set(stopwords.words("english"))
ps = PorterStemmer()

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

### A. Load the data

In [45]:
def load_data(split_name='train', columns=['text', 'stars']):
    try:
        print(f"select [{', '.join(columns)}] columns from the {split_name} split")
        df = pd.read_csv(f'data_2021_spring/{split_name}.csv')
        df = df.loc[:,columns]
        print("succeed!")
        return df
    except:
        print("Failed, then try to ")
        print(f"select all columns from the {split_name} split")
        df = pd.read_csv(f'data_2021_spring/{split_name}.csv')
        return df

In [46]:
train_df = load_data('train', columns=['text'])

select [text] columns from the train split
succeed!


In [47]:
train_df.head()

Unnamed: 0,text
0,Nice to have a diner still around. Food was go...
1,"Tried this a while back, got the fried chicken..."
2,I expected more pork selections on menu. Food ...
3,YUMMY!!! This place is phenomenal. It is Price...
4,The Truffle Macaroni & Cheese and Potatoes Au ...


In [48]:
test_df = load_data('test')

select [text, stars] columns from the test split
Failed, then try to 
select all columns from the test split


In [49]:
len(test_df)

2000

## 2. Preprocessing

In [53]:
def lower(s):
    """
    :param s: a string.
    return a string with lower characters
    Note that we allow the input to be nested string of a list.
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: 'text mining is to identify useful information.'
    """
    if isinstance(s, list):
        return [lower(t) for t in s]
    if isinstance(s, str):
        return s.lower()
    else:
        raise NotImplementedError("unknown datatype")


def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)


def stem(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of stemmed words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     results.append(ps.stem(token))
    # return results

    return [ps.stem(token) for token in tokens]

def n_gram(tokens, n=1):
    """
    :param tokens: a list of tokens, type: list
    :param n: the corresponding n-gram, type: int
    return a list of n-gram tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    if n == 1:
        return tokens
    else:
        results = list()
        for i in range(len(tokens)-n+1):
            # tokens[i:i+n] will return a sublist from i th to i+n th (i+n th is not included)
            results.append(" ".join(tokens[i:i+n]))
        return results

def filter_stopwords(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     if token not in stopwords and not token.isnumeric():
    #         results.append(token)
    # return results

    return [token for token in tokens if token not in stopwords and not token.isnumeric()]

import numpy as np

def get_onehot_vector(feats, feats_dict):
    """
    :param data: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(len(feats_dict), dtype=np.float)
    for f in feats:
        # get the feature index, return -1 if the feature is not existed
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            # set the corresponding element as 1
            vector[f_idx] = 1
    return vector

## 3. Baselines

In [60]:
train_df = load_data('train')
valid_df = load_data('valid')

select [text, stars] columns from the train split
succeed!
select [text, stars] columns from the valid split
succeed!


The split above is what we have done for you. You can use the data as you like.

In [62]:
x_train = train_df['text']
y_train = train_df['stars']

In [63]:
#grid with only analyzers == word and stopwords
tfidf = TfidfVectorizer(tokenizer=tokenize)
lr = LogisticRegression()
steps = [('tfidf', tfidf),('lr', lr)]
pipe = Pipeline(steps)
    # Creating Parameter Space
# parameter for tdidf  
analyzer  = ['word']
ngram_range = [(1,1),(1,2),(2,2)]
norm = ['l1', 'l2']
stop_words=['english','None']
# parameter for Logistic Regression 
penalty = ['l1', 'l2', 'elasticnet', 'none']
random_state = [42]
n_jobs = [-1]
C = np.logspace(-3,3,3)
solver = ['saga']


    # Creating a dictionary of all the parameter options 
parameters = dict(tfidf__analyzer=analyzer,
                  tfidf__ngram_range=ngram_range,
                  tfidf__norm=norm,
                  tfidf__stop_words=stop_words,
                  lr__penalty=penalty,
                  lr__random_state=random_state,
                  lr__n_jobs=n_jobs,
                  lr__solver=solver,
                  lr__C=C
                 )

cv = 5

In [32]:
%%time
    # Creating a grid search object
model = GridSearchCV(pipe, parameters,scoring= 'accuracy', cv=cv, n_jobs = -1, verbose = 30)
    # Fitting the grid search
model_result=model.fit(x_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 151 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 153 tasks      | elapsed:  5.8min
[Paralle

[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 275 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 277 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 279 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 281 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 282 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 283 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 285 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 286 tasks      | elapsed: 12.9min
[Paralle

[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done 409 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done 410 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done 411 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done 412 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 413 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 414 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 415 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 417 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Done 419 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Done 420 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Done 421 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 422 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 423 tasks      | elapsed: 17.9min
[Paralle

[Parallel(n_jobs=-1)]: Done 540 tasks      | elapsed: 316.9min
[Parallel(n_jobs=-1)]: Done 541 tasks      | elapsed: 317.2min
[Parallel(n_jobs=-1)]: Done 542 tasks      | elapsed: 317.5min
[Parallel(n_jobs=-1)]: Done 543 tasks      | elapsed: 317.5min
[Parallel(n_jobs=-1)]: Done 544 tasks      | elapsed: 317.5min
[Parallel(n_jobs=-1)]: Done 545 tasks      | elapsed: 317.5min
[Parallel(n_jobs=-1)]: Done 546 tasks      | elapsed: 317.5min
[Parallel(n_jobs=-1)]: Done 547 tasks      | elapsed: 317.5min
[Parallel(n_jobs=-1)]: Done 548 tasks      | elapsed: 317.8min
[Parallel(n_jobs=-1)]: Done 549 tasks      | elapsed: 317.8min
[Parallel(n_jobs=-1)]: Done 550 tasks      | elapsed: 318.0min
[Parallel(n_jobs=-1)]: Done 551 tasks      | elapsed: 318.1min
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed: 318.3min
[Parallel(n_jobs=-1)]: Done 553 tasks      | elapsed: 318.3min
[Parallel(n_jobs=-1)]: Done 554 tasks      | elapsed: 318.3min
[Parallel(n_jobs=-1)]: Done 555 tasks      | elapsed: 3

[Parallel(n_jobs=-1)]: Done 671 tasks      | elapsed: 327.0min
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed: 327.2min
[Parallel(n_jobs=-1)]: Done 673 tasks      | elapsed: 327.3min
[Parallel(n_jobs=-1)]: Done 674 tasks      | elapsed: 327.3min
[Parallel(n_jobs=-1)]: Done 675 tasks      | elapsed: 327.3min
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed: 327.3min
[Parallel(n_jobs=-1)]: Done 677 tasks      | elapsed: 327.3min
[Parallel(n_jobs=-1)]: Done 678 tasks      | elapsed: 327.3min
[Parallel(n_jobs=-1)]: Done 679 tasks      | elapsed: 327.7min
[Parallel(n_jobs=-1)]: Done 680 tasks      | elapsed: 327.8min
[Parallel(n_jobs=-1)]: Done 681 tasks      | elapsed: 328.2min
[Parallel(n_jobs=-1)]: Done 682 tasks      | elapsed: 328.3min
[Parallel(n_jobs=-1)]: Done 683 tasks      | elapsed: 328.3min
[Parallel(n_jobs=-1)]: Done 684 tasks      | elapsed: 328.3min
[Parallel(n_jobs=-1)]: Done 685 tasks      | elapsed: 328.3min
[Parallel(n_jobs=-1)]: Done 686 tasks      | elapsed: 3

Wall time: 5h 49min 49s


In [33]:
print("Best: %f using %s" % (model_result.best_score_, model_result.best_params_))

Best: 0.577000 using {'lr__C': 1.0, 'lr__n_jobs': -1, 'lr__penalty': 'l2', 'lr__random_state': 42, 'lr__solver': 'saga', 'tfidf__analyzer': 'word', 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__stop_words': 'english'}


In [64]:
x_valid = valid_df['text']
y_valid = valid_df['stars']
tfidf = TfidfVectorizer(tokenizer=tokenize,analyzer='word',norm='l2',ngram_range=(1,2),stop_words='english')
lr = LogisticRegression(C=1.0,n_jobs=-1,penalty='l2',random_state=42,solver='saga')
steps = [('tfidf', tfidf),('lr', lr)]
pipe = Pipeline(steps)
pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_valid)
print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))

              precision    recall  f1-score   support

           1       0.68      0.88      0.76       517
           2       0.43      0.15      0.23       278
           3       0.42      0.43      0.42       344
           4       0.51      0.56      0.53       427
           5       0.72      0.68      0.70       434

    accuracy                           0.59      2000
   macro avg       0.55      0.54      0.53      2000
weighted avg       0.57      0.59      0.57      2000




[[455  14  28  11   9]
 [122  43  83  25   5]
 [ 52  35 147  95  15]
 [ 22   5  75 237  88]
 [ 22   4  15  98 295]]
accuracy 0.5885


###  The best Logistic Regression model is
###  Logistic Regression parameter: 'Inverse regularization parameter': 1.0 'lr__penalty': 'l2', 'lr__solver': 'saga'
###  TDIDF parameter:'tfidf__analyzer': 'word', 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__stop_words': 'english'
### The accuracy is 0.5770