In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import nltk 

In [2]:
def load_data(split_name='train', columns=['text', 'stars'], folder='data'):
    '''
        "split_name" may be set as 'train', 'valid' or 'test' to load the corresponding dataset.
        
        You may also specify the column names to load any columns in the .csv data file.
        Among many, "text" can be used as model input, and "stars" column is the labels (sentiment). 
        If you like, you are free to use columns other than "text" for prediction.
    '''
    try:
        print(f"select [{', '.join(columns)}] columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        df = df.loc[:,columns]
        print("Success")
        return df
    except:
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        return df

In [3]:
train_df = load_data('train', columns=['text', 'stars'])
valid_df = load_data('valid', columns=['text', 'stars'])
# the test set labels (the 'stars' column) are not available! So the following code will instead return all columns
test_df = load_data('test', columns=['text', 'stars'])

select [text, stars] columns from the train split
Success
select [text, stars] columns from the valid split
Success
select [text, stars] columns from the test split
Failed loading specified columns... Returning all columns from the test split


In [4]:
def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)

In [5]:
# Prepare the data.
# As an example, we only use the text data. 
x_train = train_df['text']
y_train = train_df['stars']
  
x_valid = valid_df['text']
y_valid = valid_df['stars']

x_test = test_df['text']


In [6]:
tfidf = TfidfVectorizer(tokenizer=tokenize) 
lr2 = LogisticRegression()
steps = [('tfidf', tfidf),('lr', lr2)]
pipe2 = Pipeline(steps)

In [8]:
c = np.logspace(-4, 4, 50)
penalty = ['l2']
max_df = [0.25, 0.5, 0.75]
parameters = dict(tfidf__max_df = max_df, lr__C = c, lr__penalty = penalty)
clf_GS = GridSearchCV(pipe2, parameters, cv=2, n_jobs=-1)
best_clf = clf_GS.fit(x_train, y_train)

print('Best Penalty:', clf_GS.best_estimator_.get_params()['lr__penalty'])
print('Best C:', clf_GS.best_estimator_.get_params()['lr__C'])
print('Max df:', clf_GS.best_estimator_.get_params()['tfidf__max_df'])

Best Penalty: l2
Best C: 1.2067926406393288
Max df: 0.75


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
tfidf = TfidfVectorizer(tokenizer=tokenize, max_df=0.75, ngram_range=(1,2)) 
lr2 = LogisticRegression(C=159.99, penalty='l2')
steps = [('tfidf', tfidf),('lr', lr2)]
pipe2 = Pipeline(steps)

In [10]:
pipe2.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.75, ngram_range=(1, 2),
                                 tokenizer=<function tokenize at 0x7f9ea135ce50>)),
                ('lr', LogisticRegression(C=159.99))])

In [8]:
y_pred = pipe2.predict(x_valid)
print(classification_report(y_valid, y_pred, digits = 4))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))

              precision    recall  f1-score   support

           1     0.7209    0.8333    0.7730       282
           2     0.3398    0.2574    0.2929       136
           3     0.4633    0.3868    0.4216       212
           4     0.5199    0.4764    0.4972       466
           5     0.7766    0.8308    0.8028       904

    accuracy                         0.6625      2000
   macro avg     0.5641    0.5569    0.5575      2000
weighted avg     0.6460    0.6625    0.6523      2000




[[235  24   9   3  11]
 [ 45  35  37  14   5]
 [ 20  31  82  62  17]
 [ 11  12  38 222 183]
 [ 15   1  11 126 751]]
accuracy 0.6625


 We find the second model (pipe2) has higher accuracy, then we use the second model to make predictions on test data. In practice, you may not only focus on the accuracy, but also other metrics (precision, recall, f1), since the label distribution is not always balanced.

### (3) Generate predictions on the test set

In [9]:
predict_test = pipe2.predict(x_test)

In [10]:
predict_test

array([5, 5, 1, ..., 1, 1, 2])

In [11]:
pred_df = pd.DataFrame({'stars': predict_test, 'review_id': test_df['review_id']})
pred_df.to_csv('pred.csv', index=False)

 Then you may submit the predictions `pred.csv` on the test set. 