## 1. Loading data and saving predictions

In [15]:
import pandas as pd

### A. Loading data

In [16]:
def load_data(split_name='train', columns=['text', 'label'], folder='data'):
    '''
        "split_name" may be set as 'train', 'valid' or 'test' to load the corresponding dataset.
        
        You may also specify the column names to load any columns in the .csv data file.
        Among many, "text" can be used as model input, and "label" column is the labels (sentiment). 
    '''
    try:
        print(f"select [{', '.join(columns)}] columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        df = df.loc[:,columns]
        print("Success")
        return df
    except:
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        return df

In [17]:
train_df = load_data('train', columns=['text', 'label'], folder='data')
valid_df = load_data('valid', columns=['id','text', 'label'], folder='data') ### 'id' column is included for evaluate.py
# the test set labels (the 'label' column) are unavailable! So the following code will instead return all columns
test_df = load_data('test_no_label', columns=['id', 'text'], folder='data')

select [text, label] columns from the train split
Success
select [id, text, label] columns from the valid split
Success
select [id, text] columns from the test_no_label split
Success


In [18]:
train_df.head()  #### Verify.

Unnamed: 0,text,label
0,Two Wolfgang Petersen directed films together ...,5
1,For fans of the series and the movies\r\nthis ...,4
2,"I love the movie. The Blu-ray was fine, but it...",3
3,You don't know what is going on until the end ...,3
4,"We only watched a few minutes of the movie, du...",1


## 2. Preprocessing

### A. Text data processing recap

In [19]:
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hill6\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hill6\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

def lower(s):
    """
    :param s: a string.
    return a string with lower characters
    Note that we allow the input to be nested string of a list.
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: 'text mining is to identify useful information.'
    """
    if isinstance(s, list):
        return [lower(t) for t in s]
    if isinstance(s, str):
        return s.lower()
    else:
        raise NotImplementedError("unknown datatype")


def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)


def stem(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of stemmed words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     results.append(ps.stem(token))
    # return results

    return [ps.stem(token) for token in tokens]

def n_gram(tokens, n=1):
    """
    :param tokens: a list of tokens, type: list
    :param n: the corresponding n-gram, type: int
    return a list of n-gram tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    if n == 1:
        return tokens
    else:
        results = list()
        for i in range(len(tokens)-n+1):
            # tokens[i:i+n] will return a sublist from i th to i+n th (i+n th is not included)
            results.append(" ".join(tokens[i:i+n]))
        return results

def filter_stopwords(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     if token not in stopwords and not token.isnumeric():
    #         results.append(token)
    # return results

    return [token for token in tokens if token not in stopwords and not token.isnumeric()]

import numpy as np

def get_onehot_vector(feats, feats_dict):
    """
    :param data: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(len(feats_dict), dtype=np.float)
    for f in feats:
        # get the feature index, return -1 if the feature is not existed
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            # set the corresponding element as 1
            vector[f_idx] = 1
    return vector

def lemmatize(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of lemmatized words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return [lemmatizer.lemmatize(token) for token in tokens]

## 2. Training

### TF-IDF+ SVD + LR

In [21]:
!pip install -U scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

Collecting scikit-learn
  Downloading scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl (10.6 MB)
     --------------------------------------- 10.6/10.6 MB 29.7 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached threadpoolctl-3.4.0-py3-none-any.whl (17 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.4.1.post1 threadpoolctl-3.4.0


In [22]:
x_train = train_df['text']
y_train = train_df['label']
x_valid = valid_df['text']
y_valid = valid_df['label']

In [23]:
from sklearn.decomposition import TruncatedSVD
tfidf = TfidfVectorizer(tokenizer=tokenize)
lr = LogisticRegression(tol=5e-3,max_iter=1000)
svd = TruncatedSVD(n_components=500)
steps = [('tfidf', tfidf),('Truncated SVD',svd),('lr', lr)]
pipe = Pipeline(steps)
print(pipe)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function tokenize at 0x0000020364BF2480>)),
                ('Truncated SVD', TruncatedSVD(n_components=500)),
                ('lr', LogisticRegression(max_iter=1000, tol=0.005))])


In [24]:
pipe.fit(x_train, y_train)



In [25]:
y_pred = pipe.predict(x_valid)

print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))
results = pd.DataFrame({'id': valid_df['id'], 'text': valid_df['text'], 'label': y_pred})
results.to_csv('data/valid_pred.csv', index=False)
test_pred = pipe.predict(test_df['text'])
test_results = pd.DataFrame({'id': test_df['id'], 'text': test_df['text'], 'label': test_pred})
test_results.to_csv('pred.csv', index=False)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           1       0.58      0.37      0.45       295
           2       0.00      0.00      0.00       198
           3       0.42      0.58      0.49       508
           4       0.37      0.41      0.39       523
           5       0.54      0.60      0.57       476

    accuracy                           0.45      2000
   macro avg       0.38      0.39      0.38      2000
weighted avg       0.42      0.45      0.43      2000




[[109   0  97  51  38]
 [ 38   0 105  38  17]
 [ 12   0 295 157  44]
 [ 15   0 149 217 142]
 [ 15   0  50 125 286]]
accuracy 0.4535
                     id                                               text
0   A3EMGD8RAEOK64_2907  On our trip this past summer to Lunenberg, Nov...
1   A2BOWU2PX28BET_5501  Excellent!! Most remakes fall short of the ori...
2  A100WO06OQR8BQ_10469  I started to watch this movie but it is such a...
3  A2H4LKU7CPIUU9_11364  Well! I must be terribly jaded. Or I am comple...
4