## 1. Loading data and saving predictions

In [1]:
import pandas as pd

### A. Loading data

In [2]:
def load_data(split_name='train', columns=['text', 'label'], folder='data'):
    '''
        "split_name" may be set as 'train', 'valid' or 'test' to load the corresponding dataset.
        
        You may also specify the column names to load any columns in the .csv data file.
        Among many, "text" can be used as model input, and "label" column is the labels (sentiment). 
    '''
    try:
        print(f"select [{', '.join(columns)}] columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        df = df.loc[:,columns]
        print("Success")
        return df
    except:
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        return df

Then you can extract the data by specifying the desired split and columns

In [5]:
train_df = load_data('train', columns=['text', 'label'], folder='data')
valid_df = load_data('valid', columns=['id','text', 'label'], folder='data') ### id column is added for evaluate.py
test_df = load_data('test_no_label', columns=['id', 'text'], folder='data')

select [text, label] columns from the train split
Success
select [id, text, label] columns from the valid split
Success
select [id, text] columns from the test_no_label split
Success


In [6]:
train_df.head() #### Verify the data is loaded correctly

Unnamed: 0,text,label
0,Two Wolfgang Petersen directed films together ...,5
1,For fans of the series and the movies\r\nthis ...,4
2,"I love the movie. The Blu-ray was fine, but it...",3
3,You don't know what is going on until the end ...,3
4,"We only watched a few minutes of the movie, du...",1


## 2. Preprocessing

### A. Text data processing recap

In [7]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hill6\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hill6\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

def lower(s):
    """
    :param s: a string.
    return a string with lower characters
    Note that we allow the input to be nested string of a list.
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: 'text mining is to identify useful information.'
    """
    if isinstance(s, list):
        return [lower(t) for t in s]
    if isinstance(s, str):
        return s.lower()
    else:
        raise NotImplementedError("unknown datatype")


def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)


def stem(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of stemmed words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     results.append(ps.stem(token))
    # return results

    return [ps.stem(token) for token in tokens]

def n_gram(tokens, n=1):
    """
    :param tokens: a list of tokens, type: list
    :param n: the corresponding n-gram, type: int
    return a list of n-gram tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    if n == 1:
        return tokens
    else:
        results = list()
        for i in range(len(tokens)-n+1):
            # tokens[i:i+n] will return a sublist from i th to i+n th (i+n th is not included)
            results.append(" ".join(tokens[i:i+n]))
        return results

def filter_stopwords(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     if token not in stopwords and not token.isnumeric():
    #         results.append(token)
    # return results

    return [token for token in tokens if token not in stopwords and not token.isnumeric()]

import numpy as np

def get_onehot_vector(feats, feats_dict):
    """
    :param data: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(len(feats_dict), dtype=np.float)
    for f in feats:
        # get the feature index, return -1 if the feature is not existed
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            # set the corresponding element as 1
            vector[f_idx] = 1
    return vector

Note that you can use the `map` function to apply your preprocessing functions into the dataframe.

In [13]:
for i in range(len(test_df)):
    try:
        tokenize(test_df.loc[i, 'text'])
    except: 
        print(i)

In [14]:
print(test_df.loc[1155])

id                                     A8NQVLIE0QVT4_7949
text    Great movie, even better dubb. Blu ray is the ...
Name: 1155, dtype: object


In [15]:
test_df['tokens'] = test_df['text'].map(tokenize).map(filter_stopwords).map(lower)
print(test_df['tokens'].head().to_string())

0    [on, trip, past, summer, lunenberg, ,, nova, s...
1    [excellent, !, !, most, remakes, fall, short, ...
2    [i, started, watch, movie, lousy, movie, i, st...
3    [well, !, i, must, terribly, jaded, ., or, i, ...
4    [dark, grim, --, fun, movie, ., watch, perform...


Besides `nltk`, `SpaCy` may also be useful.

You can explore it at https://spacy.io/

Let's install it with the following command (in terminal)

```bash
python -m pip install spacy
python -m spacy download en_core_web_sm
```

For more usage of SpaCy, you can refer to its documentation at this link: https://spacy.io/usage

## 2. Baselines

Finally, we provide two example baselines for your reference. The first baseline extracts TF-iDF features from texts and use logistic regression to generate prediction. The second baseline uses Convolutional Neural Networks (CNNs) to generate prediction from texts.


We only consider its first 3k training samples. It is just an example, you can use the data as you like.

### TF-IDF + LR

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

In [39]:
train_df = load_data('train')[:]
valid_df = load_data('valid')
x_train = train_df['text']
y_train = train_df['label']
x_valid = valid_df['text']
y_valid = valid_df['label']

select [text, label] columns from the train split
Success
select [text, label] columns from the valid split
Success


In [57]:
from sklearn.decomposition import TruncatedSVD
tfidf = TfidfVectorizer(tokenizer=tokenize)
lr = LogisticRegression(tol=5e-3,max_iter=1000)
svd = TruncatedSVD(n_components=500)
steps = [('tfidf', tfidf),('Truncated SVD',svd),('lr', lr)]
pipe = Pipeline(steps)
print(pipe)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function tokenize at 0x000002807D19FEC0>)),
                ('Truncated SVD', TruncatedSVD(n_components=500)),
                ('lr', LogisticRegression(max_iter=1000, tol=0.005))])


In [58]:
pipe.fit(x_train, y_train)



In [59]:
y_pred = pipe.predict(x_valid)
print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))

              precision    recall  f1-score   support

           1       0.58      0.54      0.56       295
           2       0.41      0.14      0.21       198
           3       0.47      0.58      0.52       508
           4       0.46      0.41      0.43       523
           5       0.58      0.68      0.63       476

    accuracy                           0.51      2000
   macro avg       0.50      0.47      0.47      2000
weighted avg       0.50      0.51      0.50      2000




[[160  18  66  24  27]
 [ 55  28  85  21   9]
 [ 28  19 297 117  47]
 [ 18   0 141 213 151]
 [ 15   4  41  90 326]]
accuracy 0.512


### CNN

The second baseline is a CNN model implemented with PyTorch.

First, use the following command to install pytorch (in terminal).

```bash
pip install torch
```

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import tqdm

In [None]:
train_text = train_df['text'].map(tokenize).map(filter_stopwords).map(stem)
valid_text = valid_df['text'].map(tokenize).map(filter_stopwords).map(stem)

In [None]:
word2id = {}
for tokens in train_text:
    for t in tokens:
        if not t in word2id:
            word2id[t] = len(word2id)
word2id['<pad>'] = len(word2id)

In [None]:
def texts_to_id_seq(texts, padding_length=50):
    records = []
    for tokens in texts:
        record = []
        for t in tokens:
            record.append(word2id.get(t, len(word2id)))
        if len(record) >= padding_length:
            records.append(record[:padding_length])
        else:
            records.append(record + [word2id['<pad>']] * (padding_length - len(record)))
    return records

In [None]:
train_seqs = texts_to_id_seq(train_text)

In [None]:
valid_seqs = texts_to_id_seq(valid_text)

In [None]:
class MyDataset(Dataset):
    
    def __init__(self, seq, y):
        assert len(seq) == len(y)
        self.seq = seq
        self.y = y-1
    
    def __getitem__(self, idx):
        return np.asarray(self.seq[idx]), self.y[idx]

    def __len__(self):
        return len(self.seq)

In [None]:
batch_size = 16

train_loader = DataLoader(MyDataset(train_seqs, y_train), batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(MyDataset(valid_seqs, y_valid), batch_size=batch_size)

In [None]:
class mlp(nn.Module):
    def __init__(self):
        super(mlp, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=len(word2id)+1, embedding_dim=64)
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=64,
                      out_channels=64,
                      kernel_size=3,
                      stride=1),
            nn.MaxPool1d(kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=64,
                      out_channels=64,
                      kernel_size=3,
                      stride=1),
            nn.MaxPool1d(kernel_size=3, stride=1),
            nn.Dropout(0.5)
        )
        self.linear = nn.Linear(64, 5)
    
    def forward(self, x):
        x = self.embedding(x)
        x = torch.transpose(x, 1, 2)
        x = self.cnn(x)
        x = torch.max(x, dim=-1)[0]
        x = self.linear(x)
        return x

In [None]:
model = mlp()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

In [None]:
for e in range(1, 11):    
    print('epoch', e)
    model.train()
    total_acc = 0
    total_loss = 0
    total_count = 0
    with tqdm.tqdm(train_loader) as t:
        for x, y in t:
            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            total_acc += (logits.argmax(1) == y).sum().item()
            total_count += y.size(0)
            total_loss += loss.item()
            optimizer.step()
            t.set_postfix({'loss': total_loss/total_count, 'acc': total_acc/total_count})

    model.eval()
    y_pred = []
    y_true = []
    with tqdm.tqdm(valid_loader) as t:
        for x, y in t:
            logits = model(x)
            total_acc += (logits.argmax(1) == y).sum().item()
            total_count += len(y)
            y_pred += logits.argmax(1).tolist()
            y_true += y.tolist()
    print(classification_report(y_true, y_pred))
    print("\n\n")
    print(confusion_matrix(y_true, y_pred))

epoch 1


100%|███████████████████████████████████████████████████████| 188/188 [00:01<00:00, 123.20it/s, loss=0.0982, acc=0.281]
100%|███████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 529.53it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.12      0.01      0.01       295
           1       0.00      0.00      0.00       198
           2       0.33      0.25      0.28       508
           3       0.28      0.02      0.04       523
           4       0.26      0.87      0.40       476

    accuracy                           0.28      2000
   macro avg       0.20      0.23      0.15      2000
weighted avg       0.24      0.28      0.18      2000




[[  2   0  67   7 219]
 [  1   0  50   0 147]
 [  5   0 126  16 361]
 [  3   0  87  10 423]
 [  6   0  55   3 412]]
epoch 2


100%|███████████████████████████████████████████████████████| 188/188 [00:01<00:00, 141.25it/s, loss=0.0894, acc=0.392]
100%|███████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 534.70it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.36      0.15      0.21       295
           1       0.00      0.00      0.00       198
           2       0.31      0.35      0.33       508
           3       0.30      0.56      0.39       523
           4       0.47      0.33      0.39       476

    accuracy                           0.34      2000
   macro avg       0.29      0.28      0.26      2000
weighted avg       0.32      0.34      0.31      2000




[[ 45   0 116 104  30]
 [ 15   0  89  75  19]
 [ 30   0 177 263  38]
 [ 22   0 120 294  87]
 [ 14   0  76 230 156]]
epoch 3


100%|███████████████████████████████████████████████████████| 188/188 [00:01<00:00, 138.02it/s, loss=0.0781, acc=0.495]
100%|███████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 532.71it/s]


              precision    recall  f1-score   support

           0       0.39      0.15      0.22       295
           1       0.00      0.00      0.00       198
           2       0.29      0.79      0.42       508
           3       0.34      0.15      0.21       523
           4       0.57      0.31      0.40       476

    accuracy                           0.34      2000
   macro avg       0.32      0.28      0.25      2000
weighted avg       0.36      0.34      0.29      2000




[[ 45   0 227  14   9]
 [ 15   0 169   9   5]
 [ 19   0 402  63  24]
 [ 13   0 355  80  75]
 [ 22   1 233  71 149]]
epoch 4


100%|███████████████████████████████████████████████████████| 188/188 [00:01<00:00, 137.92it/s, loss=0.0649, acc=0.595]
100%|███████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 534.70it/s]


              precision    recall  f1-score   support

           0       0.28      0.45      0.35       295
           1       0.23      0.12      0.15       198
           2       0.33      0.32      0.33       508
           3       0.29      0.19      0.23       523
           4       0.42      0.51      0.46       476

    accuracy                           0.33      2000
   macro avg       0.31      0.32      0.30      2000
weighted avg       0.32      0.33      0.32      2000




[[134  26  73  27  35]
 [ 70  23  57  21  27]
 [122  32 165  97  92]
 [ 91  14 133  99 186]
 [ 60   5  72  94 245]]
epoch 5


100%|███████████████████████████████████████████████████████| 188/188 [00:01<00:00, 141.22it/s, loss=0.0493, acc=0.718]
100%|███████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 535.99it/s]


              precision    recall  f1-score   support

           0       0.31      0.34      0.33       295
           1       0.21      0.26      0.23       198
           2       0.33      0.31      0.32       508
           3       0.34      0.39      0.36       523
           4       0.50      0.38      0.43       476

    accuracy                           0.35      2000
   macro avg       0.34      0.34      0.33      2000
weighted avg       0.36      0.35      0.35      2000




[[100  58  62  51  24]
 [ 44  52  62  27  13]
 [ 79  72 158 157  42]
 [ 53  48 116 202 104]
 [ 42  17  76 160 181]]
epoch 6


100%|███████████████████████████████████████████████████████| 188/188 [00:01<00:00, 132.51it/s, loss=0.0345, acc=0.818]
100%|███████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 537.19it/s]


              precision    recall  f1-score   support

           0       0.32      0.32      0.32       295
           1       0.30      0.09      0.13       198
           2       0.32      0.54      0.40       508
           3       0.31      0.11      0.17       523
           4       0.42      0.52      0.47       476

    accuracy                           0.35      2000
   macro avg       0.33      0.32      0.30      2000
weighted avg       0.34      0.35      0.32      2000




[[ 95  16 130  10  44]
 [ 44  17 102   6  29]
 [ 71  13 276  56  92]
 [ 49   7 236  60 171]
 [ 38   3 127  61 247]]
epoch 7


100%|███████████████████████████████████████████████████████| 188/188 [00:01<00:00, 141.04it/s, loss=0.0227, acc=0.894]
100%|███████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 534.75it/s]


              precision    recall  f1-score   support

           0       0.31      0.35      0.33       295
           1       0.20      0.07      0.10       198
           2       0.32      0.49      0.39       508
           3       0.32      0.20      0.25       523
           4       0.45      0.47      0.46       476

    accuracy                           0.35      2000
   macro avg       0.32      0.32      0.31      2000
weighted avg       0.34      0.35      0.33      2000




[[103  20 115  23  34]
 [ 53  14  92  14  25]
 [ 75  23 250  84  76]
 [ 59  10 208 107 139]
 [ 39   4 109 102 222]]
epoch 8


100%|███████████████████████████████████████████████████████| 188/188 [00:01<00:00, 144.16it/s, loss=0.0131, acc=0.954]
100%|███████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 531.80it/s]


              precision    recall  f1-score   support

           0       0.33      0.32      0.32       295
           1       0.26      0.11      0.16       198
           2       0.33      0.39      0.36       508
           3       0.33      0.44      0.38       523
           4       0.52      0.35      0.42       476

    accuracy                           0.36      2000
   macro avg       0.35      0.32      0.33      2000
weighted avg       0.37      0.36      0.35      2000




[[ 93  24  96  61  21]
 [ 45  22  76  40  15]
 [ 65  21 200 184  38]
 [ 47  12 150 230  84]
 [ 33   5  85 185 168]]
epoch 9


100%|███████████████████████████████████████████████████████| 188/188 [00:01<00:00, 135.59it/s, loss=0.0089, acc=0.967]
100%|███████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 538.82it/s]


              precision    recall  f1-score   support

           0       0.34      0.28      0.31       295
           1       0.21      0.18      0.20       198
           2       0.31      0.56      0.40       508
           3       0.34      0.27      0.30       523
           4       0.59      0.30      0.40       476

    accuracy                           0.35      2000
   macro avg       0.36      0.32      0.32      2000
weighted avg       0.38      0.35      0.34      2000




[[ 83  40 137  22  13]
 [ 31  36 109  15   7]
 [ 54  50 287  98  19]
 [ 39  32 247 143  62]
 [ 38  13 142 138 145]]
epoch 10


100%|██████████████████████████████████████████████████████| 188/188 [00:01<00:00, 139.29it/s, loss=0.00522, acc=0.987]
100%|███████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 536.93it/s]

              precision    recall  f1-score   support

           0       0.27      0.46      0.34       295
           1       0.23      0.18      0.20       198
           2       0.32      0.24      0.28       508
           3       0.32      0.34      0.33       523
           4       0.46      0.39      0.42       476

    accuracy                           0.33      2000
   macro avg       0.32      0.32      0.31      2000
weighted avg       0.34      0.33      0.33      2000




[[135  37  54  42  27]
 [ 72  36  45  28  17]
 [131  49 123 149  56]
 [ 97  29 100 177 120]
 [ 68   9  60 152 187]]





Deep learning are full of tricks. 

In the second example above, the CNN baseline is even not good enough to beat the TFIDF+Logistic regression baseline.

You can use all the techniques introduced in the lectures and tutorials to enhance your methods.

Of course, you can try any other ideas to make your model distinguished.

Also, if you want to use pre-trained models. here are some reference content:
1. https://huggingface.co/docs/transformers/tasks/sequence_classification
2. https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification