<h3>Imports

In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
import statistics
import random
import re
import spacy
import typing
from typing import List, Tuple, Dict
from transformers import CanineTokenizer, CanineModel, CanineForSequenceClassification
from datasets import Dataset, DatasetDict
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
spacy.__version__

'3.7.5'

<h3>Work directory

In [25]:
# directory path
directory = '/Users/egemenipek/projects/aylien-science-challenge-master'

# Check if the directory exists
if os.path.exists(directory):
    # Change the current working directory to the specified directory
    os.chdir(directory)
    print(f"Changed working directory to '{directory}'")
else:
    print(f"Directory '{directory}' does not exist")

Changed working directory to '/Users/egemenipek/projects/aylien-science-challenge-master'


<h1>Prepare Training Data

In [26]:
# this function reads a .txt file
# it is meant to work with on a document-per-line basis as in the sample data files provided
def read_data(path: str) -> List[str]:
    with open(path, 'r') as f:
        text = f.read()
        documents = text.split('\n')
        return documents

In [29]:
positive = read_data("data/positive")
neutral = read_data('data/neutral')
negative = read_data('data/negative')

In [30]:
# observing data distribution
print(f'Positive Examples:{len(positive)}\nNeutral Examples:{len(neutral)}\nNegative Examples: {len(negative)}')

Positive Examples:2364
Neutral Examples:3100
Negative Examples: 9179


There is significant class imbalance towards Negative examples. Neutral examples are, although not as significant as the class Negative, still have significantly more examples than the positive class.
<p> It makes sense here to over-sample less represented classes and under-sample the more represented class into the standard deviation value of n_examples across three of the classes.

In [31]:
n_examples = [len(positive), len(neutral), len(negative)]
std_dev = int(statistics.stdev(n_examples)) #std stands for standard deviation I'm not trying to tell a story here :D

In [32]:
def over_sample(target_n_examples: int, data: List[str]) -> List[str]:
    current_n_examples = len(data) #the number of examples in the dataset
    required_n_examples = target_n_examples - current_n_examples #required number of data examples required to oversample to reach standard deviation
    random_idxs = [random.randint(0, current_n_examples-1) for i in range(required_n_examples)] #get random indicies for oversampling
    required_examples = [data[i] for i in random_idxs] # the list of oversample data to be merged into the existing data
    oversampled_data = data + required_examples
    return oversampled_data


In [33]:
def under_sample(target_n_examples: int,  data: List[str]) -> List[str]:
    current_n_examples = len(data) #the number of examples in the data
    random_idxs = [random.randint(0, current_n_examples-1) for i in range(target_n_examples)] #get random indecies to include in the data
    undersampled_data = [data[i] for i in random_idxs] #make new data with the random indicies
    return undersampled_data


Apply oversampling and undersampling to the classes Positive, Neutral, and Negative

In [34]:
positive_std = over_sample(std_dev, positive)
neutral_std = over_sample(std_dev, neutral)
negative_std = under_sample(std_dev, negative)

In [35]:
# observing data distribution
print(f'Positive Examples:{len(positive_std)}\nNeutral Examples:{len(neutral_std)}\nNegative Examples: {len(negative_std)}')

Positive Examples:3740
Neutral Examples:3740
Negative Examples: 3740


<h3>Text Preprocessing

<p>Preprocessing is going to be essential since the model is character-level

Pre-processing strats:
* Numbers out -done
* Stopwords out -done
* Punctuation out -done
* URLs need to go -done
* Userhandles out - done
* Lemmatize the tokens -done
* Remove named entities - done

In [36]:
# use spacy to remove stopwords
nlp = spacy.load('en_core_web_sm')

In [37]:
# this function removes numbers from a list of strings
def remove_numbers(data: List[str]) -> List[str]:
    clean_data = []
    pattern = r'\d+'
    for document in data:
        document = re.sub(pattern, '', document)
        clean_data.append(document)
    return clean_data

In [38]:
# this function removes handle content from documents example: @someuser
def remove_handle_content(data: List[str]) -> List[str]:
    clean_data = []
    for document in data:
        document = ' '.join([token for token in document.split() if '@' not in token])
        clean_data.append(document)
    return clean_data

In [39]:
# this function removes url content from documents example: http//:someurl
def remove_url_content(data: List[str]) -> List[str]:
    clean_data = []
    for document in data:
        document = ' '.join([token for token in document.split() if token[:4]!='http'])
        clean_data.append(document)
    return clean_data

In [40]:
# this function removes stopwords and punctuation from documents example: the !, etc.
def remove_stopwords_punctuation(data: List[str]) -> List[str]:
    clean_data = []
    for document in data:
        doc = nlp(document)
        document = ' '.join([token.text for token in doc if not token.is_stop and not token.is_punct])
        clean_data.append(document)
    return clean_data

In [41]:
# this function removed named entities form the example
def remove_entities(data: List[str]) -> List[str]:
    clean_data = []
    for document in data:
        doc = nlp(document)
        document = ' '.join([token.text for token in doc if not token.ent_type_])
        clean_data.append(document)
    return clean_data

In [42]:
# this function lemmatizes tokens in the documents inside the data example: going -> go # this also lowercases letters
# this function also removes duplicate tokens
def lemmatize_documents(data: List[str]) -> List[str]:
    clean_data = []
    for document in data:
        doc = nlp(document)
        document = ' '.join([token.lemma_.lower() for token in doc])
        clean_data.append(document)
    return clean_data

In [43]:
# this function filters tokens based on designated POS tags I decided not to use it
def filter_pos(data: List[str]) -> List[str]:
    pos_tags_to_keep = ["ADJ", "ADV", "INTJ", "VERB"]
    clean_data = []
    for document in data:
        doc = nlp(document)
        document = ' '.join([token.text for token in doc if token.pos_ in pos_tags_to_keep])
        clean_data.append(document)
    return clean_data

Apply text preprocessing functions to existing data

In [44]:
positive_clean = lemmatize_documents(remove_entities(remove_stopwords_punctuation(remove_url_content(remove_handle_content(remove_numbers(positive_std))))))
neutral_clean = lemmatize_documents(remove_entities(remove_stopwords_punctuation(remove_url_content(remove_handle_content(remove_numbers(neutral_std))))))
negative_clean = lemmatize_documents(remove_entities(remove_stopwords_punctuation(remove_url_content(remove_handle_content(remove_numbers(negative_std))))))

In [45]:
idx = 230
print('Positive:')
print(positive_std[idx],'\n', positive_clean[idx])
print('Neutral:')
print(neutral_std[idx],'\n', neutral_clean[idx])
print('Negative:')
print(negative_std[idx],'\n', negative_clean[idx])

Positive:
@united that's brilliant Thankyou so much. Is it classed as part of carryon? 
 brilliant classed carryon
Neutral:
@united DM sent 
 dm send
Negative:
@USAirways stuck on the ramp at DCA, US Air computer system crashed...everywhere. 
 stick ramp computer system crash


<h3>Populate dataset

In [46]:
# this function takes in a list of examples and pairs it with a label integer to make a tuple(str, int)
def add_label(data: List[str], label: int) -> Tuple[str, int]:
    text_label_pair = []
    for example in data:
        example = (example, label)
        text_label_pair.append(example)
    return text_label_pair

In [47]:
# put data together into a single object
positive_data = add_label(positive_clean, 0)
neutral_data = add_label(neutral_clean, 1)
negative_data = add_label(negative_clean, 2)
dataset = positive_data + neutral_data + negative_data

In [48]:
print(len(positive_data), len(neutral_data), len(negative_data))
print(len(positive_data) + len(neutral_data) + len(negative_data), len(dataset))
print(dataset[8000])

3740 3740 3740
11220 11220
('try book flight amp site 😁', 2)


In [49]:
# make a dataframe for the train_test_split function from sklearn and to save the dataset
dataset_df = pd.DataFrame(dataset, columns=['text', 'label'])
dataset_df.head(2)

Unnamed: 0,text,label
0,plus add commercial experience tacky,0
1,yes nearly time fly ear worm will away,0


In [50]:
# load data from the dataframe into the huggingface dataset object
ds = Dataset.from_pandas(dataset_df)
ds

Dataset({
    features: ['text', 'label'],
    num_rows: 11220
})

<h1>Create Sentiment Classifier

We don't have data to pre-train a transformer encoder stack. Therefore we only can opt for a pre-trained model.
<p>I've found this character-based transformer from Google named Canine-s, here's the model card: https://huggingface.co/google/canine-s

<p>We will add a linear layer (the classifier head) to the base model and fine-tune it with our data.

<h3>Google Canine-s

In [28]:
# this model creates 2048 dimensional tensors
tokenizer = CanineTokenizer.from_pretrained('google/canine-s') #load it's tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [29]:
# this is a tokenizer function, this is how huggingface recommends tokenizing examples
def tokenizer_fn(example):
    return tokenizer(example['text'], padding='max_length', truncation=True)

In [30]:
# recommended implementation of the tokenizer function
ds_tokenized = ds.map(tokenizer_fn)

Map:   0%|          | 0/11220 [00:00<?, ? examples/s]

In [31]:
#observe new columns
print(ds_tokenized.column_names)

['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']


In [32]:
# need to take the 'text' column out because the model will give an error otherwise. It doesn't expenct this column
# it is very important that the column for labels is named 'labels' for the same reason
ds_tokenized = ds_tokenized.remove_columns(['text'])
ds_tokenized = ds_tokenized.rename_column('label', 'labels')
ds_tokenized.set_format('torch') #set tensor format to torch because we'll be using torch in this project
print(ds_tokenized.column_names)

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [33]:
# split the dataset to train, val, test
# first we split the test and train+val subsets and then we split train+val into train and val subsets
ds_traintest = ds_tokenized.train_test_split(test_size=0.1)
ds_trainval = ds_traintest['train'].train_test_split(test_size=0.2)
ds_train = ds_trainval['train']
ds_val = ds_trainval['test']
ds_test = ds_traintest['test']  

In [34]:
# observe datasets
print(ds_train, ds_val, ds_test)

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8078
}) Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2020
}) Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1122
})


In [54]:
# make dataloaders from our datasets
train_dataloader = DataLoader(ds_train, batch_size=4, shuffle=True)
val_dataloader = DataLoader(ds_val, batch_size=4, shuffle=True)
test_dataloader = DataLoader(ds_test, batch_size=4, shuffle=True)

In [37]:
sentiment_classifier = CanineForSequenceClassification.from_pretrained('google/canine-s', num_labels=3)

Some weights of CanineForSequenceClassification were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# initialize the model and make general settings
#sentiment_classifier = Classifier()
num_epochs = 1
num_iterations = num_epochs * ds_train.num_rows
optimizer = optim.AdamW(sentiment_classifier.parameters(), lr=2e-5)
device = ('cuda' if torch.cuda.is_available() else 'cpu')
sentiment_classifier.to(device)
loss_fn = nn.CrossEntropyLoss()
print(device)

cuda


In [39]:
softmax = nn.Softmax(dim=1)

In [47]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_iterations))

for epoch in range(num_epochs):
    sentiment_classifier.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        batch_labels = batch['labels'].to(device)
        outputs = sentiment_classifier(**batch)
        logits = outputs.logits
        #print(logits)
        loss = loss_fn(logits, batch_labels)
        loss.backward()

        optimizer.step()

        optimizer.zero_grad()   
        progress_bar.update(1)

    sentiment_classifier.eval()
    y_preds = []
    ys = []
    for batch in val_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        batch_labels = batch['labels']
        with torch.no_grad():
            outputs = sentiment_classifier(**batch)
            logits = outputs.logits
            #print(logits)
            #print(logits.shape)
            #print(logits.device)
            #print(logits.dtype)
            batch_probs = softmax(logits)
            batch_preds = torch.argmax(batch_probs, dim=1)
            #print(batch_preds)
        for pred in batch_preds:
            y_preds.append(int(pred.item()))
        for label in batch_labels:
            ys.append(int(label.item()))

    accuracy = accuracy_score(y_preds, ys)
    precision = precision_score(y_preds, ys, average='macro')
    recall = recall_score(y_preds, ys, average='macro')
    f1_macro = f1_score(y_preds, ys, average='macro')
    f1_micro = f1_score(y_preds, ys, average='micro')
    print(f'Epoch {epoch+1}: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Macro: {f1_macro}, F1 Micro: {f1_micro}')



  0%|          | 0/8078 [00:00<?, ?it/s]

Epoch 1: Accuracy: 0.9535776182223322, Precision: 0.9536551972199313, Recall: 0.9537904891681773, F1 Macro: 0.9535690954592821, F1 Micro: 0.9535776182223322


In [68]:
# testing
model.to(device)
model.eval()
for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        batch_labels = batch['labels']
        with torch.no_grad():
            outputs = model(**batch)
            logits = outputs.logits
            #print(logits)
            #print(logits.shape)
            #print(logits.device)
            #print(logits.dtype)
            batch_probs = softmax(logits)
            batch_preds = torch.argmax(batch_probs, dim=1)
            #print(batch_preds)
        for pred in batch_preds:
            y_preds.append(int(pred.item()))
        for label in batch_labels:
            ys.append(int(label.item()))

accuracy = accuracy_score(y_preds, ys)
precision = precision_score(y_preds, ys, average='macro')
recall = recall_score(y_preds, ys, average='macro')
f1_macro = f1_score(y_preds, ys, average='macro')
f1_micro = f1_score(y_preds, ys, average='micro')
print(f'Epoch {epoch+1}: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Macro: {f1_macro}, F1 Micro: {f1_micro}')

Epoch 1: Accuracy: 0.9159647404505387, Precision: 0.915980373907115, Recall: 0.9163266794115629, F1 Macro: 0.9158494095465145, F1 Micro: 0.9159647404505387


In [65]:
sentiment_classifier.save_pretrained("canine-s-classifier/huggingface/sentiment_classifier")
tokenizer.save_pretrained('canine-s-classifier/huggingface/sentiment_classifier/tokenizer')

In [54]:
model = CanineForSequenceClassification.from_pretrained("sentiment_classifier/model")
type(model)

transformers.models.canine.modeling_canine.CanineForSequenceClassification