In [1]:
%%capture
!pip install pytorch-lightning transformers lightning

In [2]:
import pandas as pd
import numpy as np
import importlib

In [3]:
!git clone https://github.com/jackboyla/sentiment-analysis.git
import os
os.chdir('/kaggle/working/sentiment-analysis')

Cloning into 'sentiment-analysis'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 65 (delta 28), reused 55 (delta 21), pack-reused 0[K
Receiving objects: 100% (65/65), 42.89 KiB | 3.30 MiB/s, done.
Resolving deltas: 100% (28/28), done.


In [4]:

DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
df = pd.read_csv(
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding=DATASET_ENCODING,
    names=DATASET_COLUMNS
    )

In [5]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
print("Dataset size:", len(df))

Dataset size: 1600000


In [7]:
df['target'].unique()

array([0, 4])

### Preprocessing

In [8]:
decode_map = {0: 0, 4: 1}
def binarize_sentiment(label):
    return decode_map[int(label)]

df['target'] = df.target.apply(lambda x: binarize_sentiment(x))
df['target'].unique()

array([0, 1])

In [9]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, 
                                     train_size=0.8, 
                                     test_size=0.2, 
                                     random_state=42, stratify=df['target'])
train_df, val_df = train_test_split(train_df, 
                                    test_size=0.3, 
                                    random_state=42, stratify=train_df['target'])

train_df.reset_index(inplace=True, drop=True)
val_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

In [10]:
import importlib
from torch.utils.data import DataLoader
import src.dataflow as dataflow
import src.model as models

import pytorch_lightning as L

In [11]:

importlib.reload(dataflow)
importlib.reload(models)

from transformers import CanineTokenizer

# single sequence: [CLS] X [SEP]
tokenizer = CanineTokenizer.from_pretrained("google/canine-c", 
                                            return_tensors='pt')

# Build Datasets
train_dataset = dataflow.TweetDataset(train_df['text'], train_df['target'], tokenizer)
val_dataset = dataflow.TweetDataset(val_df['text'], val_df['target'], tokenizer)
test_dataset = dataflow.TweetDataset(test_df['text'], test_df['target'], tokenizer)


# Build DataLoaders
BATCH_SIZE = 32
SHUFFLE = False
train_dataloader = DataLoader(train_dataset, 
                              batch_size=BATCH_SIZE, 
                              shuffle=SHUFFLE, 
                              collate_fn=lambda b: dataflow.collate_fn(b, input_pad_token_id=tokenizer.pad_token_id)
                              )
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE, collate_fn=lambda b: dataflow.collate_fn(b, input_pad_token_id=tokenizer.pad_token_id))
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE, collate_fn=lambda b: dataflow.collate_fn(b, input_pad_token_id=tokenizer.pad_token_id))

# sample, labels = next(iter(train_dataloader))
# print(sample)
# print(labels)
# sample['input_ids'].shape

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/657 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


In [12]:
importlib.reload(dataflow)
importlib.reload(models)
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, ModelSummary, DeviceStatsMonitor
from lightning.pytorch.profilers import AdvancedProfiler

print(f"Training on {len(train_df)} examples...")
NUM_EPOCHS = 100

early_stop = EarlyStopping('val_loss', patience=5, verbose=True, min_delta=0.05)
checkpoint_callback = ModelCheckpoint(save_top_k=2, monitor="val_loss")

trainer = L.Trainer(max_epochs=NUM_EPOCHS, 
                    accelerator = 'gpu',
                    profiler = 'simple',
                    callbacks=[ early_stop, checkpoint_callback, DeviceStatsMonitor()])

classifier = models.SentimentClassifier(tokenizer=tokenizer, freeze_encoder=False) # True
trainer.fit(classifier, train_dataloader, val_dataloader)

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


Training on 896000 examples...


Downloading (…)lve/main/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/529M [00:00<?, ?B/s]

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [13]:
trainer.test(classifier, test_dataloader)

Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.4341098666191101}]

In [14]:
importlib.reload(dataflow)
importlib.reload(models)

# load checkpoint
checkpoint = checkpoint_callback.best_model_path 
m = models.SentimentClassifier.load_from_checkpoint(checkpoint, tokenizer=tokenizer)

# choose your trained nn.Module
m.eval()

text = "@elonmusk is so empathetic and level-headed!!"
print(f"{text} \n-> {tokenizer.decode(tokenizer(text)['input_ids'])} \n-> Sentiment: {m(text)}")

text = "sheep are really dumb"
print(f"{text} \n-> {tokenizer.decode(tokenizer(text)['input_ids'])} \n-> Sentiment: {m(text)}")

@elonmusk is so empathetic and level-headed!! 
-> [CLS]@elonmusk is so empathetic and level-headed!![SEP] 
-> Sentiment: [1]
sheep are really dumb 
-> [CLS]sheep are really dumb[SEP] 
-> Sentiment: [0]


In [15]:
for tweet in df['text'][-50:]:
    print(f"{tweet} : PRED -> {m(tweet)}")

OMG how good is ben and jerrys cookie dough icecream...come one really well goood lol, justwaitin to have a BBQ hope its stays like this  : PRED -> [1]
oooo haha just waking up and ready to eat a delicious breakfast and prepared to go in the afternoon to watch a movie  : PRED -> [1]
#Traveltuesday @GuyNGirlTravels Because their tweets are hilarious LOL and they're great travelers  : PRED -> [1]
any ideaZ on what to get dad for father's day ? No socks tho  : PRED -> [0]
God works mysteriously!i learn that if u think of the world wonderfully you will receive more &amp; get what u ask for  : PRED -> [1]
@_CrC_ mornin.. I'm enjoying a beautiful morning here in Phoenix. not too bad out yet  : PRED -> [1]
Woke up feeling rested and refreshed today! It's about time  : PRED -> [1]
@naijagal You just HAD to throw that in. Tell her that I say thanks for the eye contact at her Brixton gig. managed to get a shot!  : PRED -> [1]
@siovene lol I don't blame you it's not the safest thing in the world 