In [1]:
# Making necessary imports
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import random
import re
import gc
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Reading the airline data
data = pd.read_csv('Tweets.csv')

In [3]:
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
# Filtering the data since we need only the text and sentiment
data = pd.concat((data['text'],data['airline_sentiment']),axis=1)

In [5]:
data.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [6]:
# Fixing randomization seed for reproduceability 
def fix_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)

In [7]:
fix_seed(42)

In [8]:
# Label encoding of target labels
le = LabelEncoder()
data['airline_sentiment'] = le.fit_transform(data['airline_sentiment'])

In [9]:
data

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,1
1,@VirginAmerica plus you've added commercials t...,2
2,@VirginAmerica I didn't today... Must mean I n...,1
3,@VirginAmerica it's really aggressive to blast...,0
4,@VirginAmerica and it's a really big bad thing...,0
...,...,...
14635,@AmericanAir thank you we got on a different f...,2
14636,@AmericanAir leaving over 20 minutes Late Flig...,0
14637,@AmericanAir Please bring American Airlines to...,1
14638,"@AmericanAir you have my money, you change my ...",0


In [10]:
dfr = pd.DataFrame

In [11]:
# Class ID to Labels Dictionary
labels = {
    0: "negative",
    1: "neutral",
    2: "positive"
}

In [12]:
len(labels)

3

In [13]:
# Label Distribution
data['airline_sentiment'].value_counts()

airline_sentiment
0    9178
1    3099
2    2363
Name: count, dtype: int64

In [14]:
# Pre-Processing Steps for Expanding Contractions
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not",
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

In [15]:
# Overall Data Cleaning such as removing links, removing special characters, removing extra spaces and lower casing the string 
def remove_links(text):
    return re.sub(r'http[s]?://\S+', '', text)

def clean_text(text):
    # Remove non-alphanumeric characters and extra whitespaces
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove extra whitespaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

def clean_dfs(dfs):
    for df in dfs:
        df['text'] = df['text'].apply(remove_links)
        df['text'] = df['text'].apply(lambda x: expand_contractions(x))
        df['text'] = df['text'].apply(str.lower)
        df['text'] = df['text'].apply(clean_text)


In [16]:
# Cleaning the dataframe
clean_dfs([data])

In [17]:
# Creating the train test split : 80% Training and 20% Validation
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(data['text'], data['airline_sentiment'], test_size=0.2, random_state=42, stratify=data['airline_sentiment'])

In [18]:
# Importing the tokenizer and model class
from transformers import RobertaTokenizer, RobertaForSequenceClassification
model1 = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(labels), problem_type="multi_label_classification")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Freezing all the model layers
for param in model1.parameters():
    param.requires_grad = False

In [20]:
# Only training the classification projection head
for param in model1.classifier.parameters():
    param.requires_grad = True

In [21]:
# Defining the dataset class
class Data(Dataset):
    def __init__(self, text, label):
        self.text = text.reset_index(drop=True)
        self.label = label.reset_index(drop=True)
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        txt = self.text[index]
        lbl = self.label[index]
        encoding = self.tokenizer(txt, return_tensors="pt", padding='max_length', truncation = True)
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(lbl, dtype=torch.long)
        }

In [22]:
# Creating training and validation data objects
train_data = Data(text=x_train, label=y_train)
valid_data = Data(text=x_val, label=y_val)

In [None]:
# Defining the training and validation data loaders
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=16, shuffle=False)

In [26]:
# Checking the device for training
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [27]:
# Clear CUDA cache
torch.cuda.empty_cache()

# Perform garbage collection
gc.collect()

92

In [28]:
# Training settings
model1.to(device)
epochs = 25
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model1.parameters(), lr = 2e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.01, min_lr=1e-6)
train_loss = 0
val_loss = 0
best_epoch = 0
stop_thresh = 7
best_val_loss = 1e+9
train_loss_lis1 = []
valid_loss_lis1 = []

In [29]:
# Training Loop
for epoch in tqdm(range(epochs)):
    
    model1.train()
    for batch in tqdm(train_loader):
        for k,v in batch.items():
            batch[k] = v.to(device)
        optimizer.zero_grad()
        logits = model1(input_ids = batch['input_ids'], attention_mask=batch['attention_mask']).logits
        loss = loss_fn(logits, batch['labels'])
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    print(train_loss)
    train_loss_lis1.append(train_loss)
    
    model1.eval()
    for batch in tqdm(valid_loader):
        for k,v in batch.items():
            batch[k] = v.to(device)
        with torch.no_grad():
            logits = model1(input_ids = batch['input_ids'], attention_mask=batch['attention_mask']).logits
            loss = loss_fn(logits, batch['labels'])
            val_loss += loss.item()
    val_loss /= len(valid_loader)
    valid_loss_lis1.append(val_loss)
    print(val_loss)
    scheduler.step(val_loss)

    if val_loss < best_val_loss:
        best_epoch = epoch
        best_val_loss = val_loss
        print("save model")
        torch.save(model1.state_dict(),"model_sentiment/best.pth")
    
    if epoch - best_epoch > stop_thresh:
        print("Early Stopping!")
        break

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/732 [00:00<?, ?it/s]

0.6975315469902396


  0%|          | 0/183 [00:00<?, ?it/s]

0.6457595900922525
save model


  0%|          | 0/732 [00:00<?, ?it/s]

0.6202287184109673


  0%|          | 0/183 [00:00<?, ?it/s]

0.49353612540102043
save model


  0%|          | 0/732 [00:00<?, ?it/s]

0.5976248390384861


  0%|          | 0/183 [00:00<?, ?it/s]

0.5929687709364265


  0%|          | 0/732 [00:00<?, ?it/s]

0.5996310017055594


  0%|          | 0/183 [00:00<?, ?it/s]

0.48494558863667675
save model


  0%|          | 0/732 [00:00<?, ?it/s]

0.5805223552895628


  0%|          | 0/183 [00:00<?, ?it/s]

0.49887906801140525


  0%|          | 0/732 [00:00<?, ?it/s]

0.5780122005566972


  0%|          | 0/183 [00:00<?, ?it/s]

0.4804439865115907
save model


  0%|          | 0/732 [00:00<?, ?it/s]

0.5834235312052255


  0%|          | 0/183 [00:00<?, ?it/s]

0.4792967235252525
save model


  0%|          | 0/732 [00:00<?, ?it/s]

0.5806640079684687


  0%|          | 0/183 [00:00<?, ?it/s]

0.49961029071582325


  0%|          | 0/732 [00:00<?, ?it/s]

0.5719041879800917


  0%|          | 0/183 [00:00<?, ?it/s]

0.47669178402591994
save model


  0%|          | 0/732 [00:00<?, ?it/s]

0.5879411934619919


  0%|          | 0/183 [00:00<?, ?it/s]

0.489069395142184


  0%|          | 0/732 [00:00<?, ?it/s]

0.5324008817106005


  0%|          | 0/183 [00:00<?, ?it/s]

0.46798795881209143
save model


  0%|          | 0/732 [00:00<?, ?it/s]

0.5296616881264241


  0%|          | 0/183 [00:00<?, ?it/s]

0.4645641805029272
save model


  0%|          | 0/732 [00:00<?, ?it/s]

0.5291621711735046


  0%|          | 0/183 [00:00<?, ?it/s]

0.4650560160973556


  0%|          | 0/732 [00:00<?, ?it/s]

0.5358304966562601


  0%|          | 0/183 [00:00<?, ?it/s]

0.4674930723505478


  0%|          | 0/732 [00:00<?, ?it/s]

0.52362746800163


  0%|          | 0/183 [00:00<?, ?it/s]

0.46537420780715033


  0%|          | 0/732 [00:00<?, ?it/s]

0.523196454389678


  0%|          | 0/183 [00:00<?, ?it/s]

0.4652689608573944


  0%|          | 0/732 [00:00<?, ?it/s]

0.5234832889212716


  0%|          | 0/183 [00:00<?, ?it/s]

0.46497727327314237


  0%|          | 0/732 [00:00<?, ?it/s]

0.5278599954234139


  0%|          | 0/183 [00:00<?, ?it/s]

0.4649697137168179


  0%|          | 0/732 [00:00<?, ?it/s]

0.526819897146741


  0%|          | 0/183 [00:00<?, ?it/s]

0.46494034749252716


  0%|          | 0/732 [00:00<?, ?it/s]

0.5257566958590552


  0%|          | 0/183 [00:00<?, ?it/s]

0.4648801229241107
Early Stopping!


In [30]:
# Loading the best model for combined predictions on the validation dataset
model1.load_state_dict(torch.load('/home/shreshthsharma/fin-para/model_sentiment/best.pth'))
model1.to(device)
preds1 = []
model1.eval()
for batch in tqdm(valid_loader):
    for k,v in batch.items():
        batch[k] = v.to(device)
    with torch.no_grad():
        logits = model1(input_ids = batch['input_ids'], attention_mask=batch['attention_mask']).logits
    pred = logits.cpu().argmax(dim=1)
    for lbl in pred:
        preds1.append(int(lbl))    

  0%|          | 0/183 [00:00<?, ?it/s]

In [31]:
# Creating the arrays for training loss, validation losses, and predictions
train_loss_arr = np.array(train_loss_lis1)
valid_loss_arr = np.array(valid_loss_lis1)
preds1_arr = np.array(preds1)

In [32]:
# Creating csv for reference
dfr(preds1_arr).to_csv('preds_airline.csv', index=False)
dfr(train_loss_arr).to_csv('train_loss_senti.csv', index=False)
dfr(valid_loss_arr).to_csv('valid_loss_senti.csv', index=False)

In [34]:
# Importing F1 Score for benchmarking
from sklearn.metrics import f1_score

In [35]:
# Macro F1 Score
f1_score(y_val, preds1_arr, average='macro')

0.7501181132583712

In [36]:
# Weighted F1 Score
f1_score(y_val, preds1_arr, average='weighted')

0.8095691983977907