In [None]:
from collections import defaultdict
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import transformers
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, classification_report

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
from pylab import rcParams
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Record the interpreter version and the versions of the key packages used
# below, so the notebook's results can be tied to a concrete environment.
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers,scikit-learn

In [None]:
# Global plotting defaults for every figure in the notebook.
sns.set(style = 'whitegrid', palette = 'muted', font_scale = 1.3)
rcParams['figure.figsize'] = 14, 9

# Single seed reused for all splits and RNGs below.
random_seed = 777
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# Seed the RNGs this notebook relies on using only libraries it already
# imports (`pl` / pytorch_lightning was never imported, so the previous
# `pl.seed_everything` call raised NameError on a fresh kernel).
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)  # silently ignored when CUDA is unavailable

In [None]:
## Loading the Data

In [None]:
# Load the raw tweets: the first CSV column becomes the index and only '\n'
# is treated as a record separator.
df = pd.read_csv(
    '17SDGpt2_tweets.csv',
    index_col=0,
    lineterminator='\n',
)

In [None]:
import re

def _clean_tweet(text):
    """Strip URLs, HTML-escaped ampersands and @mentions from a single tweet."""
    text = re.sub(r"http\S+", "", text)
    # `;?` also consumes the entity terminator, so "&amp;" no longer leaves a
    # stray ";" behind (previously "&amp" was removed first and the "&amp;"
    # pattern could never match).
    text = re.sub(r"&amp;?", "", text)
    # Dropping @-tokens via split()/join also collapses any whitespace left
    # over from the removals above.
    return " ".join(tok for tok in text.split() if not tok.startswith("@"))


def clean_text(df):
    """Return a cleaned copy of a Series of tweet strings.

    Parameters
    ----------
    df : pandas.Series
        Raw tweet texts (the name is historical; a Series is expected).

    Returns
    -------
    pandas.Series
        New Series with a fresh 0..n-1 index in which URLs (``http...``),
        ``&amp`` / ``&amp;`` entities and whitespace-delimited tokens starting
        with ``@`` are removed and runs of whitespace are collapsed to a
        single space. Missing values are passed through unchanged.
    """
    # reset_index already returns a new object, so the input is never mutated.
    return df.reset_index(drop=True).map(_clean_tweet, na_action="ignore")

In [None]:
# clean_text() returns a Series with a fresh 0..n-1 index, whereas `df` keeps
# the index read from the CSV. Assign positionally so rows cannot be
# mis-aligned (or turned into NaN) when the two indexes differ.
df['clean_tweet'] = clean_text(df['tweet']).to_numpy()

# Keep only the first occurrence of each cleaned tweet, drop rows without
# text, and retain just the label and the cleaned text.
df = (
    df.dropna(subset = ['clean_tweet'])
      .drop_duplicates(subset = 'clean_tweet')
      [['sdg', 'clean_tweet']]
)

In [None]:
# 60/40 split used for the exploratory token-length analysis below.
# Seeded so that Restart & Run All reproduces the same partition.
train_df, val_df = train_test_split(df, test_size = 0.4,
                                    random_state = random_seed)
train_df.shape, val_df.shape

In [None]:
# Class balance of the SDG labels. The column is passed by keyword: recent
# seaborn releases interpret a lone positional argument as wide-form `data`
# rather than as the `x` variable.
sns.countplot(data = df, x = 'sdg')
plt.xlabel('sdg')
plt.title('Number of tweets per SDG label');

In [None]:
### Tokenization

In [None]:
# Pretrained checkpoint name; defined once so the tokenizer and any model
# built later are guaranteed to use the same vocabulary.
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Pick one fixed row to illustrate tokenization on a real tweet.
sample_row = df.iloc[77777]
sample_tweet, sample_label = sample_row['clean_tweet'], sample_row['sdg']
print(sample_tweet, sample_label)

In [None]:
# Tokenizer options for the demo encoding of the sample tweet: add the
# special tokens, cap at 150 tokens, and return PyTorch tensors together with
# the attention mask (token type ids are not requested).
encode_kwargs = dict(
    max_length=150,
    add_special_tokens=True,
    truncation=True,
    return_token_type_ids=False,
    padding=True,
    return_attention_mask=True,
    return_tensors='pt',
)
encoding = tokenizer.encode_plus(sample_tweet, **encode_kwargs)
encoding.keys()

In [None]:
# First 30 token ids of the sample encoding (batch dimension flattened away).
encoding['input_ids'].flatten()[:30]

In [None]:
# Matching attention-mask entries for the same 30 positions.
encoding['attention_mask'].flatten()[:30]

In [None]:
# Map the first 30 ids back to their word-piece strings for inspection.
first_token_ids = encoding['input_ids'].squeeze()[:30]
print(tokenizer.convert_ids_to_tokens(first_token_ids))

In [None]:
# Token length of every training tweet (capped at 150 by truncation), used to
# choose a maximum sequence length.
token_counts = [
    len(tokenizer.encode(text, max_length = 150, truncation = True))
    for text in train_df['clean_tweet']
]

In [None]:
# Distribution of tweet lengths in tokens.
ax = sns.histplot(token_counts)
ax.set_xlim(0, 180)

In [None]:
# Maximum number of tokens kept per tweet.
# NOTE(review): presumably picked from the token-count histogram above and
# meant as the `max_len` for the dataset — confirm.
MAX_TOKEN_COUNT = 100

In [None]:
class SdgTweetDataset(Dataset):
    """Map-style dataset that tokenizes tweets on access.

    Parameters
    ----------
    tweets : sequence
        Tweet texts, indexable by integer position.
    sdgs : sequence
        Integer class label for each tweet, aligned with ``tweets``.
    tokenizer
        Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
    max_len : int
        Fixed sequence length every example is padded/truncated to.
    """

    def __init__(self, tweets, sdgs, tokenizer, max_len):
        self.tweets = tweets
        self.sdgs = sdgs
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples."""
        return len(self.tweets)

    def __getitem__(self, item):
        """Return the raw text, token ids, attention mask and label of one example."""
        tweet = str(self.tweets[item])
        sdg = self.sdgs[item]

        # Pad AND truncate to exactly `max_len`: without truncation a long
        # tweet yields a longer tensor and default batch collation fails.
        # `padding='max_length'` replaces the deprecated `pad_to_max_length`.
        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens = True,
            max_length = self.max_len,
            truncation = True,
            padding = 'max_length',
            return_token_type_ids = False,
            return_attention_mask = True,
            return_tensors = 'pt')

        return {'tweet_text': tweet,
                # flatten() drops the leading batch dimension added by return_tensors='pt'
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'sdg': torch.tensor(sdg, dtype = torch.long)
                }

In [None]:
# Seeded 60 / 20 / 20 split: hold out 40% of the rows first, then cut that
# hold-out in half to obtain the validation and test sets.
df_train, df_holdout = train_test_split(
    df,
    test_size = 0.4,
    random_state = random_seed,
)
df_val, df_test = train_test_split(
    df_holdout,
    test_size = 0.5,
    random_state = random_seed,
)

In [None]:
# (rows, columns) of the train, validation and test frames, in that order.
tuple(part.shape for part in (df_train, df_val, df_test))

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Wrap a tweet DataFrame in a ``SdgTweetDataset`` and a ``DataLoader``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``clean_tweet`` (text) and ``sdg`` (label) columns.
    tokenizer
        Tokenizer forwarded to ``SdgTweetDataset``.
    max_len : int
        Sequence length forwarded to ``SdgTweetDataset``.
    batch_size : int
        Number of examples per batch.
    shuffle : bool, default False
        Whether the loader reshuffles the data every epoch. The default keeps
        the previous (unshuffled) behaviour.
    num_workers : int, default 4
        Worker processes used for loading; pass 0 to load in the main
        process. The default keeps the previously hard-coded value.

    Returns
    -------
    torch.utils.data.DataLoader
    """
    ds = SdgTweetDataset(
        tweets=df.clean_tweet.to_numpy(),
        sdgs=df.sdg.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [None]:
# Mini-batch size passed to every data loader created below.
BATCH_SIZE = 16

In [None]:
# One loader per split. The sequence length is MAX_TOKEN_COUNT, the constant
# defined above; the previously used name `MAX_LEN` is not defined in the
# visible notebook and raised NameError on a fresh kernel.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_TOKEN_COUNT, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_TOKEN_COUNT, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_TOKEN_COUNT, BATCH_SIZE)

In [None]:
# Fetch a single mini-batch from the training loader and list the fields it
# carries, as a sanity check of the dataset/collation pipeline.
data = next(iter(train_data_loader))
data.keys()

In [None]:
# Shapes of the batched tensors. The label tensor is stored under 'sdg' (the
# key emitted by SdgTweetDataset.__getitem__); 'targets' does not exist and
# raised KeyError.
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['sdg'].shape)

In [None]:

# Sequence-classification head on top of the same checkpoint the tokenizer
# was loaded from. The head gets one logit per distinct SDG label instead of
# the library default of two classes.
# NOTE(review): assumes `sdg` labels are encoded as 0..num_labels-1 — confirm.
model = DistilBertForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    num_labels = df['sdg'].nunique())

# Smoke test: one forward pass with a dummy sentence and label to confirm the
# model returns both a loss and logits.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits