In [1]:
!nvida-smi

'nvida-smi' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
from collections import defaultdict
import pandas as pd
import numpy as np

import transformers
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Print interpreter and package versions so results can be tied to an environment.
# NOTE(review): needs the third-party `watermark` IPython extension to be installed.
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

In [None]:
# Plot styling shared by every figure in the notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.3)
rcParams['figure.figsize'] = (13, 7)

# Seed NumPy and PyTorch with the same value.
random_seed = 777
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

In [None]:
# Load the tweet dataset; the first CSV column becomes the DataFrame index.
df = pd.read_csv(
    '17SDGpt2_tweets.csv',
    index_col=0,
    lineterminator='\n',
)

In [None]:
# Class balance: number of rows per value of the `sdg` column.
# `x=` is passed by keyword because newer seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=df.sdg)
plt.xlabel('sdg')

In [None]:
### Data preprocessing

In [None]:
# Name of the pretrained DistilBERT checkpoint.
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'

In [None]:
# Load the tokenizer for the checkpoint named in PRE_TRAINED_MODEL_NAME so the
# model name is defined in exactly one place.
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Example sentence used in the following cells to illustrate tokenization and encoding.
sample_txt = 'Sustained and inclusive economic growth can drive progress, create decent jobs for all and improve living standards.'

In [None]:
# Split the sample sentence into tokens, then convert the tokens to their ids.
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# Show the raw sentence next to its tokens and their ids.
print(f"Sentence : {sample_txt}")
print(f"Tokens : {tokens}")
print(f"Token IDs : {token_ids}")

In [None]:
# Encode the sample sentence: special tokens added, truncated and padded to
# exactly `max_length` tokens, returned as PyTorch tensors with an attention mask.
# `padding = 'max_length'` is needed here: `padding = True` only pads to the longest
# sequence in the batch, which for a single sentence means no padding at all.
encoding = tokenizer.encode_plus(sample_txt,
                                max_length = 32,
                                add_special_tokens = True,
                                truncation = True,
                                return_token_type_ids = False,
                                padding = 'max_length',
                                return_attention_mask = True,
                                return_tensors = 'pt')

In [None]:
# Fields returned by the tokenizer for the encoded sample.
encoding.keys()

In [None]:
# Length of the encoded sequence, followed by the token ids themselves.
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

In [None]:
# Convert the encoded ids back to tokens to inspect what the tokenizer produced.
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

In [None]:
### Choosing a sequence length

In [None]:
import re

def clean_text(df):
    """Strip URLs, HTML ampersand entities and @-mentions from tweet texts.

    Args:
        df: pandas Series of tweet strings (the parameter name is kept for
            backward compatibility even though a Series, not a DataFrame,
            is expected).

    Returns:
        A new Series of cleaned strings in the same order as the input, with
        its index reset to 0..n-1. The input Series is not modified.
    """
    def _drop_mentions(text):
        # split()/join also collapses the whitespace left behind by the removals.
        return " ".join(tok for tok in text.split() if not tok.startswith("@"))

    cleaned = df.copy().reset_index(drop = True)
    return (
        cleaned
        .apply(lambda text: re.sub(r"http\S+", "", text))
        # One pattern covers both "&amp;" and a bare "&amp"; removing "&amp"
        # first would leave a stray ";" behind.
        .apply(lambda text: re.sub(r"&amp;?", "", text))
        .apply(_drop_mentions)
    )

In [None]:
# clean_text() returns a Series re-indexed 0..n-1, while `df` keeps the index read
# from the CSV. Assign by position so each cleaned text lands on its own row even
# when the two indexes differ (label alignment could otherwise produce NaN).
df['clean_tweet'] = clean_text(df['tweet']).to_numpy()

In [None]:
# Replace every repeated cleaned tweet with NaN, keeping only its first occurrence;
# the rows themselves stay in place.
df['clean_tweet'] = df['clean_tweet'].mask(df['clean_tweet'].duplicated())

In [None]:
# Keep only the rows that still have a cleaned tweet.
df = df[df['clean_tweet'].notnull()]

In [None]:
# Restrict the frame to the label column and the cleaned text.
df = df.loc[:, ['sdg', 'clean_tweet']]

In [None]:
# Encode every cleaned tweet (capped at 200 tokens) and plot the distribution of
# sequence lengths.
token_lens = []
for tweet in df['clean_tweet']:
    # A separate name keeps the `tokens` list from the tokenization demo intact.
    tweet_ids = tokenizer.encode(tweet, max_length = 200, truncation = True)
    token_lens.append(len(tweet_ids))
sns.distplot(token_lens)
plt.xlim([0,250])
# xlabel expects a string; passing a list renders the label as "['Token Count']".
plt.xlabel('Token Count')

In [None]:
# Maximum number of tokens per encoded tweet passed to the data loaders.
MAX_LEN = 120

In [None]:
class sdgdataset(Dataset):
    """Dataset that tokenizes one tweet per access and pairs it with its label.

    Args:
        tweets: Indexable sequence of tweet texts.
        sdg: Indexable sequence of labels, position-aligned with ``tweets``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Token length every encoded tweet is truncated/padded to.
    """

    def __init__(self, tweets, sdg, tokenizer, max_len):
        self.tweets = tweets
        self.sdg = sdg
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, item):
        """Return the encoded tweet at position ``item``.

        Returns:
            dict with keys ``tweet_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened tensors from the tokenizer) and
            ``sdg`` (label as a ``torch.long`` tensor).
        """
        tweet = str(self.tweets[item])
        sdg = self.sdg[item]

        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens = True,
            max_length = self.max_len,
            # Truncate long tweets and pad short ones so every sample has the same
            # length; otherwise the default collation cannot stack them into a batch.
            truncation = True,
            padding = 'max_length',
            return_token_type_ids = False,
            return_attention_mask = True,
            return_tensors = 'pt')

        # Return this sample's own text (the local variable), not a same-named global.
        return {'tweet_text': tweet,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'sdg': torch.tensor(sdg, dtype = torch.long)
               }

In [None]:
# 60/20/20 split: hold out 40% of the rows, then halve that hold-out into
# validation and test sets. Both splits use the same seed.
df_train, df_test = train_test_split(df, random_state = random_seed, test_size = 0.4)
df_val, df_test = train_test_split(df_test, random_state = random_seed, test_size = 0.5)

In [None]:
# Row/column counts of the train, validation and test frames.
df_train.shape, df_val.shape, df_test.shape

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle = False, num_workers = 4):
    """Wrap a DataFrame of tweets in an ``sdgdataset`` and return a DataLoader.

    Args:
        df: DataFrame with ``clean_tweet`` (text) and ``sdg`` (label) columns.
        tokenizer: Tokenizer handed to ``sdgdataset``.
        max_len: Token length handed to ``sdgdataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples every epoch. Defaults to
            ``False``, the behaviour before this parameter existed.
        num_workers: Number of loader worker processes. Defaults to ``4``, the
            value that used to be hard-coded.
            NOTE(review): where worker processes are spawned rather than forked
            (e.g. Windows), a dataset class defined inside a notebook may not be
            loadable by the workers; pass ``num_workers = 0`` there — confirm.

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset.
    """
    ds = sdgdataset(tweets = df.clean_tweet.to_numpy(),
                    sdg = df.sdg.to_numpy(),
                    tokenizer = tokenizer,
                    max_len = max_len)

    return DataLoader(ds,
                      batch_size = batch_size,
                      shuffle = shuffle,
                      num_workers = num_workers)

In [None]:
# Number of samples per batch, shared by the train, validation and test loaders.
BATCH_SIZE = 16

In [None]:
# One loader per split, all sharing the same tokenizer, sequence length and batch size.
train_data_loader = create_data_loader(df = df_train, tokenizer = tokenizer,
                                       max_len = MAX_LEN, batch_size = BATCH_SIZE)
val_data_loader = create_data_loader(df = df_val, tokenizer = tokenizer,
                                     max_len = MAX_LEN, batch_size = BATCH_SIZE)
test_data_loader = create_data_loader(df = df_test, tokenizer = tokenizer,
                                      max_len = MAX_LEN, batch_size = BATCH_SIZE)

In [None]:
# Pull one batch from the training loader and list the fields it contains.
data = next(iter(train_data_loader))
data.keys()

In [None]:
# Shapes of the batched tensors. The dataset stores labels under the key 'sdg';
# there is no 'targets' entry in the batch dictionary.
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['sdg'].shape)

In [None]:

# Size the classification head from the data instead of relying on the library
# default number of labels, and load the checkpoint named in PRE_TRAINED_MODEL_NAME.
# NOTE(review): assumes `sdg` holds integer class ids 0..NUM_CLASSES-1 — confirm,
# and remap the labels first if they are 1-based.
NUM_CLASSES = df['sdg'].nunique()
model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME,
                                                            num_labels = NUM_CLASSES)

# Smoke test: one forward pass on a single sentence with a dummy label, to check
# that the model returns both a loss and logits.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits