In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Read the tweet CSV; the first column is used as the index.
df = pd.read_csv('17SDGpt2_tweets.csv', index_col=0, lineterminator='\n')

In [None]:
def clean_text(df):
    """Strip URLs, ``&amp`` entities and @-mention tokens from a Series of tweets.

    Parameters
    ----------
    df : pandas.Series
        Tweet strings. Despite the parameter name, a Series is expected
        (for example ``clean_text(df['tweet'])``). Every value must be a
        ``str``; non-string values raise ``TypeError`` from ``re.sub``.

    Returns
    -------
    pandas.Series
        The cleaned strings on a fresh ``0..n-1`` index. The input Series
        is not modified.
    """
    import re

    def _clean_one(text):
        """Clean a single tweet string."""
        # Drop anything that starts with "http" up to the next whitespace.
        text = re.sub(r"http\S+", "", text)
        # Remove the entity with or without its trailing ';' in one pass, so
        # "&amp;" does not leave a stray ';' behind.
        text = re.sub(r"&amp;?", "", text)
        # Splitting last drops @-mention tokens and also collapses any
        # whitespace left over from the removals above.
        return " ".join(tok for tok in text.split() if not tok.startswith("@"))

    return df.reset_index(drop=True).apply(_clean_one)

In [None]:
# clean_text() returns its result on a fresh 0..n-1 index, while `df` keeps the
# index read from the CSV. Assign by position so every cleaned tweet stays on
# its own row even when the two indexes differ.
df['clean_tweet'] = clean_text(df['tweet']).to_numpy()

# Keep the first occurrence of each cleaned tweet, drop rows without text
# (defensive), and retain only the label and text columns.
df = (
    df.drop_duplicates(subset='clean_tweet')
      .dropna(subset=['clean_tweet'])
      .loc[:, ['sdg', 'clean_tweet']]
)

In [None]:
# 60% train, then the remaining 40% is halved into validation and test.
# Both splits use RANDOM_SEED (the constant defined in the setup cell) so the
# partition is reproducible.
df_train, df_holdout = train_test_split(
    df, test_size=0.4, random_state=RANDOM_SEED
)

df_val, df_test = train_test_split(
    df_holdout, test_size=0.5, random_state=RANDOM_SEED
)

In [None]:
# (rows, columns) of the train, validation and test frames, in that order.
tuple(split.shape for split in (df_train, df_val, df_test))

In [None]:
# Summary statistics of the whitespace-separated word count per training tweet.
df_train['clean_tweet'].map(lambda tweet: len(tweet.split())).describe()

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

In [None]:
# Name of the pretrained checkpoint; the tokenizer and the BERT encoders in this
# notebook are all loaded from it.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

# Tokenizer loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:

# Number of token ids the tokenizer produces for each cleaned tweet
# (encode is called with max_length=120).
token_lens = [
    len(tokenizer.encode(tweet, max_length=120))
    for tweet in df['clean_tweet']
]

In [None]:
# Distribution of token counts, with the x-axis limited to 0..120.
ax = sns.distplot(token_lens)
ax.set_xlim([0, 120])
ax.set_xlabel('Token count');

In [None]:
# Sequence length (in tokens) handed to the data loaders, which pass it to the
# tokenizer as max_length.
MAX_LEN = 120

In [None]:
class GPReviewDataset(Dataset):
    """Map-style dataset that tokenizes one text per access.

    Holds parallel, integer-indexable collections of texts (``reviews``) and
    labels (``targets``) together with the tokenizer and the ``max_len``
    passed to it.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews, self.targets = reviews, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the text, its encoded inputs and its label as a dict.

        Keys: ``review_text`` (str), ``input_ids`` and ``attention_mask``
        (the tokenizer's tensors flattened to 1-D) and ``targets`` (a
        ``torch.long`` tensor built from the label).
        """
        text = str(self.reviews[item])
        label_value = self.targets[item]

        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        sample = {'review_text': text}
        # Flatten the tokenizer's tensors to 1-D.
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['targets'] = torch.tensor(label_value, dtype=torch.long)
        return sample

In [None]:

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Wrap the tweets and labels of ``df`` in a batched ``DataLoader``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``clean_tweet`` text column and an ``sdg`` label column.
    tokenizer
        Tokenizer handed to ``GPReviewDataset``.
    max_len : int
        Sequence length handed to ``GPReviewDataset``.
    batch_size : int
        Number of samples per batch.
    shuffle : bool, default False
        Whether the loader reshuffles the samples on every pass. The default
        keeps the previous fixed-order behaviour; pass ``True`` for training.
    num_workers : int, default 4
        Number of loader worker processes. Pass ``0`` to load in the main
        process where worker processes are unavailable or undesirable.

    Returns
    -------
    torch.utils.data.DataLoader
        Loader yielding the dict batches produced by ``GPReviewDataset``.
    """
    ds = GPReviewDataset(
        reviews=df.clean_tweet.to_numpy(),
        targets=df.sdg.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [None]:
BATCH_SIZE = 16

# One loader per split, all with the same tokenizer, sequence length and batch size.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split, tokenizer, MAX_LEN, BATCH_SIZE)
    for split in (df_train, df_val, df_test)
)

In [None]:
# Load the pretrained BERT encoder as a standalone object.
# NOTE(review): SentimentClassifier loads its own copy of the same checkpoint and
# no visible cell uses `bert_model` — confirm it is still needed further down.
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)


In [None]:

class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    Parameters
    ----------
    n_classes : int
        Size of the output layer (one score per class).
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's scores for a batch of encoded inputs.

        ``input_ids`` and ``attention_mask`` are forwarded to the encoder
        unchanged; no softmax is applied to the result.
        """
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Take the pooled output by position rather than by tuple-unpacking:
        # indexing works both when the encoder returns a plain tuple and when
        # it returns a ModelOutput object.
        pooled_output = encoder_outputs[1]
        output = self.drop(pooled_output)
        return self.out(output)

In [None]:
# No earlier cell defines `class_names`, so derive it from the distinct labels
# in the cleaned data to keep the notebook runnable from a fresh kernel.
# NOTE(review): this assumes the `sdg` values are the class ids the model is
# trained on (0..n_classes-1) — confirm, or replace with the intended names.
class_names = sorted(df['sdg'].unique())

model = SentimentClassifier(len(class_names))
model = model.to(device)

In [None]:
# Fetch one batch from the training loader so this check does not depend on a
# `data` variable left over from an earlier kernel session.
data = next(iter(train_data_loader))

input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

In [None]:
# Class probabilities for the sample batch: softmax over the class dimension.
model(input_ids, attention_mask).softmax(dim=1)