In [1]:
# !pip install autocorrect
# !pip install sklearn

In [1]:
import os
import pandas as pd
from nltk import tokenize
import nltk
import matplotlib.pyplot as plt
import numpy as np
# nltk.download('punkt')

In [2]:
import unicodedata
from autocorrect import Speller

check = Speller(lang='en')
filter = ''.join([chr(i) for i in range(1, 32)])


def auto_correct(text):
    return check(text)


def clean_text(text):        
    text = text.translate(str.maketrans('', '', filter))
    text = unicodedata.normalize("NFKD", text)
    return text

In [3]:
base_path = "../feedback-challenge/feedback-prize-2021/"

train_path = os.path.join(base_path, "train.csv")

In [4]:
df = pd.read_csv(train_path)
df.sort_values(["id","discourse_start"], ignore_index=True, inplace=True)
df.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,0000D23A521A,1617735000000.0,0.0,170.0,"Some people belive that the so called ""face"" o...",Position,Position 1,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...
1,0000D23A521A,1617735000000.0,170.0,357.0,"It was not created by aliens, and there is no ...",Evidence,Evidence 1,34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 4...
2,0000D23A521A,1617735000000.0,358.0,438.0,"A mesa is a naturally occuring rock formation,...",Evidence,Evidence 2,69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
3,0000D23A521A,1617735000000.0,438.0,626.0,"This ""face"" on mars only looks like a face bec...",Claim,Claim 1,84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 9...
4,0000D23A521A,1617735000000.0,627.0,722.0,Many conspiracy theorists believe that NASA is...,Counterclaim,Counterclaim 1,117 118 119 120 121 122 123 124 125 126 127 12...


In [5]:
data_df = df[["discourse_text", "discourse_type"]]
data_df.head()

Unnamed: 0,discourse_text,discourse_type
0,"Some people belive that the so called ""face"" o...",Position
1,"It was not created by aliens, and there is no ...",Evidence
2,"A mesa is a naturally occuring rock formation,...",Evidence
3,"This ""face"" on mars only looks like a face bec...",Claim
4,Many conspiracy theorists believe that NASA is...,Counterclaim


In [6]:
def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]


label_name = sorted(data_df["discourse_type"].unique())

encode_dict = { 
    label: ind for label, ind in zip(label_name, np.arange(len(label_name)))} # ['a', 'b', 'c', 'd', 'e', 'f', 'g'])}

data_df = data_df.applymap(lambda x : clean_text(x))  
data_df["encoded_label"] = data_df["discourse_type"].apply(lambda x: encode_cat(x))
data_df.head()

Unnamed: 0,discourse_text,discourse_type,encoded_label
0,"Some people belive that the so called ""face"" o...",Position,5
1,"It was not created by aliens, and there is no ...",Evidence,3
2,"A mesa is a naturally occuring rock formation,...",Evidence,3
3,"This ""face"" on mars only looks like a face bec...",Claim,0
4,Many conspiracy theorists believe that NASA is...,Counterclaim,2


In [7]:
def get_train_data(dataframe):
    train_data = []
    labels = []
    
    for _, data in dataframe.iterrows():
        sent = tokenize.sent_tokenize(data["discourse_text"])
        train_data.extend(sent)
        labels.extend(len(sent) * [data["encoded_label"]])
        assert len(train_data) == len(labels)
        
    return train_data, labels


# train_data, labels = get_train_data(data_df)

In [8]:
def split(data, labels):
    from sklearn.model_selection import train_test_split    
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    return X_train, X_test, X_val, y_train, y_test, y_val


# X_train, X_test, X_val, y_train, y_test, y_val = split(train_data, labels)

In [18]:
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
# device = "cpu"

In [10]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 32
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [11]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        sent = str(self.data.discourse_text[index])
        sent = " ".join(sent.split())
        inputs = self.tokenizer.encode_plus(
            sent,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.encoded_label[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [12]:
data_df.encoded_label[8]

4

In [13]:
train_size = 0.8
train_dataset=data_df.sample(frac=train_size,random_state=200)
test_dataset=data_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(data_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (144293, 3)
TRAIN Dataset: (115434, 3)
TEST Dataset: (28859, 3)


In [14]:
# training_set = Triage(X_train, tokenizer, MAX_LEN)
# testing_set = Triage(X_test, tokenizer, MAX_LEN)

In [15]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [16]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
#         self.pre_classifier = torch.nn.Linear(768, 768)
#         self.dropout = torch.nn.Dropout(0.3)
#         self.classifier = torch.nn.Linear(768, 7)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        output = hidden_state[:, 0]
#         pooler = self.pre_classifier(pooler)
#         pooler = torch.nn.ReLU()(pooler)
#         pooler = self.dropout(pooler)
#         output = self.classifier(pooler)
        return output

In [19]:
model = DistillBERTClass()
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [20]:
import gc

In [21]:
for ind, data in enumerate(training_loader, 0):
    print(ind)
    gc.collect()
    torch.cuda.empty_cache()
    ids = data['ids'].to(device, dtype = torch.long)
    mask = data['mask'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.long)
    outputs = model(ids, mask)
    print(outputs.shape)



0


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 3.95 GiB total capacity; 3.25 GiB already allocated; 44.81 MiB free; 3.27 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF