In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/let-s-see/__results__.html
/kaggle/input/let-s-see/submission.csv
/kaggle/input/let-s-see/__notebook__.ipynb
/kaggle/input/let-s-see/__output__.json
/kaggle/input/let-s-see/custom.css
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [4]:
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
from sklearn.model_selection import train_test_split
import xgboost as xgb

import nltk
from nltk.corpus import stopwords

from transformers import BertTokenizer
from transformers import BertModel

import torch
import torch.nn as nn

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'../input/nlp-getting-started/train.csv',
        r'../input/nlp-getting-started/test.csv',
    )
)

In [None]:
# Patterns and the punctuation table are built once instead of on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_EMOJI_RE = re.compile("["
                       "\U0001F600-\U0001F64F"  # emoticons
                       "\U0001F300-\U0001F5FF"  # symbols & pictographs
                       "\U0001F680-\U0001F6FF"  # transport & map symbols
                       "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       "\U00002702-\U000027B0"  # dingbats
                       # The former single range U+24C2..U+1F251 also covered
                       # CJK, Hangul and most symbol blocks, so ordinary
                       # non-Latin text was deleted. Only the enclosed-character
                       # blocks at its two ends are kept.
                       "\U000024C2-\U000024FF"  # enclosed alphanumerics
                       "\U0001F100-\U0001F251"  # enclosed alphanumeric / ideographic supplement
                       "]+", flags=re.UNICODE)
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Strip URLs, HTML tags, emoji and ASCII punctuation from a string.

    The steps run in this order: URLs (``http(s)://...`` or ``www....``),
    ``<...>`` tags, characters in the emoji ranges above, then every
    character in ``string.punctuation``. Whitespace is left untouched, so
    removed tokens can leave consecutive spaces behind.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    txt = _URL_RE.sub('', text)
    txt = _HTML_TAG_RE.sub('', txt)
    txt = _EMOJI_RE.sub('', txt)
    return txt.translate(_PUNCT_TABLE)

In [None]:
# Store the cleaned version of every tweet in a new "cleaned_text" column.
train["cleaned_text"] = train["text"].apply(clean_text)
test["cleaned_text"] = test["text"].apply(clean_text)

In [None]:
# Preview the first rows of the training frame, which now carries cleaned_text.
train.head()

In [None]:
# Load the pretrained tokenizer for 'bert-base-cased', the same checkpoint name
# that BertClassifier passes to BertModel.from_pretrained.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
class CustDat(torch.utils.data.Dataset):
    """Dataset that tokenizes one row of a tweet DataFrame per item.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``cleaned_text`` column, plus ``target`` when ``mode`` is
        "train" or "val", or ``id`` for any other mode.
    tokenizer : callable
        Called as ``tokenizer(text, padding=..., max_length=..., truncation=...,
        return_tensors="pt")``; the result must be indexable by "input_ids",
        "attention_mask" and "token_type_ids".
    mode : str
        "train" or "val" makes each item end with the row's target; any other
        value makes it end with the row's id.
    max_length : int
        Length every text is padded or truncated to (default 30).
    """

    def __init__(self , df , tokenizer , mode = "train" , max_length = 30):
        self.df = df
        self.tokenizer = tokenizer
        self.mode = mode
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self , idx):
        """Return ``(input_ids, attention_mask, token_type_ids, target_or_id)``."""
        row = self.df.iloc[idx]
        # Use the tokenizer given to this instance rather than whatever global
        # happens to be named `tokenizer`.
        tokens = self.tokenizer(row["cleaned_text"] , padding = "max_length" ,
                                max_length = self.max_length , truncation = True ,
                                return_tensors = "pt")
        # NOTE(review): with return_tensors="pt" the tensors presumably keep a
        # leading dimension of size 1, which this notebook's callers squeeze away.
        last = row["target"] if self.mode in ("train" , "val") else row["id"]
        return tokens["input_ids"] , tokens["attention_mask"] , tokens["token_type_ids"] , last

In [None]:
# Hold out 20% of the labelled rows as a validation set, using a fixed seed.
# NOTE(review): this rebinds `train` to the remaining 80%, so re-running the
# cell without reloading the CSV splits the already-split frame again.
train , val = train_test_split(train , test_size = 0.2 , random_state = 1234)

In [None]:
# Options shared by all three loaders; pinned memory is requested only when
# CUDA is available.
_loader_opts = dict(batch_size=16, num_workers=1, pin_memory=torch.cuda.is_available())

# Training and validation batches are shuffled; test batches keep frame order.
cd_train = torch.utils.data.DataLoader(CustDat(train, tokenizer), shuffle=True, **_loader_opts)
cd_val = torch.utils.data.DataLoader(CustDat(val, tokenizer, "val"), shuffle=True, **_loader_opts)
cd_test = torch.utils.data.DataLoader(CustDat(test, tokenizer, "test"), shuffle=False, **_loader_opts)

In [None]:
class BertClassifier(nn.Module):
    """Pretrained 'bert-base-cased' encoder followed by an MLP head.

    The head maps the 768-wide pooled BERT output through 500, 400 and 200
    units down to 100 features, with ReLU and Dropout(0.2) between the
    linear layers.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased', return_dict=True)
        # The position of every module is kept as-is so the index-based
        # state_dict keys of the Sequential do not change.
        self.linear = nn.Sequential(
            nn.Linear(768, 500),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(500, 400),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(400, 200),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(200, 100),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
        )

    def forward(self, input_id, mask, token_ids):
        """Encode the inputs with BERT and run the pooled output through the head."""
        encoded = self.bert(input_ids=input_id,
                            attention_mask=mask,
                            token_type_ids=token_ids)
        return self.linear(encoded.pooler_output)

In [None]:
class clf_layer(nn.Module):
    """Final classification unit: Linear(100 -> 1) followed by a sigmoid."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(100, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the linear projection of ``x``."""
        return self.sig(self.linear(x))

In [None]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays which device was chosen.
device

In [None]:
# Build the BERT feature extractor and the sigmoid output layer, and move
# both to the selected device.
model = BertClassifier().to(device)
clf = clf_layer().to(device)

In [None]:
# Binary cross-entropy loss, plus one Adam optimizer for the encoder/head
# (`model`) and one for the final linear layer of `clf`, both at lr = 1e-6.
criterion = nn.BCELoss()
optimizer1, optimizer2 = (
    torch.optim.Adam(params, lr=1e-6)
    for params in (model.parameters(), clf.linear.parameters())
)

In [None]:
# Restore the saved weights of `model` and the state of both optimizers.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be loaded in a CPU-only session.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
# NOTE(review): the input listing printed at the top of the notebook does not
# show xg3.pth.tar under let-s-see; confirm the file is actually attached.
# NOTE(review): the checkpoint has no entry for `clf`, so its weights stay at
# their random initialisation.
checkpoints = torch.load("../input/let-s-see/xg3.pth.tar" , map_location = device)
model.load_state_dict(checkpoints["model_state"])
optimizer1.load_state_dict(checkpoints["optimizer1_state"])
optimizer2.load_state_dict(checkpoints["optimizer2_state"])

In [None]:
# Disabled training loop: the code below is wrapped in a string literal, so it
# is never executed. As written it would train `model` and `clf` jointly with
# `criterion` for 100 epochs, and record the summed loss and the accuracy per
# epoch for both the training and the validation loader.
# NOTE(review): the argument-less .squeeze() calls would also drop the batch
# axis for a batch of size 1 -- check before re-enabling.
'''num_epochs = 100
losses = []
val_losses = []
accu_list = []
val_accu_list = []
for epoch in range(num_epochs):
    ls = 0
    accu = 0
    tot_labs = 0
    model.train()
    for (input_id , mask , token_ids , targets) in cd_train:
        if torch.cuda.is_available():
            input_id = input_id.cuda()
            mask = mask.cuda()
            token_ids = token_ids.cuda()
            targets = targets.cuda()
        out = model(input_id.squeeze() , mask.squeeze() , token_ids.squeeze())
        out = clf(out)
        loss = criterion(torch.squeeze(out) , targets.float())
        ls += loss.cpu().detach().numpy()
        accu += (torch.squeeze(out).round() == targets).sum().cpu().detach().numpy()
        tot_labs += targets.shape[0]
        optimizer1.zero_grad()
        optimizer2.zero_grad()
        loss.backward()
        optimizer1.step()
        optimizer2.step()
    losses.append(ls)
    accu_list.append(accu / tot_labs)
    
    with torch.no_grad():
        model.eval()
        val_accu = 0
        tot_val_labs = 0
        ls_val = 0
        for (input_id , mask , token_ids , targets) in cd_val:
            if torch.cuda.is_available():
                input_id = input_id.cuda()
                mask = mask.cuda()
                token_ids = token_ids.cuda()
                targets = targets.cuda()
            out = model(input_id.squeeze() , mask.squeeze() , token_ids.squeeze())
            out = clf(out)
            loss = criterion(out.squeeze() , targets.float())
            ls_val += loss.cpu().detach().numpy()
            val_accu += (torch.squeeze(out).round() == targets).sum().cpu().detach().numpy()
            tot_val_labs += targets.shape[0]
        val_accu_list.append(val_accu / tot_val_labs)
        val_losses.append(ls_val)
    print("done epoch " , epoch)'''

In [None]:
#plt.plot(losses)

In [None]:
#plt.plot(accu_list)

In [None]:
#plt.plot(val_accu_list)

In [None]:
#plt.plot(val_losses)

In [None]:
# Preview the training frame again; by this point `train` is the 80% split
# produced by train_test_split.
train.head()

In [None]:
# Run the encoder over the training loader and collect its 100-wide outputs
# as features, with the labels gathered in the same (shuffled) order.
train_chunks = []
train_target_data = []
with torch.no_grad():
    model.eval()  # disable dropout while extracting features
    for (input_id , mask , token_ids , targets) in cd_train:
        # squeeze(1) drops only the size-1 axis added per sample by the
        # tokenizer; a bare squeeze() would also drop the batch axis whenever
        # a batch holds a single sample.
        out = model(input_id.squeeze(1).to(device) ,
                    mask.squeeze(1).to(device) ,
                    token_ids.squeeze(1).to(device))
        train_chunks.append(out)
        train_target_data.extend(targets.numpy())
# Concatenate once at the end rather than re-copying the growing tensor on
# every batch; stays None when the loader yielded nothing.
train_data = torch.cat(train_chunks , dim = 0) if train_chunks else None

In [None]:
# Run the encoder over the validation loader and collect its 100-wide outputs
# as features, with the labels gathered in the same (shuffled) order.
val_chunks = []
val_target_data = []
with torch.no_grad():
    model.eval()  # disable dropout while extracting features
    for (input_id , mask , token_ids , targets) in cd_val:
        # squeeze(1) drops only the size-1 axis added per sample by the
        # tokenizer; a bare squeeze() would also drop the batch axis whenever
        # a batch holds a single sample.
        out = model(input_id.squeeze(1).to(device) ,
                    mask.squeeze(1).to(device) ,
                    token_ids.squeeze(1).to(device))
        val_chunks.append(out)
        val_target_data.extend(targets.numpy())
# Concatenate once at the end rather than re-copying the growing tensor on
# every batch; stays None when the loader yielded nothing.
val_data = torch.cat(val_chunks , dim = 0) if val_chunks else None

In [None]:
# Convert the training features to a NumPy array for XGBoost. The guard keeps
# the cell re-runnable: after the first run the name already holds an ndarray,
# which has no .cpu() method.
if torch.is_tensor(train_data):
    train_data = train_data.detach().cpu().numpy()

In [None]:
# Convert the validation features to a NumPy array for XGBoost. The guard
# keeps the cell re-runnable: after the first run the name already holds an
# ndarray, which has no .cpu() method.
if torch.is_tensor(val_data):
    val_data = val_data.detach().cpu().numpy()

In [None]:
# Gradient-boosted classifier that is trained on the encoder features; the
# seed is fixed and the learning rate is set to 0.01.
xg_model=xgb.XGBClassifier(random_state=1,learning_rate=0.01)

In [None]:
# Fit the classifier on the extracted training features and their labels.
xg_model.fit(train_data , train_target_data)

In [None]:
# Score the fitted classifier on the held-out validation features.
xg_model.score(val_data , val_target_data)

In [None]:
# Bundle the weights of `model` and the state of both optimizers under the
# same keys that the checkpoint-loading cell reads back.
checkpoints = {"model_state" : model.state_dict() ,
               "optimizer1_state" : optimizer1.state_dict() , 
               "optimizer2_state" : optimizer2.state_dict()}
# Saving is currently disabled, so this dict only exists in memory.
#torch.save(checkpoints , "xg3.pth.tar")

In [None]:
import os
#os.remove("./xg2.pth.tar")
#os.remove("./submission.csv")

In [None]:
#test.head()

In [None]:
# Predict a label for every test tweet: encoder features -> XGBoost class.
id_list = []
out_list = []
with torch.no_grad():
    model.eval()  # disable dropout for inference
    for (input_id , mask , token_ids , idd) in cd_test:
        # squeeze(1) drops only the size-1 axis added per sample by the
        # tokenizer; a bare squeeze() would also drop the batch axis whenever
        # a batch holds a single sample.
        feats = model(input_id.squeeze(1).to(device) ,
                      mask.squeeze(1).to(device) ,
                      token_ids.squeeze(1).to(device))
        out = xg_model.predict(feats.cpu().numpy())
        id_list.extend(idd.numpy())
        out_list.extend(out)

In [None]:
#pd.read_csv(r"../input/nlp-getting-started/sample_submission.csv").head()

In [None]:
# Two-column submission frame: tweet id and predicted target.
sub_df = pd.DataFrame(dict(id=id_list, target=out_list))

In [None]:
# Write the submission file to the working directory, without the index column.
sub_df.to_csv("submission.csv" , index = False)

In [None]:
# Read the file back and show its first rows as a sanity check.
pd.read_csv("./submission.csv").head()

In [None]:
# Count how many test tweets were assigned to each predicted class.
Counter(out_list)