In [1]:
import numpy as np 

import pandas as pd 
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 1000)

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

import logging
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

from transformers import *

from nltk.tokenize import word_tokenize

import os
import re
import string
import random
import time

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/train.csv


In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [3]:
SEED = 2020
seed_everything(SEED)

In [4]:
TRAIN_FILE_PATH = '/kaggle/input/nlp-getting-started/train.csv'
TEST_FILE_PATH = '/kaggle/input/nlp-getting-started/test.csv'
SUBMISSION_FILE_PATH = '/kaggle/input/nlp-getting-started/sample_submission.csv'

In [5]:
train_df = pd.read_csv(TRAIN_FILE_PATH)
test_df = pd.read_csv(TEST_FILE_PATH)

<h2>Cleaning text data</h2>

In [6]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train_df.at[train_df['id'].isin(ids_with_target_error),'target'] = 0
train_df[train_df['id'].isin(ids_with_target_error)]

Unnamed: 0,id,keyword,location,text,target
229,328,annihilated,,Ready to get annihilated for the BUCS game,0
301,443,apocalypse,,Short Reading\n\nApocalypse 21:1023 \n\nIn the spirit the angel took me to the top of an enormous high mountain and... http://t.co/v8AfTD9zeZ,0
356,513,army,Studio,But if you build an army of 100 dogs and their leader is a lion all dogs will fight like a lion.,0
1822,2619,crashed,,My iPod crashed..... \n#WeLoveYouLouis \n#MTVHottest One Direction,0
2536,3640,desolation,"Quilmes , Arg",This desperation dislocation\nSeparation condemnation\nRevelation in temptation\nIsolation desolation\nLet it go and so to find away,0
2715,3900,devastated,PG Chillin!,Man Currensy really be talkin that talk... I'd be more devastated if he had a ghostwriter than anybody else....,0
3024,4342,dust%20storm,chicago,Going to a fest? Bring swimming goggles for the dust storm in the circle pit,0
4068,5781,forest%20fires,,Campsite recommendations \nToilets /shower \nPub \nFires \nNo kids \nPizza shop \nForest \nPretty stream \nNo midges\nNo snakes\nThanks ??,0
4609,6552,injury,Saint Paul,My prediction for the Vikings game this Sunday....dont expect a whole lot. Infact I think Zimmer goal is....injury free 1st game,0
4611,6554,injury,,Dante Exum's knee injury could stem Jazz's hoped-for surge back to ... http://t.co/8PIFutrB5U,0


In [7]:
def clean_tweets(tweet):
    """Removes links and non-ASCII characters"""
    
    tweet = ''.join([x for x in tweet if x in string.printable])
    
    # Removing URLs
    tweet = re.sub(r"http\S+", "", tweet)
    
    return tweet

In [8]:
train_df["text"] = train_df["text"].apply(lambda x: clean_tweets(x))
test_df["text"] = test_df["text"].apply(lambda x: clean_tweets(x))

In [9]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [10]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [11]:
train_df["text"] = train_df["text"].apply(lambda x: remove_emoji(x))
test_df["text"] = test_df["text"].apply(lambda x: remove_emoji(x))

In [12]:
def remove_punctuations(text):
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`"
    
    for p in punctuations:
        text = text.replace(p, f' {p} ')

    text = text.replace('...', ' ... ')
    
    if '...' not in text:
        text = text.replace('..', ' ... ')
    
    return text

In [13]:
train_df["text"] = train_df["text"].apply(lambda x: remove_punctuations(x))
test_df["text"] = test_df["text"].apply(lambda x: remove_punctuations(x))

In [14]:
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [15]:
def convert_abbrev(word):
    return abbreviations[word.lower()] if word.lower() in abbreviations.keys() else word

In [16]:
def convert_abbrev_in_text(text):
    tokens = word_tokenize(text)
    tokens = [convert_abbrev(word) for word in tokens]
    text = ' '.join(tokens)
    return text

In [17]:
train_df["text"] = train_df["text"].apply(lambda x: convert_abbrev_in_text(x))
test_df["text"] = test_df["text"].apply(lambda x: convert_abbrev_in_text(x))

In [None]:
#Remove stopwords
from nltk.corpus import stopwords

import nltk
import regex as re

#remove stop words 
stop = stopwords.words('english')
train_df['text']=train_df['text'].str.lower().apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
test_df['text']=test_df['text'].str.lower().apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [None]:
#Lemmatize
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

#lemmatization
train_df['text'] = train_df['text'].apply(lemmatize_text)
train_df['text']=[' '.join(map(str, l)) for l in train_df['text']]
test_df['text'] = test_df['text'].apply(lemmatize_text)
test_df['text']=[' '.join(map(str, l)) for l in test_df['text']]

#replace numbers with string
train_df['text']=train_df['text'].str.replace('\d+', '')
test_df['text']=test_df['text'].str.replace('\d+', '')

#removing https
train_df['text']=train_df['text'].str.replace('https?://\\S+|www\\.\\S+', '')
train_df['text']=train_df['text'].str.replace('http?://\\S+|www\\.\\S+', '')

test_df['text']=test_df['text'].str.replace('https?://\\S+|www\\.\\S+', '')
test_df['text']=test_df['text'].str.replace('http?://\\S+|www\\.\\S+', '')

#Remove HTML tags


def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)


train_df['text']=train_df['text'].apply(lambda x : remove_html(x))
test_df['text']=test_df['text'].apply(lambda x : remove_html(x))

#removing usernames
train_df['text']=train_df['text'].str.replace('_+\\w*', 'username')
test_df['text']=test_df['text'].str.replace('_+\\w*', 'username')

In [18]:
len(train_df)

7613

<h2>Modelling</h2>

In [19]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, id, text, label=None):
        """Constructs a InputExample.
        Args:
            id: Unique id for the example.
            text: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.id = id
        self.text = text
        self.label = label


class InputFeatures(object):
    def __init__(self, example_id, choices_features, label):
        
        self.example_id = example_id
        _, input_ids, input_mask, segment_ids = choices_features[0]
        self.choices_features = {
            'input_ids': input_ids,
            'input_mask': input_mask,
            'segment_ids': segment_ids
        }
        self.label = label

In [20]:
def read_examples(df, is_training):
    if not is_training:
        df['target'] = np.zeros(len(df), dtype=np.int64)
    examples = []
    for val in df[['id', 'text', 'target']].values:
        examples.append(InputExample(id=val[0], text=val[1], label=val[2]))
    return examples, df

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.

    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [21]:
def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
    
    features = []
    
    for example_index, example in enumerate(examples):
        
        text = tokenizer.tokenize(example.text)
        MAX_TEXT_LEN = max_seq_length - 2 
        text = text[:MAX_TEXT_LEN]

        choices_features = []

        tokens = ["[CLS]"] + text + ["[SEP]"]  
        segment_ids = [0] * (len(text) + 2) 
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        padding_length = max_seq_length - len(input_ids)
        input_ids += ([0] * padding_length)
        input_mask += ([0] * padding_length)
        segment_ids += ([0] * padding_length)
        choices_features.append((tokens, input_ids, input_mask, segment_ids))

        label = example.label
        if example_index < 1 and is_training:
            logger.info("*** Example ***")
            logger.info("idx: {}".format(example_index))
            logger.info("id: {}".format(example.id))
            logger.info("tokens: {}".format(' '.join(tokens).replace('\u2581', '_')))
            logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
            logger.info("input_mask: {}".format(len(input_mask)))
            logger.info("segment_ids: {}".format(len(segment_ids)))
            logger.info("label: {}".format(label))

        features.append(
            InputFeatures(
                example_id=example.id,
                choices_features=choices_features,
                label=label
            )
        )
    return features

In [22]:
def select_field(features, field):
    return [feature.choices_features[field] for feature in features]

In [23]:
def metric(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    return acc, f1

In [24]:
# Hyperparameters

MAX_SEQ_LENGTH = 512  
LEARNING_RATE = 1e-5  
NUM_EPOCHS = 3  
BATCH_SIZE = 8  
PATIENCE = 2  
FILE_NAME = 'model' 
NUM_FOLDS = 5

In [25]:
logger = logging.getLogger('mylogger')
logger.setLevel(logging.DEBUG)

timestamp = time.strftime("%Y.%m.%d_%H.%M.%S", time.localtime())

fh = logging.FileHandler('log_model.txt')
fh.setLevel(logging.DEBUG)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

formatter = logging.Formatter('[%(asctime)s][%(levelname)s] ## %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)

logger.addHandler(fh)
logger.addHandler(ch)

In [26]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [27]:
def generate_dataloaders(df):
    examples, df = read_examples(df, is_training=True)
    
    labels = df["target"].astype(int).values
    
    features = convert_examples_to_features(examples, tokenizer, MAX_SEQ_LENGTH, True)
    input_ids = torch.tensor(select_field(features, 'input_ids'))
    input_mask = torch.tensor(select_field(features, 'input_mask'))
    segment_ids = torch.tensor(select_field(features, 'segment_ids'))
    label = torch.tensor([f.label for f in features])
    
    dataset = torch.utils.data.TensorDataset(input_ids, input_mask, segment_ids, label)
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    return data_loader, labels

In [28]:
class NeuralNet(nn.Module):
    def __init__(self, hidden_size=768, num_classes=2):
        super(NeuralNet, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True, output_attentions=True)

        for param in self.bert.parameters():
            param.requires_grad = True
        
        self.drop_out = nn.Dropout() # dropout layer to prevent overfitting
        self.fc = nn.Linear(hidden_size, num_classes) # fully connected layer
        
    def forward(self, input_ids, input_mask, segment_ids):
        last_hidden_state, pooler_output, all_hidden_states, all_attentions = self.bert(input_ids, token_type_ids = segment_ids, attention_mask = input_mask)
        last_hidden_state = last_hidden_state[:, 0,:]                                                       
        
        # Linear layer expects a tensor of size [batch size, input size]
        out = self.drop_out(last_hidden_state) 
        out = self.fc(out) 
        return F.log_softmax(out)

In [29]:
models = []

In [30]:
skf = StratifiedKFold(NUM_FOLDS, shuffle=True, random_state=SEED)
nfold = 1

for train_index, valid_index in skf.split(train_df["text"], train_df["target"]):
    train = train_df.iloc[train_index, :]
    valid = train_df.iloc[valid_index, :]
    
    train_loader, train_labels = generate_dataloaders(train)
    valid_loader, valid_labels = generate_dataloaders(valid)
    
    model = NeuralNet()
    model.cuda()
    
    # Training model on 4 fold
    model.train()
    
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    total_step = len(train_loader)
    
    for epoch in range(NUM_EPOCHS):
        train_loss = 0.
        
        # Training loop
        for i, batch in enumerate(train_loader):
            batch = tuple(t.cuda() for t in batch)
            x_ids, x_mask, x_sids, y_truth = batch
            y_pred = model(x_ids, x_mask, x_sids)
            loss = loss_fn(y_pred, y_truth)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item() / len(train_loader)
            
            total = len(y_truth)
            _, predicted = torch.max(y_pred.data, 1)
            correct = (predicted == y_truth).sum().item()
            
            if (i + 1) % 50 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'.format(epoch + 1, NUM_EPOCHS, i + 1, total_step, loss.item(), (correct / total) * 100)) 
        
    # Validating model performance on the remaining fold
    model.eval()
    
    with torch.no_grad():
        correct, total = 0, 0
        
        for i, batch in enumerate(valid_loader):
            batch = tuple(t.cuda() for t in batch)
            x_ids, x_mask, x_sids, y_truth = batch
            y_pred = model(x_ids, x_mask, x_sids)
            _, predicted = torch.max(y_pred.data, 1)
            total += len(valid_labels)
            correct += (predicted == y_truth).sum().item()
            
        print('Validation performance on {}-th fold: {}'.format(nfold, (correct / total) * 100))
    
    nfold += 1
    models.append(model)

[2020-03-03 19:00:17,864][INFO] ## *** Example ***
[2020-03-03 19:00:17,864][INFO] ## idx: 0
[2020-03-03 19:00:17,866][INFO] ## id: 1
[2020-03-03 19:00:17,868][INFO] ## tokens: [CLS] our deeds are the reason of this # earthquake may allah forgive us all [SEP]
[2020-03-03 19:00:17,869][INFO] ## input_ids: 101 2256 15616 2024 1996 3114 1997 2023 1001 8372 2089 16455 9641 2149 2035 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

Epoch [1/3], Step [50/762], Loss: 0.4895, Accuracy: 87.50%
Epoch [1/3], Step [100/762], Loss: 0.6353, Accuracy: 62.50%
Epoch [1/3], Step [150/762], Loss: 1.0194, Accuracy: 62.50%
Epoch [1/3], Step [200/762], Loss: 0.7062, Accuracy: 62.50%
Epoch [1/3], Step [250/762], Loss: 0.2952, Accuracy: 87.50%
Epoch [1/3], Step [300/762], Loss: 0.3858, Accuracy: 87.50%
Epoch [1/3], Step [350/762], Loss: 0.3723, Accuracy: 87.50%
Epoch [1/3], Step [400/762], Loss: 0.3550, Accuracy: 75.00%
Epoch [1/3], Step [450/762], Loss: 0.5191, Accuracy: 87.50%
Epoch [1/3], Step [500/762], Loss: 0.3180, Accuracy: 87.50%
Epoch [1/3], Step [550/762], Loss: 0.7056, Accuracy: 75.00%
Epoch [1/3], Step [600/762], Loss: 0.1736, Accuracy: 87.50%
Epoch [1/3], Step [650/762], Loss: 0.6912, Accuracy: 75.00%
Epoch [1/3], Step [700/762], Loss: 0.9027, Accuracy: 62.50%
Epoch [1/3], Step [750/762], Loss: 0.2091, Accuracy: 87.50%
Epoch [2/3], Step [50/762], Loss: 0.0922, Accuracy: 100.00%
Epoch [2/3], Step [100/762], Loss: 0.3355

[2020-03-03 19:19:02,074][INFO] ## *** Example ***
[2020-03-03 19:19:02,075][INFO] ## idx: 0
[2020-03-03 19:19:02,077][INFO] ## id: 1
[2020-03-03 19:19:02,078][INFO] ## tokens: [CLS] our deeds are the reason of this # earthquake may allah forgive us all [SEP]
[2020-03-03 19:19:02,081][INFO] ## input_ids: 101 2256 15616 2024 1996 3114 1997 2023 1001 8372 2089 16455 9641 2149 2035 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

Validation performance on 1-th fold: 0.44414956702292596


[2020-03-03 19:19:07,491][INFO] ## *** Example ***
[2020-03-03 19:19:07,492][INFO] ## idx: 0
[2020-03-03 19:19:07,493][INFO] ## id: 4
[2020-03-03 19:19:07,496][INFO] ## tokens: [CLS] forest fire near la ron ##ge sas ##k . canada [SEP]
[2020-03-03 19:19:07,498][INFO] ## input_ids: 101 3224 2543 2379 2474 6902 3351 21871 2243 1012 2710 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

Epoch [1/3], Step [50/762], Loss: 0.7899, Accuracy: 37.50%
Epoch [1/3], Step [100/762], Loss: 0.6061, Accuracy: 75.00%
Epoch [1/3], Step [150/762], Loss: 0.5039, Accuracy: 75.00%
Epoch [1/3], Step [200/762], Loss: 0.5055, Accuracy: 75.00%
Epoch [1/3], Step [250/762], Loss: 0.5999, Accuracy: 75.00%
Epoch [1/3], Step [300/762], Loss: 0.2817, Accuracy: 100.00%
Epoch [1/3], Step [350/762], Loss: 0.2373, Accuracy: 100.00%
Epoch [1/3], Step [400/762], Loss: 0.5950, Accuracy: 75.00%
Epoch [1/3], Step [450/762], Loss: 0.5043, Accuracy: 75.00%
Epoch [1/3], Step [500/762], Loss: 0.1789, Accuracy: 100.00%
Epoch [1/3], Step [550/762], Loss: 0.1630, Accuracy: 100.00%
Epoch [1/3], Step [600/762], Loss: 0.1074, Accuracy: 100.00%
Epoch [1/3], Step [650/762], Loss: 0.4853, Accuracy: 75.00%
Epoch [1/3], Step [700/762], Loss: 0.2366, Accuracy: 100.00%
Epoch [1/3], Step [750/762], Loss: 0.2839, Accuracy: 87.50%
Epoch [2/3], Step [50/762], Loss: 0.1839, Accuracy: 87.50%
Epoch [2/3], Step [100/762], Loss: 0

[2020-03-03 19:37:26,261][INFO] ## *** Example ***
[2020-03-03 19:37:26,263][INFO] ## idx: 0
[2020-03-03 19:37:26,264][INFO] ## id: 1
[2020-03-03 19:37:26,268][INFO] ## tokens: [CLS] our deeds are the reason of this # earthquake may allah forgive us all [SEP]
[2020-03-03 19:37:26,270][INFO] ## input_ids: 101 2256 15616 2024 1996 3114 1997 2023 1001 8372 2089 16455 9641 2149 2035 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

Validation performance on 2-th fold: 0.43658664869900615


[2020-03-03 19:37:32,051][INFO] ## *** Example ***
[2020-03-03 19:37:32,052][INFO] ## idx: 0
[2020-03-03 19:37:32,055][INFO] ## id: 6
[2020-03-03 19:37:32,056][INFO] ## tokens: [CLS] 13 , 000 people receive # wild ##fires evacuation orders in california [SEP]
[2020-03-03 19:37:32,058][INFO] ## input_ids: 101 2410 1010 2199 2111 4374 1001 3748 26332 13982 4449 1999 2662 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

Epoch [1/3], Step [50/762], Loss: 0.6573, Accuracy: 62.50%
Epoch [1/3], Step [100/762], Loss: 0.3416, Accuracy: 75.00%
Epoch [1/3], Step [150/762], Loss: 0.4128, Accuracy: 87.50%
Epoch [1/3], Step [200/762], Loss: 1.1464, Accuracy: 37.50%
Epoch [1/3], Step [250/762], Loss: 0.1500, Accuracy: 100.00%
Epoch [1/3], Step [300/762], Loss: 0.2779, Accuracy: 100.00%
Epoch [1/3], Step [350/762], Loss: 0.2765, Accuracy: 87.50%
Epoch [1/3], Step [400/762], Loss: 0.1751, Accuracy: 100.00%
Epoch [1/3], Step [450/762], Loss: 0.4837, Accuracy: 75.00%
Epoch [1/3], Step [500/762], Loss: 0.4274, Accuracy: 75.00%
Epoch [1/3], Step [550/762], Loss: 0.5969, Accuracy: 87.50%
Epoch [1/3], Step [600/762], Loss: 0.5390, Accuracy: 75.00%
Epoch [1/3], Step [650/762], Loss: 0.3407, Accuracy: 87.50%
Epoch [1/3], Step [700/762], Loss: 0.2589, Accuracy: 100.00%
Epoch [1/3], Step [750/762], Loss: 0.3670, Accuracy: 87.50%
Epoch [2/3], Step [50/762], Loss: 0.1740, Accuracy: 87.50%
Epoch [2/3], Step [100/762], Loss: 0.0

[2020-03-03 19:55:51,698][INFO] ## *** Example ***
[2020-03-03 19:55:51,699][INFO] ## idx: 0
[2020-03-03 19:55:51,700][INFO] ## id: 4
[2020-03-03 19:55:51,703][INFO] ## tokens: [CLS] forest fire near la ron ##ge sas ##k . canada [SEP]
[2020-03-03 19:55:51,706][INFO] ## input_ids: 101 3224 2543 2379 2474 6902 3351 21871 2243 1012 2710 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

Validation performance on 3-th fold: 0.4303988064339809


[2020-03-03 19:55:57,507][INFO] ## *** Example ***
[2020-03-03 19:55:57,508][INFO] ## idx: 0
[2020-03-03 19:55:57,509][INFO] ## id: 1
[2020-03-03 19:55:57,512][INFO] ## tokens: [CLS] our deeds are the reason of this # earthquake may allah forgive us all [SEP]
[2020-03-03 19:55:57,514][INFO] ## input_ids: 101 2256 15616 2024 1996 3114 1997 2023 1001 8372 2089 16455 9641 2149 2035 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

Epoch [1/3], Step [50/762], Loss: 0.8951, Accuracy: 37.50%
Epoch [1/3], Step [100/762], Loss: 0.2706, Accuracy: 100.00%
Epoch [1/3], Step [150/762], Loss: 0.3589, Accuracy: 87.50%
Epoch [1/3], Step [200/762], Loss: 0.2700, Accuracy: 87.50%
Epoch [1/3], Step [250/762], Loss: 0.6355, Accuracy: 75.00%
Epoch [1/3], Step [300/762], Loss: 0.1280, Accuracy: 100.00%
Epoch [1/3], Step [350/762], Loss: 0.5207, Accuracy: 75.00%
Epoch [1/3], Step [400/762], Loss: 0.4226, Accuracy: 75.00%
Epoch [1/3], Step [450/762], Loss: 0.2924, Accuracy: 100.00%
Epoch [1/3], Step [500/762], Loss: 0.2254, Accuracy: 100.00%
Epoch [1/3], Step [550/762], Loss: 0.4133, Accuracy: 87.50%
Epoch [1/3], Step [600/762], Loss: 0.5042, Accuracy: 87.50%
Epoch [1/3], Step [650/762], Loss: 0.4459, Accuracy: 87.50%
Epoch [1/3], Step [700/762], Loss: 0.1088, Accuracy: 100.00%
Epoch [1/3], Step [750/762], Loss: 0.7340, Accuracy: 75.00%
Epoch [2/3], Step [50/762], Loss: 0.1631, Accuracy: 100.00%
Epoch [2/3], Step [100/762], Loss: 0

KeyboardInterrupt: 

<h2>Inference</h2>

In [None]:
test_examples, test_df = read_examples(test_df, is_training=False)
test_features = convert_examples_to_features(test_examples, tokenizer, MAX_SEQ_LENGTH, True)
test_input_ids = torch.tensor(select_field(test_features, 'input_ids'), dtype=torch.long)
test_input_mask = torch.tensor(select_field(test_features, 'input_mask'), dtype=torch.long)
test_segment_ids = torch.tensor(select_field(test_features, 'segment_ids'), dtype=torch.long)

test = torch.utils.data.TensorDataset(test_input_ids, test_input_mask, test_segment_ids)

test_loader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Predicting with 5 models

preds = []

for i, model in enumerate(models):
    
    test_preds = []
    
    model.eval()

    with torch.no_grad():
            for i, batch in enumerate(test_loader):
                batch = tuple(t.cuda() for t in batch)
                x_ids, x_mask, x_sids = batch
                y_pred = model(x_ids, x_mask, x_sids).detach()
                test_preds[i * BATCH_SIZE:(i + 1) * BATCH_SIZE] = F.softmax(y_pred, dim=1).cpu().numpy()  
    
    print("MODEL {}: inference done".format(i))
    
    preds.append(test_preds)

In [None]:
mean = np.mean(preds, axis=0)

In [None]:
# Averaging predictions
final_preds = np.argmax(mean, axis=1)

In [None]:
#sample = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [None]:
#sample['target'] = final_preds
#sample.to_csv('submission.csv', index=False)

In [None]:
torch.save(model.state_dict(), '/kaggle/output/working')