# Description

# Contents

# Getting Data From Kaggle via API <a name="getting-kaggle-data"></a>

In [1]:
! pip install -q kaggle

In [2]:
import os

os.chdir('/content/drive/MyDrive/Hackerearth/Disaster Tweet')
os.getcwd()

'/content/drive/MyDrive/Hackerearth/Disaster Tweet'

In [None]:
from google.colab import files

files.upload()

In [5]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [9]:
! kaggle datasets download -d vstepanenko/disaster-tweets

Downloading disaster-tweets.zip to /content/drive/My Drive/Hackerearth/Disaster Tweet
  0% 0.00/656k [00:00<?, ?B/s]
100% 656k/656k [00:00<00:00, 44.0MB/s]


In [10]:
! unzip disaster-tweets.zip

Archive:  disaster-tweets.zip
  inflating: tweets.csv              


# Play Around with data <a name="play-with-data"></a>

### Explore Data

In [11]:
WORKING_DIR = '/content/drive/MyDrive/Hackerearth/Disaster Tweet'

import os
os.chdir(WORKING_DIR)

from IPython.display import Javascript  # Restrict height of output cell.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

<IPython.core.display.Javascript object>

In [12]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd 
import numpy as np
import torch 
import torch.nn as nn
import torchtext
import matplotlib.pyplot as plt


In [135]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [136]:
# imbalanced dataset, more negative samples than positive samples

df.groupby('target').count()

Unnamed: 0_level_0,id,keyword,location,text
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4342,4323,2884,4342
1,3271,3229,2196,3271


In [137]:
len(df['keyword'].unique()), len(df)

(222, 7613)

### Preprocessing using torchtext

In [20]:
from torchtext import data  

In [152]:
import re
import string 

def clean_text(text):
    text = text.lower()
    #foction de replacement
    text = re.sub(r"i'm","i am",text)
    text = re.sub(r"she's","she is",text)
    text = re.sub(r"can't","cannot",text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub(r"[-()\"#/@;:<>{}=~|.?!,]", "", text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [153]:
df['text'] = df['text'].map(clean_text)

df.to_csv('cleaned tweets.csv', index=False)

In [154]:
# Denotes the transformatioin to be applied

Text = data.Field(sequential=True, use_vocab=True, tokenize='spacy', batch_first=True)
Labels = data.LabelField(dtype=torch.float, batch_first=True)

In [155]:
# which cols to consider and how, in the dataset, and which transformation to apply

fields = {'text': ('text', Text), 'target': ('label', Labels)}

In [156]:
# loading custom dataset
training_data=data.TabularDataset(path = 'cleaned tweets.csv',format = 'csv',fields = fields)

In [157]:
import random
# for now, random split, will look for ways to stratify the split
train_data, valid_data = training_data.split(split_ratio=0.7)

In [158]:
# build vocabulary

Text.build_vocab(train_data,min_freq=3)  
Labels.build_vocab(train_data)

In [159]:
# No. of unique tokens in text
print("Size of TEXT vocabulary:",len(Text.vocab))

# No. of unique tokens in label
print("Size of LABEL vocabulary:",len(Labels.vocab))

# Commonly used words
print(Text.vocab.freqs.most_common(10))  

# Word dictionary
# print(Text.vocab.stoi) 

# Data cleaning required !!

Size of TEXT vocabulary: 3358
Size of LABEL vocabulary: 2
[(' ', 3525), ('the', 2321), ('a', 1498), ('in', 1377), ('to', 1367), ('of', 1275), ('i', 1217), ('and', 991), ('is', 683), ('for', 646)]


In [160]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 128

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [161]:

for batch in train_iterator:
    print(batch.text.shape)
    # [batch_size, max_length_of_sentence_in_batch]
    break

torch.Size([128, 16])


## Training Starts

In [162]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch
import torch.nn as nn

### Define model

In [163]:
class Rnn(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, batch_size, n_layers=1):
        super(Rnn, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim

        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=0.2, batch_first=True)
        self.norm = nn.BatchNorm1d(hidden_dim)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, 2)
        
    def forward(self, src):
        
        #src = [src len, batch size] or [batch size, seq len]
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]

        hidden = torch.squeeze(hidden, 0)
        x = self.relu(self.norm(hidden))
        return self.fc(self.dropout(x))

In [164]:
model = Rnn(input_dim=len(Text.vocab), embed_dim=100, hidden_dim=50, batch_size=BATCH_SIZE)
model = model.to(device)

In [165]:
optimizer = torch.optim.Adam(model.parameters())
criterian = nn.CrossEntropyLoss().to(device)
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, patience=3, threshold=0.01, verbose=True)

### Train

In [166]:
from tqdm.autonotebook import tqdm

for epoch in range(40):
    running_loss = 0.0
    running_accuracy = 0.0

    for batch in tqdm(train_iterator, leave=False, position=0):
        inputs, labels = batch.text, batch.label.long()
        out = model(batch.text)
        # print('Debug: ', out.shape, labels.shape, out[0])

        loss = criterian(out, labels)
        loss.backward()

        optimizer.step()

        out = torch.argmax(out, 1)
        running_loss += loss.item() * inputs.shape[0]
        running_accuracy += torch.sum(labels == out)

    scheduler.step(running_loss)
    running_accuracy /= len(train_data)
    running_loss /= len(train_data)

    print(f'\tEpoch: [{epoch+1}] Running Loss: {running_loss :.2f}, Accuracy: {running_accuracy : .2f}')

HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [1] Running Loss: 0.70, Accuracy:  0.55


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [2] Running Loss: 0.66, Accuracy:  0.60


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [3] Running Loss: 0.59, Accuracy:  0.71


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [4] Running Loss: 0.50, Accuracy:  0.76


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [5] Running Loss: 0.47, Accuracy:  0.78


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [6] Running Loss: 0.39, Accuracy:  0.84


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [7] Running Loss: 0.34, Accuracy:  0.86


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [8] Running Loss: 0.29, Accuracy:  0.88


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [9] Running Loss: 0.27, Accuracy:  0.89


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [10] Running Loss: 0.26, Accuracy:  0.90


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [11] Running Loss: 0.23, Accuracy:  0.91


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [12] Running Loss: 0.23, Accuracy:  0.91


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [13] Running Loss: 0.23, Accuracy:  0.91


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [14] Running Loss: 0.22, Accuracy:  0.92


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [15] Running Loss: 0.19, Accuracy:  0.92


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [16] Running Loss: 0.19, Accuracy:  0.92


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [17] Running Loss: 0.19, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [18] Running Loss: 0.18, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [19] Running Loss: 0.19, Accuracy:  0.92


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [20] Running Loss: 0.18, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [21] Running Loss: 0.20, Accuracy:  0.92


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [22] Running Loss: 0.18, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [23] Running Loss: 0.21, Accuracy:  0.92


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [24] Running Loss: 0.21, Accuracy:  0.92


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [25] Running Loss: 0.18, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

Epoch    26: reducing learning rate of group 0 to 1.0000e-04.
	Epoch: [26] Running Loss: 0.23, Accuracy:  0.91


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [27] Running Loss: 0.23, Accuracy:  0.91


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [28] Running Loss: 0.21, Accuracy:  0.92


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [29] Running Loss: 0.20, Accuracy:  0.92


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

Epoch    30: reducing learning rate of group 0 to 1.0000e-05.
	Epoch: [30] Running Loss: 0.19, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [31] Running Loss: 0.18, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [32] Running Loss: 0.19, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [33] Running Loss: 0.19, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [34] Running Loss: 0.17, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [35] Running Loss: 0.18, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [36] Running Loss: 0.17, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [37] Running Loss: 0.18, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

Epoch    38: reducing learning rate of group 0 to 1.0000e-06.
	Epoch: [38] Running Loss: 0.18, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [39] Running Loss: 0.17, Accuracy:  0.93


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))

	Epoch: [40] Running Loss: 0.17, Accuracy:  0.93


In [170]:
list(df['text'])[40:100]

['check these out     nsfw',
 'on the outside youre ablaze and alivebut youre dead inside',
 'had an awesome time visiting the cfc head office the ancop site and ablaze thanks to tita vida for taking care of us ',
 'soooo pumped for ablaze  southridgelife',
 'i wanted to set chicago ablaze with my preaching but not my hotel ',
 'i gained  followers in the last week you know your stats and grow with ',
 'how the west was burned thousands of wildfires ablaze in california alone ',
 'building the perfect tracklist to life leave the streets ablaze',
 'check these out     nsfw',
 'first night with retainers in its quite weird better get used to it i have to wear them every single night for the next year at least',
 'deputies man shot before brighton home set ablaze ',
 'man wife get six years jail for setting ablaze niece',
 'santa cruz \x89ûó head of the st elizabeth police superintendent lanford salmon has r    ',
 'police arsonist deliberately set black church in north carolinaåêablaze '

### Improvements

- clean the data, remove [. , # ] etc
- split the dataset such that train and test labels have same positive and negative

In [168]:
! unzip nlp-getting-started.zip

Archive:  nlp-getting-started.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 