# Imports

In [1]:
from google.colab import files
import pandas as pd

# Getting competition data

In [2]:
# Installing Kaggle API
!pip install --upgrade --force-reinstall --no-deps kaggle

Collecting kaggle
  Downloading kaggle-1.6.6.tar.gz (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m51.2/84.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.6/84.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.6.6-py3-none-any.whl size=111943 sha256=90829d5bb31e96ef6d49134e4643ec6ab5ee351d89cca74829f027588993bc80
  Stored in directory: /root/.cache/pip/wheels/53/34/8c/8ca3450d17206d9e37e1ee3aeb47cbb2873d22a9e0c60eb137
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.16
  

In [3]:
# Uploading personal API Token without leaking it
files.upload();

Saving kaggle.json to kaggle.json


In [4]:
# Moving personal API Token and securing it
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Getting competition data
!kaggle competitions download -c nlp-getting-started

# Unzipping competition data
!unzip -n nlp-getting-started.zip -d nlp-getting-started

# Removing .zip downloaded file
!rm nlp-getting-started.zip

Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 93.4MB/s]
Archive:  nlp-getting-started.zip
  inflating: nlp-getting-started/sample_submission.csv  
  inflating: nlp-getting-started/test.csv  
  inflating: nlp-getting-started/train.csv  


In [5]:
# Showing competition data
!ls nlp-getting-started/

# Creating result folder
!mkdir nlp-getting-started/results

sample_submission.csv  test.csv  train.csv


# Loading and exploring data

In [6]:
# Reading data
df_train = pd.read_csv('nlp-getting-started/train.csv')
df_test = pd.read_csv('nlp-getting-started/test.csv')
submission_file_example = pd.read_csv('nlp-getting-started/sample_submission.csv')

## Shape and format

In [7]:
# Show train df shape and column types
print(df_train.shape)
df_train.dtypes

(7613, 5)


id           int64
keyword     object
location    object
text        object
target       int64
dtype: object

In [8]:
# Show test df shape and column types
print(df_test.shape)
df_test.dtypes

(3263, 4)


id           int64
keyword     object
location    object
text        object
dtype: object

# Data example

In [9]:
# Prevent pandas from truncating text columns
pd.set_option('display.max_colwidth', None)

# Show first 3 samples of train df
df_train.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1


In [10]:
# Show first 3 samples of test df
df_test.head(3)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"


In [11]:
# Show submission file format
submission_file_example.head(3)

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0


# Data curation

## Duplicates

In [12]:
# Looking for duplicated samples
print(df_train[df_train.duplicated()].shape)
print(df_test[df_test.duplicated()].shape)

(0, 5)
(0, 4)


In [13]:
# Looking for duplicated ids
print(df_train[df_train.duplicated(['id'])].shape)
print(df_test[df_test.duplicated(['id'])].shape)

(0, 5)
(0, 4)


## Nulls

In [14]:
# Nulls values per column in train df
df_train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [15]:
# Nulls values per column in test df
df_test.isna().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

Provided that I am going to work only with id and text columns, I'll remove keyword and location columns.

In [16]:
# Removing keyword and location columns
df_train = df_train.drop(['keyword', 'location'], axis=1)
df_test = df_test.drop(['keyword', 'location'], axis=1)

## Categories' balance

In [17]:
# Categories' sampler number
print(df_train['target'].value_counts())

# Categories' sampler proportion
print(df_train['target'].value_counts()/len(df_train))

0    4342
1    3271
Name: target, dtype: int64
0    0.57034
1    0.42966
Name: target, dtype: float64


The categories are slightly imbalanced. That imbalance could be solved by dropping nearly 1000 samples of 'not disaster' category. Nevertheless, for the time being, I won't delete that samples and I will keep the data as it is.

# Text processing

## Text cleaning

In [18]:
# Convert the tweets to lowercase
df_train["text_cleaned"] = df_train["text"].str.lower()

# Removing mentions and links
df_train["text_cleaned"] = df_train["text"].str.replace(r'@\w+|(http|https)://\S+|(#\w+)', '', regex=True)

# Removing unnecessary whitespaces
df_train["text_cleaned"] = df_train["text_cleaned"].str.strip()

In [19]:
# Show 10 random samples to see how text cleaning is going
df_train.sample(10)

Unnamed: 0,id,text,target,text_cleaned
940,1361,Vanessa's game has officially blown up. LADIES AND GENTLEMEN...the real show is about to begin. #BB17,0,Vanessa's game has officially blown up. LADIES AND GENTLEMEN...the real show is about to begin.
2744,3947,Obama Declares Disaster for Typhoon-Devastated Saipan: Obama signs disaster declaration for Northern Marians a... http://t.co/XDt4VHFn7B,1,Obama Declares Disaster for Typhoon-Devastated Saipan: Obama signs disaster declaration for Northern Marians a...
388,561,The Sound of Arson,0,The Sound of Arson
4743,6746,I wish that the earth sea and sky up above\nwould send me someone to lava????,0,I wish that the earth sea and sky up above\nwould send me someone to lava????
3342,4786,ALERT! Sandy Hook Elementary School Evacuated After Û÷Bomb ThreatÛª http://t.co/LwLexXjUS8,1,ALERT! Sandy Hook Elementary School Evacuated After Û÷Bomb ThreatÛª
1982,2853,@TheLegendBlue @Cozmo23 they'll probably allow us to ascend them but not get them to the damage max values,0,they'll probably allow us to ascend them but not get them to the damage max values
1728,2491,2 Cars Collide 1 Crashes Into Building: Two cars collided at an intersection and that sent one vehicle crashingÛ_ http://t.co/TpUu3eaTB3,1,2 Cars Collide 1 Crashes Into Building: Two cars collided at an intersection and that sent one vehicle crashingÛ_
2470,3540,#ModiMinistry Railway Minister Prabhu calls MP derailment a natural calamity http://t.co/tL41olpAkZ,1,Railway Minister Prabhu calls MP derailment a natural calamity
2396,3447,im tired of all these #AllLivesMatter people. they only say this to derail #blacklivesmatter they dont do anything for 'all lives' lmfao,0,im tired of all these people. they only say this to derail they dont do anything for 'all lives' lmfao
806,1170,@anellatulip and put the taint there and that all that the magisters did was to open the gates and let the blight get away from it,0,and put the taint there and that all that the magisters did was to open the gates and let the blight get away from it
