# Imports

In [1]:
# Get and load data
from google.colab import files
import pandas as pd

# Clean and process text
from bs4 import BeautifulSoup
import re

# Getting competition data

In [2]:
# Installing Kaggle API
!pip install --upgrade --force-reinstall --no-deps kaggle

Collecting kaggle
  Downloading kaggle-1.6.12.tar.gz (79 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/79.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m71.7/79.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.7/79.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.6.12-py3-none-any.whl size=102969 sha256=c04ed31ae00e5b8e599254b94573daae5bb2b0ed367d0e5cf1543b8cffd3d93e
  Stored in directory: /root/.cache/pip/wheels/1e/0b/7c/50f8e89c3d2f82838dbd7afeddffbb9357003009ada98216c7
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.16


In [3]:
# Uploading personal API Token without leaking it: the kaggle.json file is
# provided through Colab's upload dialog, so the credentials are never typed
# into a notebook cell. The trailing ';' suppresses the returned value's display.
files.upload();

Saving kaggle.json to kaggle.json


In [4]:
# Moving personal API Token and securing it
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Getting competition data
!kaggle competitions download -c nlp-getting-started

# Unzipping competition data
!unzip -n nlp-getting-started.zip -d nlp-getting-started

# Removing .zip downloaded file
!rm nlp-getting-started.zip

Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 85.1MB/s]
Archive:  nlp-getting-started.zip
  inflating: nlp-getting-started/sample_submission.csv  
  inflating: nlp-getting-started/test.csv  
  inflating: nlp-getting-started/train.csv  


In [5]:
# Showing competition data
!ls nlp-getting-started/

# Creating result folder
!mkdir nlp-getting-started/results

sample_submission.csv  test.csv  train.csv


# Loading and exploring data

In [6]:
# Reading data: the labelled training set, the unlabelled test set, and the
# sample file that shows the expected submission format.
df_train = pd.read_csv('nlp-getting-started/train.csv')
df_test = pd.read_csv('nlp-getting-started/test.csv')
submission_file_example = pd.read_csv('nlp-getting-started/sample_submission.csv')

## Shape and format

In [7]:
# Show train df shape (rows, columns) and column types.
# The shape is printed explicitly because only the cell's last expression
# (dtypes) is displayed automatically.
print(df_train.shape)
df_train.dtypes

(7613, 5)


id           int64
keyword     object
location    object
text        object
target       int64
dtype: object

In [8]:
# Show test df shape (rows, columns) and column types.
# The shape is printed explicitly because only the cell's last expression
# (dtypes) is displayed automatically.
print(df_test.shape)
df_test.dtypes

(3263, 4)


id           int64
keyword     object
location    object
text        object
dtype: object

# Data example

In [9]:
# Prevent pandas from truncating text columns so each tweet is shown in full.
# This is a global display option: it affects every later cell as well.
pd.set_option('display.max_colwidth', None)

# Show first 3 samples of train df
df_train.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1


In [10]:
# Show first 3 samples of test df (same columns as train, minus the target label)
df_test.head(3)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"


In [11]:
# Show submission file format: one row per test id with its predicted target
submission_file_example.head(3)

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0


# Data curation

## Duplicates

In [12]:
# Looking for duplicated samples, i.e. rows that repeat an earlier row in every column.
# Each printed shape is (number of duplicated rows, number of columns).
print(df_train[df_train.duplicated()].shape)
print(df_test[df_test.duplicated()].shape)

(0, 5)
(0, 4)


In [13]:
# Looking for duplicated ids, i.e. rows whose id repeats an earlier row's id.
# Each printed shape is (number of such rows, number of columns).
print(df_train[df_train.duplicated(['id'])].shape)
print(df_test[df_test.duplicated(['id'])].shape)

(0, 5)
(0, 4)


## Nulls

In [14]:
# Null values per column in train df
df_train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [15]:
# Null values per column in test df
df_test.isna().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

## Whitespace tweets

In [16]:
# Function for looking for whitespace tweets
def look_for_whitespace_tweets(df):
  """Return the index labels of tweets made up solely of whitespace.

  Args:
    df: DataFrame with a 'text' column holding the tweets.

  Returns:
    List of index labels of `df` whose 'text' value is a string consisting
    only of whitespace characters. Non-string values (e.g. NaN) and empty
    strings are not reported.
  """
  # Scan the frame that was passed in (not a global one), and only its text
  # column, so the same function works for both the train and the test set.
  return [
      idx
      for idx, text in df['text'].items()
      if isinstance(text, str) and text.isspace()
  ]

In [17]:
# Detect tweets which are only whitespace; each result is a list of row indexes
whitespace_tweets_train = look_for_whitespace_tweets(df_train)
whitespace_tweets_test = look_for_whitespace_tweets(df_test)

# Show train and test whitespace tweets (an empty list means none were found)
print('Train whitespace tweets: ', whitespace_tweets_train)
print('Test whitespace tweets: ', whitespace_tweets_test)

Train whitespace tweets:  []
Test whitespace tweets:  []


Since I am going to work only with the id and text columns (plus the target label in the training set), I'll remove the keyword and location columns.

In [18]:
# Removing keyword and location columns.
# errors='ignore' makes the cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
df_train = df_train.drop(columns=['keyword', 'location'], errors='ignore')
df_test = df_test.drop(columns=['keyword', 'location'], errors='ignore')

## Categories' balance

In [19]:
# Number of samples per category
print(df_train['target'].value_counts())

# Proportion of samples per category (counts divided by the total number of rows)
print(df_train['target'].value_counts()/len(df_train))

target
0    4342
1    3271
Name: count, dtype: int64
target
0    0.57034
1    0.42966
Name: count, dtype: float64


The categories are slightly imbalanced. That imbalance could be corrected by dropping nearly 1000 samples of the 'not disaster' category (target 0). Nevertheless, for the time being, I won't delete those samples and I will keep the data as it is.

# Text cleaning

## HTML remnants
If the tweets come from the Twitter API, they may contain remnants of HTML that should be removed before performing other text cleaning and processing tasks.

In [20]:
# Function to remove HTML tags from tweets
def remove_html_tags(tweet):
  """Parse `tweet` as HTML and return only its text content.

  Args:
    tweet: Raw tweet text, possibly containing HTML markup.

  Returns:
    The text extracted from the parsed markup.
  """
  soup = BeautifulSoup(tweet, 'html.parser')
  return soup.get_text()

In [21]:
# Remove HTML tags from tweets. The result goes into a new 'text_cleaned'
# column so the original 'text' column stays untouched for comparison.
# NOTE(review): only df_train is cleaned in this section; df_test will
# presumably need the same steps before prediction — confirm.
df_train['text_cleaned'] = df_train['text'].apply(remove_html_tags)

  return BeautifulSoup(tweet, 'html.parser').get_text()


## Mentions
I will remove mentions (@username) because I don't think they add value.

In [22]:
# Function to remove mentions from tweets
def remove_mentions(tweet):
  """Delete every @mention ('@' followed by word characters) from `tweet`.

  Whitespace around a removed mention is left as it was.

  Args:
    tweet: Tweet text.

  Returns:
    The tweet text without its mentions.
  """
  mention_pattern = re.compile(r'@[\w]+')
  return mention_pattern.sub('', tweet)

In [23]:
# Remove mentions from tweets (operates on the already HTML-stripped column)
df_train['text_cleaned'] = df_train['text_cleaned'].apply(remove_mentions)

## Links
I will also remove links.

In [24]:
# Function to remove URLs from tweets
def remove_urls(tweet):
  """Delete every http:// or https:// URL from `tweet`.

  A URL is matched from its scheme up to the next whitespace character.

  Args:
    tweet: Tweet text.

  Returns:
    The tweet text without its URLs.
  """
  return re.sub(pattern=r'https?://\S+', repl='', string=tweet)

In [25]:
# Remove URLs from tweets (continues refining the 'text_cleaned' column)
df_train['text_cleaned'] = df_train['text_cleaned'].apply(remove_urls)

## Hashtags
Hashtags may contain crucial information. Thus, I will keep them, but without the # symbol.

In [26]:
# Function to extract hashtags from tweets
def extract_hashtags(tweet):
  """Replace each hashtag in `tweet` with its bare word (drop the leading '#').

  A '#' that is not directly followed by a word character is left as is.

  Args:
    tweet: Tweet text.

  Returns:
    The tweet text with '#word' turned into 'word'.
  """
  hashtag_pattern = re.compile(r'#(\w+)')
  without_hash = hashtag_pattern.sub(r'\1', tweet)
  return without_hash

In [27]:
# Extract hashtags from tweets: keep the hashtag word, drop the '#' symbol
df_train['text_cleaned'] = df_train['text_cleaned'].apply(extract_hashtags)

## Special characters
Now that I have kept the hashtag information, I no longer need punctuation and special characters, so I'll remove them.

In [28]:
# Function to remove special characters and punctuation from tweets
def remove_special_characters(tweet):
  """Keep only ASCII letters, digits and whitespace in `tweet`.

  Every other character (punctuation, symbols, accented letters, ...) is
  deleted without inserting a replacement, so 'e-bike' becomes 'ebike'.

  Args:
    tweet: Tweet text.

  Returns:
    The tweet text with all other characters removed.
  """
  disallowed = re.compile(r'[^a-zA-Z0-9\s]')
  return disallowed.sub('', tweet)

In [29]:
# Remove special characters and punctuation from tweets
df_train['text_cleaned'] = df_train['text_cleaned'].apply(remove_special_characters)

In [30]:
# Inspect the result: original 'text' next to the cleaned 'text_cleaned' column
df_train

Unnamed: 0,id,text,target,text_cleaned
0,1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,Our Deeds are the Reason of this earthquake May ALLAH Forgive us all
1,4,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada
2,5,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,All residents asked to shelter in place are being notified by officers No other evacuation or shelter in place orders are expected
3,6,"13,000 people receive #wildfires evacuation orders in California",1,13000 people receive wildfires evacuation orders in California
4,7,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,Just got sent this photo from Ruby Alaska as smoke from wildfires pours into a school
...,...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1,Two giant cranes holding a bridge collapse into nearby homes
7609,10870,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1,The out of control wild fires in California even in the Northern part of the state Very troubling
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1,M194 0104 UTC5km S of Volcano Hawaii
7611,10872,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1,Police investigating after an ebike collided with a car in Little Portugal Ebike rider suffered serious nonlife threatening injuries


## Numbers

## Lowercase

## Blank spaces

# Text processing

## Tokenization

## Spelling

## Stopwords

## Lemmatization

## Stemming