In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Uncomment for first use
# nltk.download('punkt')
# nltk.download('stopwords')

In [2]:
df=pd.read_csv('../data/train.csv')

**Taget : Identify wheter the tweet is about a real disaster or not**

# Basic Data Overview

In [3]:
df.head(50)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [4]:
df.shape

(7613, 5)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
df['keyword'].nunique()

221

In [7]:
df['location'].nunique()

3341

**First observations**
* We have 5 columns and 7613 rows
* We have some missing values in location and keyword column
* We have 221 unique keywords and more than 3000 unique locations
* Locations are in different format in some rows

# EDA

**1. TEXT CLEANING**
* convert to lower case
* delete punctuation 
* remove https links
* remove stopwords (a, the, an ...)
+ ...

In [8]:
cleaned_text_df = df.copy()
cleaned_text_df['text'] = cleaned_text_df['text'].apply(lambda x : " ".join(word.lower() for word in x.split()))  #to lower case
cleaned_text_df['text'] = cleaned_text_df['text'].apply(lambda line: re.sub(r'[^\w\s]', repl='', string=line)) #removing punctuation
cleaned_text_df['text']


0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       13000 people receive wildfires evacuation orde...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    aria_ahrary thetawniest the out of control wil...
7610    m194 0104 utc5km s of volcano hawaii httptcozd...
7611    police investigating after an ebike collided w...
7612    the latest more homes razed by northern califo...
Name: text, Length: 7613, dtype: object

In [9]:
# Removing https links 
def remove_links(text):
    # Define the pattern for detecting URLs
    url_pattern = r'https\S+'
    
    # Replace URLs with an empty string
    text_without_links = re.sub(url_pattern, '', text)
    
    return text_without_links

cleaned_text_df['text']=cleaned_text_df['text'].apply(remove_links)

In [13]:
# Tokenizing the whole sentence
def tokenize_sentence(sentence):
    return word_tokenize(sentence)

# Applying tokenization to our dataset
cleaned_text_df['tokenized_text'] = cleaned_text_df['text'].apply(tokenize_sentence)

# Assigning all stop words
stop_words = set(stopwords.words('english'))

# Filtering all stop words in each sentence
def filter_stop_words(tokenized_tweet):
    return [word for word in tokenized_tweet if word not in stop_words]

# Applying changes to the 'tokenized_tweet' column
cleaned_text_df['tokenized_text'] = cleaned_text_df['tokenized_text'].apply(filter_stop_words)

# Joining the tokenized words back into sentences
cleaned_text_df['text'] = cleaned_text_df['tokenized_text'].apply(lambda x: ' '.join(x))

cleaned_text_df['tokenized_text']

0       [deeds, reason, earthquake, may, allah, forgiv...
1           [forest, fire, near, la, ronge, sask, canada]
2       [residents, asked, shelter, place, notified, o...
3       [13000, people, receive, wildfires, evacuation...
4       [got, sent, photo, ruby, alaska, smoke, wildfi...
                              ...                        
7608    [two, giant, cranes, holding, bridge, collapse...
7609    [aria_ahrary, thetawniest, control, wild, fire...
7610    [m194, 0104, utc5km, volcano, hawaii, httptcoz...
7611    [police, investigating, ebike, collided, car, ...
7612    [latest, homes, razed, northern, california, w...
Name: tokenized_text, Length: 7613, dtype: object