# Data Exploration

In [1]:
import os
import pandas as pd

In [2]:
# Dataset directory: a "data" folder under the current working directory,
# so the value depends on where the kernel was started.
DATA_PATH = os.path.join(os.getcwd(), "data")

In [3]:
def load_tweet_data(data_path=DATA_PATH):
    """Read the training tweets into a DataFrame.

    Parameters
    ----------
    data_path : str, optional
        Directory that contains ``train.csv``. Defaults to ``DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from ``<data_path>/train.csv``.
    """
    return pd.read_csv(os.path.join(data_path, "train.csv"))

In [4]:
# Load train.csv from the default DATA_PATH; the cells below explore and split this frame.
tweet_data = load_tweet_data()

In [5]:
# Peek at the first rows to see which columns are available.
tweet_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# Column dtypes and non-null counts, to spot which columns have missing values.
tweet_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [7]:
# Class balance of the target column.
tweet_data["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [8]:
# Frequency of each keyword; note that spaces are URL-encoded (e.g. "forest%20fire").
tweet_data["keyword"].value_counts()

fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [9]:
# Frequency of each location; values are free text, so spelling variants
# (e.g. "USA" / "United States") are counted as separate categories.
tweet_data["location"].value_counts()

USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: location, Length: 3341, dtype: int64

In [10]:
# Summary statistics of the numeric columns (id and target).
tweet_data.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [11]:
# Number of distinct keywords (missing values are not counted by default).
tweet_data["keyword"].nunique()

221

In [12]:
# Number of distinct locations (missing values are not counted by default).
tweet_data["location"].nunique()

3341

In [13]:
from sklearn.model_selection import StratifiedShuffleSplit

# Hold out 20% of the rows as a test set, stratified on `target` so both parts
# keep roughly the same class ratio; the fixed random_state makes the split repeatable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=84)
# n_splits=1 yields exactly one (train, test) pair, so take it directly instead of looping.
train_index, test_index = next(split.split(tweet_data, tweet_data["target"]))
# split() returns positional indices, so select with .iloc; .loc would only be
# correct while the frame keeps its default 0..n-1 index.
train_set = tweet_data.iloc[train_index]
test_set = tweet_data.iloc[test_index]

In [14]:
# Sanity-check the split sizes: training rows first, then test rows, one per line.
print(len(train_set), len(test_set), sep="\n")

6090
1523


In [27]:
# Work on a copy so the exploration below leaves train_set untouched.
train_tweet = train_set.copy()

In [32]:
# Drop the raw tweet text, keeping the id, keyword, location and target columns.
# Derive from train_set rather than from train_tweet itself so the cell can be
# re-run: dropping 'text' from a frame that has already lost it raises KeyError.
# drop() returns a new frame, so train_set is not modified.
train_tweet = train_set.drop(columns="text")

In [33]:
# Look at the two categorical columns on their own; both contain missing values.
train_tweet_cat = train_tweet[["keyword", "location"]]
train_tweet_cat.head(10)

Unnamed: 0,keyword,location
3820,first%20responders,"Nashville, TN"
3244,engulfed,
3896,flattened,
3624,fatalities,"Chamblee, Georgia"
5461,police,Houston
1855,crush,
1482,catastrophe,
1255,buildings%20on%20fire,"Roanoke, VA"
3215,emergency%20services,Auckland
2943,drowned,


In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two categorical columns. handle_unknown="ignore" makes
# transform() emit an all-zero group for a category that was not seen during
# fit instead of raising; with thousands of distinct locations, held-out rows
# are bound to contain unseen values.
# remainder="passthrough" appends the untouched columns (id, target) after the
# encoded ones, so the label ends up as the last column of the result. That is
# convenient for the correlation check further down, but the matrix must not be
# fed to a model as-is because it contains the target.
full_pipeline = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["keyword", "location"])
], remainder="passthrough")

train_tweet_prepared = full_pipeline.fit_transform(train_tweet)

In [37]:
# Inspect the transformer output: its shape, storage format and number of stored values.
train_tweet_prepared

<6090x2991 sparse matrix of type '<class 'numpy.float64'>'
	with 20887 stored elements in Compressed Sparse Row format>

In [39]:
# NOTE(review): scipy.sparse is not referenced by name in this cell.
import scipy.sparse
# Wrap the sparse matrix in a DataFrame with sparse columns so pandas methods
# can be used on it. No column names are passed, so columns are labelled by
# integer position; later cells rely on that (e.g. column 2990).
composed_train_set = pd.DataFrame.sparse.from_spmatrix(train_tweet_prepared)

In [40]:
# The last two columns are the passthrough columns: 2989 holds the id values
# and 2990 the 0/1 target.
composed_train_set.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2981,2982,2983,2984,2985,2986,2987,2988,2989,2990
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5430.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4661.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5540.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5172.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7789.0,1.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2667.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2136.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1811.0,1.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4611.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4232.0,0.0


In [42]:
# Column 2990 is the passthrough target; its counts give the class balance of the training split.
composed_train_set[2990].value_counts()

0.0    3473
1.0    2617
Name: 2990, dtype: int64

In [45]:
# Pairwise correlation between all columns (one-hot features, id and target).
corr_matrix = composed_train_set.corr()

In [46]:
# Correlation of every column with the target (column 2990), strongest positive first.
corr_matrix[2990].sort_values(ascending=False)

2990    1.000000
70      0.086317
153     0.083726
205     0.083196
219     0.081054
          ...   
27     -0.052935
220    -0.053958
170    -0.054601
2      -0.055732
29     -0.059555
Name: 2990, Length: 2991, dtype: float64