In [2]:
# Competition: https://www.kaggle.com/c/nlp-getting-started/data?select=train.cs

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import argparse

# Parameters

In [4]:
DATASET_ROOT = './tweets/'
TRAIN_PATH = os.path.join(DATASET_ROOT, 'train.csv')
TEST_PATH = os.path.join(DATASET_ROOT, 'test.csv')

args = argparse.Namespace(
    
)

In [5]:
train_df = pd.read_csv(TRAIN_PATH, index_col='id')

train_df.head()


Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
test_df = pd.read_csv(TEST_PATH, index_col = 'id')

test_df.head()

Unnamed: 0_level_0,keyword,location,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,Just happened a terrible car crash
2,,,"Heard about #earthquake is different cities, s..."
3,,,"there is a forest fire at spot pond, geese are..."
9,,,Apocalypse lighting. #Spokane #wildfires
11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Data shape and missing values

In [7]:
print('Train shape ', train_df.shape)
print('Test shape ', test_df.shape)

print("\nTrain dataset missing values")
print(train_df.isnull().sum())
print('\nUnique values in location')

train_values = train_df['location'].value_counts(dropna = True, sort = True)
print(train_values)

print('\nUnique values in keywords')
print(train_df['keyword'].value_counts(dropna = True, sort = True))

print("\nTest dataset missing values")
print(test_df.isnull().sum())

print()


Train shape  (7613, 4)
Test shape  (3263, 3)

Train dataset missing values
keyword       61
location    2533
text           0
target         0
dtype: int64

Unique values in location
USA                         104
New York                     71
United States                50
London                       45
Canada                       29
                           ... 
Somewhere out there           1
? Jet Life ?                  1
Louavul, KY                   1
Land Of The Kings             1
THE WORLD T.G.G / M.M.M       1
Name: location, Length: 3341, dtype: int64

Unique values in keywords
fatalities               45
deluge                   42
armageddon               42
harm                     41
body%20bags              41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

Test dataset missing values
keyword       26


# Process the tweets

In [15]:
import re
import random

def clean_text (text):
    text = re.sub(r"([^a-zA-Z']+)|(https?:\S+)|(www.\S+)",  r" ", text)
    return text

r = random.randint(0, train_df.shape[0])

sample = train_df['text'].iloc[r]

print('Original sample: ', sample)

sample = clean_text(sample)

print('Sample cleaned: ', sample)



Original sample:  WWI WWII JAPANESE ARMY NAVY MILITARY JAPAN LEATHER WATCH WAR MIDO WW1 2 - Full read by eBay http://t.co/obfD7e4QcP http://t.co/yAZjE5OwVk
Sample cleaned:  WWI WWII JAPANESE ARMY NAVY MILITARY JAPAN LEATHER WATCH WAR MIDO WW Full read by eBay    


In [17]:
train_df['text'].apply(clean_text)

test_df['text'].apply(clean_text)

id
0                       Just happened a terrible car crash
2        Heard about earthquake is different cities sta...
3        there is a forest fire at spot pond geese are ...
9                    Apocalypse lighting Spokane wildfires
11              Typhoon Soudelor kills in China and Taiwan
                               ...                        
10861    EARTHQUAKE SAFETY LOS ANGELES SAFETY FASTENERS...
10865    Storm in RI worse than last hurricane My city ...
10868                   Green Line derailment in Chicago  
10874           MEG issues Hazardous Weather Outlook HWO  
10875     CityofCalgary has activated its Municipal Eme...
Name: text, Length: 3263, dtype: object

# Vectorize the text

In [18]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [22]:
#Process training first, test set will be processed the same way but it comes later
count_vectorizer = feature_extraction.text.CountVectorizer()

train_vectors = count_vectorizer.fit_transform(train_df['text'])

print('Train vector shape ', train_vectors.shape)

Train vector shape  (7613, 21637)


In [37]:
r = random.randint(0, train_vectors.shape[0])
print('Converted \n', train_vectors[r])
print(type(train_vectors[r]))
print('\n Original training sample \n', train_df.iloc[r])

Converted 
   (0, 2192)	1
  (0, 1984)	1
  (0, 18659)	1
  (0, 12929)	1
  (0, 11176)	1
  (0, 9304)	1
  (0, 4517)	1
  (0, 12191)	2
  (0, 2358)	1
  (0, 19703)	1
  (0, 20770)	1
  (0, 6195)	1
  (0, 18828)	1
  (0, 20369)	1
  (0, 4273)	1
  (0, 19335)	1
<class 'scipy.sparse.csr.csr_matrix'>

 Original training sample 
 keyword                                                 drown
location                                        @notoriousD12
text        Throw that water at me until I drown and my la...
target                                                      0
Name: 4159, dtype: object


# Building models 

In [38]:
from sklearn import model_selection, linear_model


In [40]:
clf = linear_model.RidgeClassifier()

scores = model_selection.cross_val_score(clf, train_vectors, train_df['target'], cv = 10, scoring='f1')

print(scores)



[0.6148532  0.48       0.44267516 0.44748858 0.52058824 0.56426332
 0.49911504 0.47670251 0.62611276 0.69101124]


# NOTE!! DO NOT TOUCH THE TEST SET BEFORE YOU'RE CONFIDENT

In [41]:
#Final validation here