In [1]:
"""
Kaggle NLP Tutorial
[https://www.kaggle.com/philculliton/nlp-getting-started-tutorial/data?select=train.csv]
"""

'\nKaggle NLP Tutorial\n[https://www.kaggle.com/philculliton/nlp-getting-started-tutorial/data?select=train.csv]\n'

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [5]:
# Labelled training tweets; the path is relative to the notebook's working directory.
train_df = pd.read_csv(filepath_or_buffer="../data/tutorial/train.csv")

In [6]:
# Tweets to predict on; the path is relative to the notebook's working directory.
test_df = pd.read_csv(filepath_or_buffer="../data/tutorial/test.csv")

In [7]:
# Show the loaded training frame (columns: id, keyword, location, text, target).
# pandas truncates the rendering, so only the first and last rows appear below.
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [12]:
# The label column: 7613 integer values, one per training tweet.
train_df["target"]

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [14]:
# Boolean mask that is True wherever the label equals 0.
train_df["target"].eq(0)

0       False
1       False
2       False
3       False
4       False
        ...  
7608    False
7609    False
7610    False
7611    False
7612    False
Name: target, Length: 7613, dtype: bool

In [15]:
# Keep only the rows labelled 0 (presumably the non-disaster tweets).
train_df.loc[train_df["target"].eq(0)]

Unnamed: 0,id,keyword,location,text,target
15,23,,,What's up man?,0
16,24,,,I love fruits,0
17,25,,,Summer is lovely,0
18,26,,,My car is so fast,0
19,28,,,What a goooooooaaaaaal!!!!!!,0
...,...,...,...,...,...
7581,10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Li...,0
7582,10834,wrecked,,Cramer: Iger's 3 words that wrecked Disney's s...,0
7584,10837,,,These boxes are ready to explode! Exploding Ki...,0
7587,10841,,,Sirens everywhere!,0


In [18]:
# First row, as an object array, among the tweets labelled 0.
train_df.loc[train_df["target"].eq(0)].to_numpy()[0]

array([23, nan, nan, "What's up man?", 0], dtype=object)

In [19]:
# Second row, as an object array, among the tweets labelled 0.
train_df.loc[train_df["target"].eq(0)].to_numpy()[1]

array([24, nan, nan, 'I love fruits', 0], dtype=object)

In [20]:
# Text of the second tweet labelled 0, selected with a single .loc lookup
# (row mask and column together) instead of chained indexing.
train_df.loc[train_df["target"].eq(0), "text"].iloc[1]

'I love fruits'

In [27]:
# Text of the second tweet labelled 1, selected with a single .loc lookup
# (row mask and column together) instead of chained indexing.
train_df.loc[train_df["target"].eq(1), "text"].iloc[1]

'Forest fire near La Ronge Sask. Canada'

In [28]:
# Token-count vectorizer created with scikit-learn's default settings
# (no arguments are passed).
count_vectorizer = feature_extraction.text.CountVectorizer()

In [29]:
# get counts for a few tweets in the data

In [30]:
# Text of the first five training tweets, taken by position.
train_df["text"].iloc[:5]

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [31]:
# Fit the vectorizer on just the first five tweets to get a small, inspectable
# count matrix (one row per tweet, one column per token seen in those tweets).
first_five_tweets = train_df["text"].iloc[:5]
example_train_vectors = count_vectorizer.fit_transform(first_five_tweets)

In [32]:
# Show the sparse-matrix summary: 5 rows (example tweets) by 54 columns (tokens).
example_train_vectors

<5x54 sparse matrix of type '<class 'numpy.int64'>'
	with 61 stored elements in Compressed Sparse Row format>

In [34]:
# Printing a sparse matrix lists only its stored entries, one per line,
# in the form "(row, column)  count".
print(example_train_vectors)

  (0, 34)	1
  (0, 12)	1
  (0, 5)	1
  (0, 49)	1
  (0, 39)	1
  (0, 29)	1
  (0, 50)	1
  (0, 13)	1
  (0, 25)	1
  (0, 4)	1
  (0, 18)	1
  (0, 52)	1
  (0, 3)	1
  (1, 17)	1
  (1, 16)	1
  (1, 26)	1
  (1, 24)	1
  (1, 42)	1
  (1, 44)	1
  (1, 11)	1
  (2, 5)	2
  (2, 3)	1
  (2, 41)	1
  (2, 7)	1
  (2, 51)	1
  :	:
  (2, 32)	1
  (2, 15)	1
  (3, 21)	1
  (3, 14)	1
  (3, 32)	1
  (3, 1)	1
  (3, 0)	1
  (3, 35)	1
  (3, 40)	1
  (3, 53)	1
  (3, 10)	1
  (4, 50)	1
  (4, 53)	1
  (4, 23)	1
  (4, 20)	1
  (4, 46)	1
  (4, 36)	1
  (4, 19)	2
  (4, 43)	1
  (4, 2)	1
  (4, 6)	1
  (4, 48)	1
  (4, 38)	1
  (4, 22)	1
  (4, 45)	1


In [36]:
# Stored (non-zero) entries of row 0 only, i.e. the first example tweet.
print(example_train_vectors[0])

  (0, 34)	1
  (0, 12)	1
  (0, 5)	1
  (0, 49)	1
  (0, 39)	1
  (0, 29)	1
  (0, 50)	1
  (0, 13)	1
  (0, 25)	1
  (0, 4)	1
  (0, 18)	1
  (0, 52)	1
  (0, 3)	1


In [37]:
# we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)

In [39]:
# Densify the first tweet's row and report its shape: (1, vocabulary size).
first_tweet_dense_row = example_train_vectors[0].todense()
print(first_tweet_dense_row.shape)

(1, 54)


In [40]:
# Dense count vector of the first tweet; zeros mark tokens it does not contain.
first_tweet_counts = example_train_vectors[0].todense()
print(first_tweet_counts)

[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [41]:
"""
The above tells us that:
1. There are 54 unique words ("tokens") in the first 5 tweets.
2. The first tweet contains only some of those unique tokens - all of the non-zero counts above are the tokens that DO exist in the first tweet
"""

'\nThe above tells us that:\n1. There are 54 unique words ("tokens") in the first 5 tweets.\n2. The first tweet contains only some of those unique tokens - all of the non-zero counts above are the tokens that DO exist in the first tweet\n'

In [56]:
# Dense count vector for tweet no. 4 (row index 3): every non-zero entry is
# the count of a token that occurs in that tweet ONLY.
# Slice the sparse row first and densify just that row, instead of
# densifying the whole matrix to read a single row.
print(example_train_vectors[3].todense())

[[1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1]]


In [57]:
# Dense count vector for tweet no. 5 (row index 4): every non-zero entry is
# the count of a token that occurs in that tweet ONLY.
# Slice the sparse row first and densify just that row, instead of
# densifying the whole matrix to read a single row.
print(example_train_vectors[4].todense())

[[0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 1 0 0 0 0 1 0 1 1 0 1 0 1 0 0 1]]


In [58]:
# Dense count vector for tweet no. 3 (row index 2): every non-zero entry is
# the count of a token that occurs in that tweet ONLY.
# Slice the sparse row first and densify just that row, instead of
# densifying the whole matrix to read a single row.
print(example_train_vectors[2].todense())

[[0 0 0 1 0 2 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 2 0 0 0 0 0 1 1 0 1 1 1 1 0 0
  0 2 0 0 0 1 0 0 0 0 0 2 0 0 0 1 0 0]]


In [59]:
# Now we create vectors for all of our tweets

In [60]:
# Re-fit the same vectorizer on every training tweet. This replaces the
# vocabulary learned from the 5-tweet example earlier in the notebook.
train_vectors = count_vectorizer.fit_transform(train_df["text"])

In [61]:
"""
We are not using the .fit_transform() here.
Using just the .transform() makes sure that the tokens in the *train* 
vectors are the *only tokens* mapped to the *test* vectors.
This means that the train and the test vectors use the same set of
tokens (which makes sense).
"""
# Map the test tweets onto the vocabulary already fitted on the training text
# (transform only, no re-fit).
test_tweets = test_df["text"]
test_vectors = count_vectorizer.transform(test_tweets)

In [63]:
"""
Our Model:
According to our model, words contained in each tweet are a good 
indicator of whether they're about a real disaster or not. The
presence of a particular word (or a set of words) in a tweet might
link directly to whether or not that tweet is real.

This points to an assumption of a linear connection between tokens and
the occurrence of a disaster. Hence we will be using a linear model.
"""

"\nOur Model:\nAccording to our model, words contained in each tweet are a good \nindicator of whether they're about a real disaster or not. The\npresence of a particular word (or a set of words) in a tweet might\nlink directly to whether or not that tweet is real.\n\nThis points to an assumption of a linear connection between tokens and\nthe occurrence of a disaster. Hence we will be using a linear model.\n"

In [64]:
"""
Our vectors are really big, so we want to push our model's weights 
towards 0 without completely discounting different words. Ridge
Regression is a good way to do this.
"""
# Ridge-regularised linear classifier with default hyperparameters
# (no arguments are passed).
clf = linear_model.RidgeClassifier()

In [66]:
"""
We now need to test our model and see how well it performs on
our training data. For this we will use 'cross-validation' - where
we train on a portion of the known data, and then validate it with the
rest. If we do this several times (with different portions), we can
get a good idea for how a particular model or method performs.

The metric for this competition is 'F1', so we will use that here.
"""
# 3-fold cross-validation on the training vectors, scored with F1
# (the competition metric); yields one score per fold.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    scoring="f1",
    cv=3,
)

In [69]:
# One F1 score per cross-validation fold (cv=3, so three values).
print(scores)

[0.59421842 0.56455572 0.64149093]


In [71]:
"""
The above scores are decent for a simple baseline. Our assumption of a
linear relationship, together with the RidgeClassifier(), gives a
cross-validated F1 of roughly 0.60 (the folds range from about 0.56 to
0.64), so we can expect a similar score on the leaderboard. There are
lots of ways to improve on this, including: TF-IDF, LSA, LSTM/RNNs,
etc. I will focus on XLNet soon.

For now, we will fit the model on the full training set, predict on
the test set, and build a submission for the competition.
"""

'\nThe above scores are decent for a simple baseline. Our assumption of a\nlinear relationship, together with the RidgeClassifier(), gives a\ncross-validated F1 of roughly 0.60 (the folds range from about 0.56 to\n0.64), so we can expect a similar score on the leaderboard. There are\nlots of ways to improve on this, including: TF-IDF, LSA, LSTM/RNNs,\netc. I will focus on XLNet soon.\n\nFor now, we will fit the model on the full training set, predict on\nthe test set, and build a submission for the competition.\n'

In [73]:
# Train the classifier on all training vectors and labels; the fitted
# estimator is returned, so its repr is shown as the cell output.
clf.fit(X=train_vectors, y=train_df["target"])

RidgeClassifier()

In [74]:
# Submission template supplied with the data; the path is relative to the
# notebook's working directory.
sample_submission = pd.read_csv(
    filepath_or_buffer="../data/tutorial/sample_submission.csv"
)

In [75]:
# Inspect the submission template: an id column and a target column
# (all zeros in the rows pandas shows below).
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [79]:
# The template's target column as loaded from the CSV (3263 integer values).
sample_submission["target"]

0       0
1       0
2       0
3       0
4       0
       ..
3258    0
3259    0
3260    0
3261    0
3262    0
Name: target, Length: 3263, dtype: int64

In [83]:
# Preview the predicted labels for the test tweets; the printed array is
# abbreviated with "..." in the output below.
print(clf.predict(test_vectors))

[0 1 1 ... 1 1 0]


In [84]:
# Overwrite the template's target column with the model's test-set predictions.
# NOTE(review): assumes the rows of sample_submission are in the same order
# as the rows of test_df — confirm against the id columns.
test_predictions = clf.predict(test_vectors)
sample_submission["target"] = test_predictions

In [85]:
# Check the first rows of the submission frame after the target column was overwritten.
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [88]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv(
    path_or_buf="../data/tutorial/submission_NLPTutorial.csv",
    index=False,
)