In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import re
import nltk
import numpy as np

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import SimpleRNN
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

%matplotlib inline

In [2]:
seed = 8675309

In [3]:
tf.random.set_seed(seed)
np.random.seed(seed)

In [4]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [5]:
train_df.head(2)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1


In [6]:
test_df.head(2)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."


In [7]:
train_df.describe(include='all')

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


In [8]:
train_df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [9]:
# describe yields 10 duplicates but unique does not.
train_df["text"].unique

<bound method Series.unique of 0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object>

In [10]:
train_df.shape

(7613, 5)

In [11]:
train_df["text"][2]

"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"

In [12]:
train = train_df.values
test = test_df.values

In [13]:
train.shape

(7613, 5)

In [14]:
orig_X_train = train[:, 3]
orig_y_train = train[:, -1]
orig_x_test = test[:, 3]

In [15]:
X_train = train[:, 3]

In [16]:
y_train = train[:, -1]

In [17]:
X_test = test[:, 3]

In [18]:
#y_test does not exist

In [19]:
X_train[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [20]:
y_train[0]

1

In [21]:
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(X_train)

# we will need this later
num_words = len(tokenizer.word_index)+1
X_train = tokenizer.texts_to_sequences(X_train)

In [22]:
max_log_length = 1024
X_train_processed = sequence.pad_sequences(X_train, maxlen=max_log_length)

In [23]:
# maybe use as validation set
model_X_train, model_X_test, model_y_train, model_y_test = train_test_split(X_train_processed, y_train, test_size=0.30, random_state=seed)

In [24]:
model_X_train.shape

(5329, 1024)

In [25]:
model_X_test.shape

(2284, 1024)

In [26]:
model_y_train.shape

(5329,)

In [27]:
model_y_test.shape

(2284,)

In [28]:
model_X_train[0]

array([ 0,  0,  0, ...,  9, 10, 43], dtype=int32)

In [29]:
model_X_test[0]

array([ 0,  0,  0, ...,  2, 16, 15], dtype=int32)

In [30]:
model_y_train[0]

1

In [31]:
model_y_test[0]

0

In [32]:
rnn_model = Sequential()

In [33]:
rnn_model.add(Embedding(num_words, 32, input_length=max_log_length))

In [34]:
rnn_model.add(SimpleRNN(units=32, activation='relu'))

In [35]:
rnn_model.add(Dense(units=1, activation = 'sigmoid'))

In [39]:
rnn_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', 'f1'])

In [40]:
rnn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1024, 32)          3072      
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 32)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 5,185
Trainable params: 5,185
Non-trainable params: 0
_________________________________________________________________


In [42]:
rnn_model.fit(x=model_X_train, y=model_y_train, epochs=3, batch_size = 128, validation_split=0.25)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [None]:
rnn_model.evaluate(x=model_X_test, y=model_y_test, batch_size = 128)

In [None]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])

In [None]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

In [None]:
clf.fit(train_vectors, train_df["target"])

In [None]:
to_csv("submission.csv", index=False)