In [1]:

import numpy as np
import pandas as pd

In [30]:
df = pd.read_csv('Train.csv')
dftest = pd.read_csv('Test.csv')

In [31]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [33]:
df = df.dropna()

In [34]:
X = df['text']
Y = df['target']
Xtest = dftest['text']

In [35]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Linta.Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
import re
from nltk.corpus import stopwords

stopwords_eng = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower() # convert to lowercase
    text = re.sub("[^a-z]", " ", text)
    words = [word for word in text.split() if word not in stopwords_eng]
    text = " ".join(words)
    return text

X_train = X.map(clean_text)
X_test = Xtest.map(clean_text)
X_train.head()

31    bbcmtd wholesale markets ablaze http co lhyxeo...
32    always try bring heavy metal rt http co yao e ...
33    africanbaze breaking news nigeria flag set abl...
34                                    crying set ablaze
35    plus side look sky last night ablaze http co q...
Name: text, dtype: object

In [38]:
# lemmatization
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
def lem_word(x):
    return [lem.lemmatize(w) for w in x]

In [39]:
X_train = X_train.apply(lem_word)
X_test = X_test.apply(lem_word)

In [40]:
from keras.preprocessing.text import Tokenizer
num_words = 6000
tokenizer = Tokenizer(num_words=num_words)

tokenizer.fit_on_texts(X_train)

X_train_seq = pd.Series(tokenizer.texts_to_sequences(X_train))
X_test_seq = pd.Series(tokenizer.texts_to_sequences(X_test))
X_train_seq.head()

0    [18, 18, 11, 15, 3, 12, 1, 20, 13, 5, 10, 2, 7...
1    [4, 10, 20, 4, 19, 7, 1, 3, 8, 19, 1, 18, 8, 6...
2    [4, 21, 8, 6, 11, 4, 9, 18, 4, 26, 2, 1, 18, 8...
3    [11, 8, 19, 6, 9, 17, 1, 7, 2, 3, 1, 4, 18, 10...
4    [14, 10, 16, 7, 1, 7, 6, 12, 2, 1, 10, 5, 5, 2...
dtype: object

In [41]:
from keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(X_train_seq, maxlen=512)
X_test_pad = pad_sequences(X_test_seq, maxlen=512)

In [42]:
df['target'].values

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [98]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, GlobalMaxPool1D, Dropout

model = Sequential()

model.add(Embedding(input_dim=num_words, output_dim=64))
model.add(LSTM(32, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.summary()


Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, None, 64)          384000    
_________________________________________________________________
lstm_11 (LSTM)               (None, None, 32)          12416     
_________________________________________________________________
global_max_pooling1d_13 (Glo (None, 32)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 16)                528       
_________________________________________________________________
dropout_12 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_24 (Dense)             (None, 16)                272       
_________________________________________________________________
dropout_13 (Dropout)         (None, 16)              

In [99]:
model.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])

batch_size = 32
epochs = 6
validation_split = 0.02
model.fit(x=X_train_pad, y=Y, batch_size=batch_size, epochs=epochs, validation_split=validation_split)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x2030704f4a8>

In [100]:
pred = model.predict(X_test_pad)

In [101]:
pred

array([[0.26226926],
       [0.2590052 ],
       [0.32551008],
       ...,
       [0.4647752 ],
       [0.49597627],
       [0.32073465]], dtype=float32)

In [112]:
y_pred = (pred >= 0.4) * 1

In [113]:
y_pred == 1

array([[False],
       [False],
       [ True],
       ...,
       [ True],
       [ True],
       [ True]])

In [114]:
dftest.id.values.shape

(3263,)

In [115]:
y_pred[:,0].shape

(3263,)

In [116]:
predictions = np.vstack((dftest.id.values,y_pred[:,0]))

In [117]:
predictions.shape

(2, 3263)

In [118]:
results = pd.DataFrame({ "id": dftest.id.values,"target": y_pred[:,0]})
results.to_csv("submission.csv")

In [119]:
results.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,0
4,11,0
