In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
pd.set_option('display.max_colwidth', None)

# First approach - CountVectorizer (model: Lineal)

In [None]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(train_df.shape)
print(test_df.shape)

(7613, 5)
(3263, 4)


In [None]:
#What CountVectorizer does is to create a "vocabulary list" with all the words used in all the given data (all the words of all sencente: twitts) in this case it makes 
# a total of 21637 words. Afterwards the function compares this vocabulary list with every sentence and set 1 when the word of the sentence appears in the vocabulary
# list (it can be 2 if the word appear twice and so on). This means we will have for every sentence a lenth of 21637.

In [None]:
count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:5])

In [None]:
#we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print('this is the first sentence:  ',train_df["text"][0])
print('\nthese are the dimensions of the vector of the first sentence: ',example_train_vectors[0].todense().shape)
print('\nthis is the vector for the first sentence: \n',example_train_vectors[0].todense())
print('\nthis is the vocabulary list for the 5 first sentences (same dimensions that vector): \n',count_vectorizer.vocabulary_)

this is the first sentence:   Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all

these are the dimensions of the vector of the first sentence:  (1, 54)

this is the vector for the first sentence: 
 [[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]

this is the vocabulary list for the 5 first sentences (same dimensions that vector): 
 {'our': 34, 'deeds': 12, 'are': 5, 'the': 49, 'reason': 39, 'of': 29, 'this': 50, 'earthquake': 13, 'may': 25, 'allah': 4, 'forgive': 18, 'us': 52, 'all': 3, 'forest': 17, 'fire': 16, 'near': 26, 'la': 24, 'ronge': 42, 'sask': 44, 'canada': 11, 'residents': 41, 'asked': 7, 'to': 51, 'shelter': 47, 'in': 21, 'place': 37, 'being': 8, 'notified': 28, 'by': 9, 'officers': 30, 'no': 27, 'other': 33, 'evacuation': 14, 'or': 31, 'orders': 32, 'expected': 15, '13': 1, '000': 0, 'people': 35, 'receive': 40, 'wildfires': 53, 'california': 10, 'just': 23, 'got': 20, 'sent': 46, 'photo

In [None]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])

In [None]:
print(train_vectors.todense().shape)
print(test_vectors.todense().shape)

(7613, 21637)
(3263, 21637)


In [None]:
print(train_vectors[0].todense().shape)
print(train_vectors[0].todense())
#print('\nthis is the vocabulary list all sentences (same dimensions that vector): \n',count_vectorizer.vocabulary_)

(1, 21637)
[[0 0 0 ... 0 0 0]]


In [None]:
count_vectorizer.transform(test_df["text"]).todense().shape

(3263, 21637)

In [None]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()

In [None]:
print(train_vectors.shape)
print(train_df["target"].shape)

(7613, 21637)
(7613,)


In [None]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.59453669, 0.56455572, 0.64082434])

In [None]:
clf.fit(train_vectors, train_df["target"])

RidgeClassifier()

In [None]:
sample_submission = pd.read_csv("sample_submission.csv")


In [None]:
sample_submission["target"] = clf.predict(test_vectors)

In [None]:
sample_submission["target"]

0       0
1       1
2       1
3       0
4       1
       ..
3258    1
3259    1
3260    1
3261    1
3262    0
Name: target, Length: 3263, dtype: int64

In [None]:
sample_submission.to_csv("sample_submission_CV.csv",index=False)

In [None]:
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


# Second approach - TF-IDF (model: Lineal)

In [None]:
# What TF-IDF does is to create a "vocabulary vector" with all the words used in all the given data (all the words of all sencente: twitts) in this case it makes 
# a total of 21637 words. Afterwards the function compares this "vocabulary vector" with every sentence and set a number of importance (IDF value)
# to every word of the "vocabulary vector" in the respect of the analysed sentence

In [None]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(train_df.shape)
print(test_df.shape)

(7613, 5)
(3263, 4)


In [None]:
tfIdfVectorizer=TfidfVectorizer(use_idf=True)
train_vectors = tfIdfVectorizer.fit_transform(train_df["text"])
print('train_vectors: ',train_vectors.shape)
test_vectors = tfIdfVectorizer.transform(test_df["text"])
print('test_vectors: ',test_vectors.shape)
#a=tfIdf[0].todense()
#print(a)

train_vectors:  (7613, 21637)
test_vectors:  (3263, 21637)


In [None]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

##### Linear model

In [None]:
clf = linear_model.RidgeClassifier()

In [None]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.63366337, 0.6122449 , 0.68442211])

In [None]:
clf.fit(train_vectors, train_df["target"])

RidgeClassifier()

In [None]:
sample_submission = pd.read_csv("sample_submission.csv")

In [None]:
sample_submission["target"] = clf.predict(test_vectors)

In [None]:
clf.predict(test_vectors)

array([1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,

In [None]:
sample_submission.to_csv("sample_submission_TFIDF.csv",index=False)

# Third approach - TF-IDF (model: Searching best model with Lazypredict )


In [None]:
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X = pd.DataFrame(train_vectors.todense())
y = pd.DataFrame(train_df["target"])

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.2,random_state =123)

In [None]:
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████| 29/29 [2:00:29<00:00, 249.29s/it]  

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
BernoulliNB                        0.81               0.79     0.79      0.80   
NearestCentroid                    0.80               0.78     0.78      0.79   
LGBMClassifier                     0.79               0.77     0.77      0.79   
ExtraTreesClassifier               0.79               0.77     0.77      0.79   
XGBClassifier                      0.79               0.76     0.76      0.78   
NuSVC                              0.78               0.76     0.76      0.77   
PassiveAggressiveClassifier        0.77               0.75     0.75      0.76   
Perceptron                         0.77               0.75     0.75      0.76   
LogisticRegression                 0.77               0.75     0.75      0.76   
LinearSVC                          0.76               0.74     0.74      0.76   
RidgeClassifierCV           




##### We have seen that the best model according to lazypredict is BernoulliNB, lets try it

In [None]:
from sklearn.naive_bayes import BernoulliNB
X = pd.DataFrame(train_vectors.todense())
y = pd.DataFrame(train_df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.2,random_state =123)

clf = BernoulliNB()
clf.fit(X_train,y_train)
test_prediction = clf.predict(X_test)
test_realvalues = y_test['target'].to_numpy()


In [None]:
#Now we compare the predicted values with the real values on the test section.
result=test_prediction==test_realvalues #this makes a true/false matrix with the succesful/unsuccessful predicted values
len(result) # these are all the observation in test
np.sum(result)  # these are all the true predicted values 
accuracy=np.sum(result)/len(result)
print(accuracy) # The accuracy should be similar to the obtanied in Lazyclassifier (0.81)

0.8082731451083388


In [None]:
#Now we will train the model again with the dataset complete not only with the 80% of it
X_train = tfIdfVectorizer.fit_transform(train_df["text"])
X_test = tfIdfVectorizer.transform(test_df["text"])
clf = BernoulliNB()
clf.fit(train_vectors.todense(),train_df['target'])


# now we will predict the test_df 
X_test=pd.DataFrame(X_test.todense())
test_prediction = clf.predict(X_test)


In [None]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
sample_submission["target"] = test_prediction

In [None]:
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
sample_submission.to_csv("sample_submission_TFIDF_BernouilliNB.csv",index=False)
#The result obtained in Kaggle is 0.79528 wich is less than the obtained with the lineal model. (0.80049)

# Fourth approach -RNN

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [5]:
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')
print(train_df.shape)
print(test_df.shape)

(7613, 5)
(3263, 4)


In [4]:
from tensorflow import keras
print(keras.__version__)
print(tf.__version__)


2.9.0
2.9.0


In [7]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train_df.text.values, train_df.target.values, 
                                                  stratify=train_df.target.values, 
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)


In [20]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
max_len = 35 #the maximum value of words (tokens) in the twitts is 33 that's why we set it to 35

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)
test_seq=token.texts_to_sequences(test_df.text.values)

#zero pad the sequences
xtrain_pad = tf.keras.utils.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = tf.keras.utils.pad_sequences(xvalid_seq, maxlen=max_len)
test_pad = tf.keras.utils.pad_sequences(test_seq, maxlen=max_len)
word_index = token.word_index

In [41]:
%%time
# A simpleRNN without any pretrained embeddings and one dense layer
model_val = Sequential()
model_val.add(Embedding(len(word_index) + 1,
                 300,
                     input_length=max_len))
model_val.add(SimpleRNN(100))
model_val.add(Dense(1, activation='sigmoid'))
model_val.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
model_val.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 35, 300)           6810300   
                                                                 
 simple_rnn_6 (SimpleRNN)    (None, 100)               40100     
                                                                 
 dense_6 (Dense)             (None, 1)                 101       
                                                                 
Total params: 6,850,501
Trainable params: 6,850,501
Non-trainable params: 0
_________________________________________________________________
CPU times: user 261 ms, sys: 8.8 ms, total: 269 ms
Wall time: 236 ms


In [42]:
model_val.fit(xtrain_pad, ytrain,validation_data=(xvalid_pad, yvalid), epochs=30 , batch_size=64,callbacks = EarlyStopping(patience=5))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30


<keras.callbacks.History at 0x7fccf476cbd0>

In [43]:
def roc_auc(predictions,target):
    '''
    This methods returns the AUC Score when given the Predictions
    and Labels
    '''
    
    fpr, tpr, thresholds = metrics.roc_curve(target, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

In [44]:
scores = model.predict(xvalid_pad)
print("Auc: %.2f%%" % (roc_auc(scores,yvalid)))

Auc: 0.81%


In [45]:
scores_model = []
scores_model.append({'Model': 'SimpleRNN','AUC_Score': roc_auc(scores,yvalid)})

In [46]:
#Finally we will predict the test to upload the resuts to kaggle
test_predict=model.predict(test_pad)



In [47]:
test_predict.shape

(3263, 1)

In [50]:
test_predict

array([[0.06758979],
       [0.8371589 ],
       [0.96197146],
       ...,
       [0.9950584 ],
       [0.9998642 ],
       [0.7673651 ]], dtype=float32)

In [51]:
# We adopt the correct format (1 or 0)
test_predict_format=[1 if value[0]>0.5 else 0 for value in test_predict]
test_predict_format=np.array(test_predict_format)

In [52]:
test_predict_format

array([0, 1, 1, ..., 1, 1, 1])

In [54]:
sample_submission = pd.read_csv("/content/sample_submission.csv")
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [55]:
sample_submission["target"] = test_predict_format

In [57]:
sample_submission.to_csv("sample_submission_RNN.csv",index=False)

# Fifth approach -LSTM (Using pre-trained model GloVe)

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(train_df.shape)
print(test_df.shape)

(7613, 5)
(3263, 4)


In [4]:
# load the GloVe vectors in a dictionary:

embeddings_index = {}
f = open('glove.840B.300d.txt','r',encoding='utf-8')
for line in tqdm(f):
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray([float(val) for val in values[1:]])
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

2196017it [03:35, 10212.92it/s]

Found 2196016 word vectors.





In [6]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train_df.text.values, train_df.target.values, 
                                                  stratify=train_df.target.values, 
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)

In [7]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
max_len = 35 #the maximum value of words (tokens) in the twitts is 33 that's why we set it to 35

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)
test_seq=token.texts_to_sequences(test_df.text.values)

#zero pad the sequences
xtrain_pad = tf.keras.utils.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = tf.keras.utils.pad_sequences(xvalid_seq, maxlen=max_len)
test_pad = tf.keras.utils.pad_sequences(test_seq, maxlen=max_len)
word_index = token.word_index

In [8]:
# create an embedding matrix for the words we have in the dataset
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

100%|██████████| 22700/22700 [00:00<00:00, 420400.84it/s]


In [9]:
%%time

# A simple LSTM with glove embeddings and one dense layer
model = Sequential()
model.add(Embedding(len(word_index) + 1,
             300,
             weights=[embedding_matrix],
             input_length=max_len,
             trainable=False))

model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
    
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 35, 300)           6810300   
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 6,970,801
Trainable params: 160,501
Non-trainable params: 6,810,300
_________________________________________________________________
Wall time: 218 ms


In [11]:
model.fit(xtrain_pad, ytrain,validation_data=(xvalid_pad, yvalid), epochs=30 , batch_size=64,callbacks = EarlyStopping(patience=5))


Epoch 1/30
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30


<keras.callbacks.History at 0x1b17e043f08>

In [13]:
def roc_auc(predictions,target):
    '''
    This methods returns the AUC Score when given the Predictions
    and Labels
    '''
    
    fpr, tpr, thresholds = metrics.roc_curve(target, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

In [14]:
scores = model.predict(xvalid_pad)
print("Auc: %.2f%%" % (roc_auc(scores,yvalid)))

Auc: 0.88%


In [16]:
#Finally we will predict the test to upload the resuts to kaggle
test_predict=model.predict(test_pad)



In [17]:
test_predict.shape

(3263, 1)

In [18]:
test_predict

array([[0.6043087 ],
       [0.97589767],
       [0.9783032 ],
       ...,
       [0.9922785 ],
       [0.80060834],
       [0.69054145]], dtype=float32)

In [19]:
# We adopt the correct format (1 or 0)
test_predict_format=[1 if value[0]>0.5 else 0 for value in test_predict]
test_predict_format=np.array(test_predict_format)

In [22]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [23]:
sample_submission["target"] = test_predict_format

In [24]:
sample_submission.to_csv("sample_submission_LSTM_from_Pretrained_model.csv",index=False)
# We have obtained 0.79068 on Kaggle not improving our best result yet: 0.80049 TF-IDF with Lineal model

# Sixth approach -GRU

In [56]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train_df.text.values, train_df.target.values, 
                                                  stratify=train_df.target.values, 
                                                  random_state=40, 
                                                  test_size=0.2, shuffle=True)

In [57]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
max_len = 35 #the maximum value of words (tokens) in the twitts is 33 that's why we set it to 35

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)
test_seq=token.texts_to_sequences(test_df.text.values)

#zero pad the sequences
xtrain_pad = tf.keras.utils.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = tf.keras.utils.pad_sequences(xvalid_seq, maxlen=max_len)
test_pad = tf.keras.utils.pad_sequences(test_seq, maxlen=max_len)
word_index = token.word_index

In [46]:
%%time

# GRU with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                300,
                weights=[embedding_matrix],
                input_length=max_len,
                trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(GRU(300))
model.add(Dense(1, activation='relu'))

model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])   
    
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 35, 300)           6810300   
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 35, 300)          0         
 lDropout1D)                                                     
                                                                 
 gru_1 (GRU)                 (None, 300)               541800    
                                                                 
 dense_2 (Dense)             (None, 1)                 301       
                                                                 
Total params: 7,352,401
Trainable params: 542,101
Non-trainable params: 6,810,300
_________________________________________________________________
Wall time: 253 ms


In [47]:
model.fit(xtrain_pad, ytrain,validation_data=(xvalid_pad, yvalid), epochs=30 , batch_size=64,callbacks = EarlyStopping(patience=5))


Epoch 1/30
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/

<keras.callbacks.History at 0x1b2a8aedec8>

In [48]:
scores = model.predict(xvalid_pad)
print("Auc: %.2f%%" % (roc_auc(scores,yvalid)))

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Auc: 0.89%


In [49]:
#Finally we will predict the test to upload the resuts to kaggle
test_predict=model.predict(test_pad)



In [50]:
# We adopt the correct format (1 or 0)
test_predict_format=[1 if value[0]>0.5 else 0 for value in test_predict]
test_predict_format=np.array(test_predict_format)

In [51]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [52]:
sample_submission["target"] = test_predict_format

In [55]:
sample_submission.to_csv("sample_submission_GRU_from_Pretrained_model_relu.csv",index=False)
# We have obtained 0.79926 on Kaggle not improving our best result yet: 0.80049 TF-IDF with Lineal model

# Seventh approach - RNN Bi-directional

In [59]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train_df.text.values, train_df.target.values, 
                                                  stratify=train_df.target.values, 
                                                  random_state=40, 
                                                  test_size=0.2, shuffle=True)

In [60]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
max_len = 35 #the maximum value of words (tokens) in the twitts is 33 that's why we set it to 35

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)
test_seq=token.texts_to_sequences(test_df.text.values)

#zero pad the sequences
xtrain_pad = tf.keras.utils.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = tf.keras.utils.pad_sequences(xvalid_seq, maxlen=max_len)
test_pad = tf.keras.utils.pad_sequences(test_seq, maxlen=max_len)
word_index = token.word_index

In [63]:
%%time
# A simple bidirectional LSTM with glove embeddings and one dense layer
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                300,
                weights=[embedding_matrix],
                input_length=max_len,
                trainable=False))
model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))

model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
    
    
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 35, 300)           6810300   
                                                                 
 bidirectional_1 (Bidirectio  (None, 600)              1442400   
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 1)                 601       
                                                                 
Total params: 8,253,301
Trainable params: 1,443,001
Non-trainable params: 6,810,300
_________________________________________________________________
Wall time: 281 ms


In [64]:
model.fit(xtrain_pad, ytrain,validation_data=(xvalid_pad, yvalid), epochs=30 , batch_size=64,callbacks = EarlyStopping(patience=5))

Epoch 1/30
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


<keras.callbacks.History at 0x1b29b096448>

In [65]:
scores = model.predict(xvalid_pad)
print("Auc: %.2f%%" % (roc_auc(scores,yvalid)))

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Auc: 0.80%


In [66]:
#Finally we will predict the test to upload the resuts to kaggle
test_predict=model.predict(test_pad)



In [67]:
# We adopt the correct format (1 or 0)
test_predict_format=[1 if value[0]>0.5 else 0 for value in test_predict]
test_predict_format=np.array(test_predict_format)

In [68]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [69]:
sample_submission["target"] = test_predict_format

In [71]:
sample_submission.to_csv("sample_submission_RNN_Bi-directional_from_Pretrained_model_sigmoid.csv",index=False)
# We have obtained 0.69138 on Kaggle not improving our best result yet: 0.80049 TF-IDF with Lineal model