In [10]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
pd.set_option('display.max_colwidth', None)

# First approach - CountVectorizer (model: Lineal)

In [11]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(train_df.shape)
print(test_df.shape)

(7613, 5)
(3263, 4)


In [12]:
#What CountVectorizer does is to create a "vocabulary list" with all the words used in all the given data (all the words of all sencente: twitts) in this case it makes 
# a total of 21637 words. Afterwards the function compares this vocabulary list with every sentence and set 1 when the word of the sentence appears in the vocabulary
# list (it can be 2 if the word appear twice and so on). This means we will have for every sentence a lenth of 21637.

In [13]:
count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:5])

In [14]:
#we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print('this is the first sentence:  ',train_df["text"][0])
print('\nthese are the dimensions of the vector of the first sentence: ',example_train_vectors[0].todense().shape)
print('\nthis is the vector for the first sentence: \n',example_train_vectors[0].todense())
print('\nthis is the vocabulary list for the 5 first sentences (same dimensions that vector): \n',count_vectorizer.vocabulary_)

this is the first sentence:   Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all

these are the dimensions of the vector of the first sentence:  (1, 54)

this is the vector for the first sentence: 
 [[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]

this is the vocabulary list for the 5 first sentences (same dimensions that vector): 
 {'our': 34, 'deeds': 12, 'are': 5, 'the': 49, 'reason': 39, 'of': 29, 'this': 50, 'earthquake': 13, 'may': 25, 'allah': 4, 'forgive': 18, 'us': 52, 'all': 3, 'forest': 17, 'fire': 16, 'near': 26, 'la': 24, 'ronge': 42, 'sask': 44, 'canada': 11, 'residents': 41, 'asked': 7, 'to': 51, 'shelter': 47, 'in': 21, 'place': 37, 'being': 8, 'notified': 28, 'by': 9, 'officers': 30, 'no': 27, 'other': 33, 'evacuation': 14, 'or': 31, 'orders': 32, 'expected': 15, '13': 1, '000': 0, 'people': 35, 'receive': 40, 'wildfires': 53, 'california': 10, 'just': 23, 'got': 20, 'sent': 46, 'photo

In [15]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])

In [16]:
print(train_vectors.todense().shape)
print(test_vectors.todense().shape)

(7613, 21637)
(3263, 21637)


In [17]:
print(train_vectors[0].todense().shape)
print(train_vectors[0].todense())
#print('\nthis is the vocabulary list all sentences (same dimensions that vector): \n',count_vectorizer.vocabulary_)

(1, 21637)
[[0 0 0 ... 0 0 0]]


In [18]:
count_vectorizer.transform(test_df["text"]).todense().shape

(3263, 21637)

In [19]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()

In [20]:
print(train_vectors.shape)
print(train_df["target"].shape)

(7613, 21637)
(7613,)


In [21]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.59453669, 0.56455572, 0.64082434])

In [22]:
clf.fit(train_vectors, train_df["target"])

RidgeClassifier()

In [23]:
sample_submission = pd.read_csv("sample_submission.csv")


In [24]:
sample_submission["target"] = clf.predict(test_vectors)

In [25]:
sample_submission["target"]

0       0
1       1
2       1
3       0
4       1
       ..
3258    1
3259    1
3260    1
3261    1
3262    0
Name: target, Length: 3263, dtype: int64

In [26]:
sample_submission.to_csv("sample_submission_CV.csv",index=False)

In [27]:
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


# Second approach - TF-IDF (model: Lineal)

In [28]:
# What TF-IDF does is to create a "vocabulary vector" with all the words used in all the given data (all the words of all sencente: twitts) in this case it makes 
# a total of 21637 words. Afterwards the function compares this "vocabulary vector" with every sentence and set a number of importance (IDF value)
# to every word of the "vocabulary vector" in the respect of the analysed sentence

In [29]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(train_df.shape)
print(test_df.shape)

(7613, 5)
(3263, 4)


In [30]:
tfIdfVectorizer=TfidfVectorizer(use_idf=True)
train_vectors = tfIdfVectorizer.fit_transform(train_df["text"])
print('train_vectors: ',train_vectors.shape)
test_vectors = tfIdfVectorizer.transform(test_df["text"])
print('test_vectors: ',test_vectors.shape)
#a=tfIdf[0].todense()
#print(a)

train_vectors:  (7613, 21637)
test_vectors:  (3263, 21637)


In [31]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

##### Linear model

In [32]:
clf = linear_model.RidgeClassifier()

In [33]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.63366337, 0.6122449 , 0.68442211])

In [34]:
clf.fit(train_vectors, train_df["target"])

RidgeClassifier()

In [35]:
sample_submission = pd.read_csv("sample_submission.csv")

In [36]:
sample_submission["target"] = clf.predict(test_vectors)

In [37]:
clf.predict(test_vectors)

array([1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,

In [38]:
sample_submission.to_csv("sample_submission_TFIDF.csv",index=False)

# Third approach - TF-IDF (model: Searching best model with Lazypredict )


In [40]:
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X = pd.DataFrame(train_vectors.todense())
y = pd.DataFrame(train_df["target"])

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.2,random_state =123)

In [None]:
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████| 29/29 [2:00:29<00:00, 249.29s/it]  

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
BernoulliNB                        0.81               0.79     0.79      0.80   
NearestCentroid                    0.80               0.78     0.78      0.79   
LGBMClassifier                     0.79               0.77     0.77      0.79   
ExtraTreesClassifier               0.79               0.77     0.77      0.79   
XGBClassifier                      0.79               0.76     0.76      0.78   
NuSVC                              0.78               0.76     0.76      0.77   
PassiveAggressiveClassifier        0.77               0.75     0.75      0.76   
Perceptron                         0.77               0.75     0.75      0.76   
LogisticRegression                 0.77               0.75     0.75      0.76   
LinearSVC                          0.76               0.74     0.74      0.76   
RidgeClassifierCV           




##### We have seen that the best model according to lazypredict is BernoulliNB, lets try it

In [41]:
from sklearn.naive_bayes import BernoulliNB
X = pd.DataFrame(train_vectors.todense())
y = pd.DataFrame(train_df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.2,random_state =123)

clf = BernoulliNB()
clf.fit(X_train,y_train)
test_prediction = clf.predict(X_test)
test_realvalues = y_test['target'].to_numpy()


In [42]:
#Now we compare the predicted values with the real values on the test section.
result=test_prediction==test_realvalues #this makes a true/false matrix with the succesful/unsuccessful predicted values
len(result) # these are all the observation in test
np.sum(result)  # these are all the true predicted values 
accuracy=np.sum(result)/len(result)
print(accuracy) # The accuracy should be similar to the obtanied in Lazyclassifier (0.81)

0.8082731451083388


In [43]:
#Now we will train the model again with the dataset complete not only with the 80% of it
X_train = tfIdfVectorizer.fit_transform(train_df["text"])
X_test = tfIdfVectorizer.transform(test_df["text"])
clf = BernoulliNB()
clf.fit(train_vectors.todense(),train_df['target'])


# now we will predict the test_df 
X_test=pd.DataFrame(X_test.todense())
test_prediction = clf.predict(X_test)


In [44]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [45]:
sample_submission["target"] = test_prediction

In [46]:
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [47]:
sample_submission.to_csv("sample_submission_TFIDF_BernouilliNB.csv",index=False)
#The result obtained in Kaggle is 0.79528 wich is less than the obtained with the lineal model. (0.80049)

# Fourth approach -RNN

In [55]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [56]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(train_df.shape)
print(test_df.shape)

(7613, 5)
(3263, 4)


In [57]:
xtrain=train_df.text.values
ytrain=train_df.target.values


In [60]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
max_len = 1500

token.fit_on_texts(list(xtrain))
xtrain_seq = token.texts_to_sequences(xtrain)
test_seq=token.texts_to_sequences(test_df.text.values)

#zero pad the sequences
xtrain_pad = tf.keras.utils.pad_sequences(xtrain_seq, maxlen=max_len)
test_pad = tf.keras.utils.pad_sequences(test_seq, maxlen=max_len)
word_index = token.word_index

In [61]:
%%time
# A simpleRNN without any pretrained embeddings and one dense layer
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                 300,
                     input_length=max_len))
model.add(SimpleRNN(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1500, 300)         6810300   
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               40100     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 6,850,501
Trainable params: 6,850,501
Non-trainable params: 0
_________________________________________________________________
Wall time: 127 ms


In [64]:
model.fit(xtrain_pad, ytrain,  batch_size=64,callbacks = EarlyStopping(patience=5))



In [None]:
def roc_auc(predictions,target):
    '''
    This methods returns the AUC Score when given the Predictions
    and Labels
    '''
    
    fpr, tpr, thresholds = metrics.roc_curve(target, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

In [None]:
scores = model.predict(xvalid_pad)
print("Auc: %.2f%%" % (roc_auc(scores,yvalid)))

In [None]:
scores_model = []
scores_model.append({'Model': 'SimpleRNN','AUC_Score': roc_auc(scores,yvalid)})

In [None]:
scores_model

In [None]:
#Finally we will predict the test to upload the resuts to kaggle
test_predict=model.predict(test_pad)

In [None]:
test_predict.shape

In [None]:
test_predict

In [None]:
# We adopt the correct format (1 or 0)
test_predict_format=[1 if value[0]>0.5 else 0 for value in test_predict]
test_predict_format=np.array(test_predict_format)

In [None]:
test_predict_format

In [None]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission

In [None]:
sample_submission["target"] = test_predict_format

In [None]:
sample_submission.to_csv("sample_submission_RNN.csv",index=False)