In [1]:
# Importing Libraries and setting random seed
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
np.random.seed(4)

Using TensorFlow backend.


In [2]:
df_good = pd.read_csv('good.csv')
df_bad = pd.read_csv('promotional.csv')


In [3]:
print(df_good.head())
print(df_good.shape)

                                                text  \
0  Nycticebus linglom is a fossil strepsirrhine p...   
1  Oryzomys pliocaenicus is a fossil rodent from ...   
2  .hack dt hk is a series of single player actio...   
3  The You Drive Me Crazy Tour was the second con...   
4  0 8 4 is the second episode of the first seaso...   

                                                 url  
0  https://en.wikipedia.org/wiki/%3F%20Nycticebus...  
1  https://en.wikipedia.org/wiki/%3F%20Oryzomys%2...  
2  https://en.wikipedia.org/wiki/.hack%20%28video...  
3  https://en.wikipedia.org/wiki/%28You%20Drive%2...  
4                https://en.wikipedia.org/wiki/0-8-4  
(30279, 2)


In [4]:
print(df_bad.head())
print(df_bad.shape)

                                                text  advert  coi  fanpov  pr  \
0  1 Litre no Namida 1, lit. 1 Litre of Tears als...       0    0       1   0   
1  1DayLater was free, web based software that wa...       1    1       0   0   
2  1E is a privately owned IT software and servic...       1    0       0   0   
3  1Malaysia pronounced One Malaysia in English a...       1    0       0   0   
4  The Jerusalem Biennale, as stated on the Bienn...       1    0       0   0   

   resume                                                url  
0       0  https://en.wikipedia.org/wiki/1%20Litre%20no%2...  
1       0            https://en.wikipedia.org/wiki/1DayLater  
2       0                   https://en.wikipedia.org/wiki/1E  
3       0            https://en.wikipedia.org/wiki/1Malaysia  
4       0  https://en.wikipedia.org/wiki/1st%20Jerusalem%...  
(23837, 7)


It is a pretty balanced and very clean dataset, only thing needed is to add a classification label column, and split test and train sets

In [5]:
df_good = df_good.drop(df_good.columns[1:],axis=1)
df_bad = df_bad.drop(df_bad.columns[1:],axis=1)
df_good['classification']=0
df_bad['classification']=1

In [6]:
df = pd.concat((df_good,df_bad),ignore_index=True, axis=0)
df = df.reindex(np.random.permutation(df.index))
df.head()

Unnamed: 0,text,classification
48024,Scott Robinson and Charlene Mitchell are ficti...,1
23115,Robert of Cricklade c. 1100117479 was a mediev...,0
29977,The yellow lipped sea krait Laticauda colubrin...,0
48909,Veronika Scott is an American social entrepren...,1
4684,1992 1993 1994 1995 The bombing of Banski dvor...,0


In [7]:
#Spliting test & train dataset

# from sklearn.model_selection import train_test_split
predictors = df.drop('classification',axis=1)
target = df['classification']
X_train,X_test,Y_train,Y_test = train_test_split(predictors,target,test_size=0.20)

Y_train.describe()

count    43292.000000
mean         0.439250
std          0.496301
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: classification, dtype: float64

In [8]:
#Preprocessing
#Vectorizing dataset using keras Tokenizer function

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000

text_data = [str(txt) for txt in X_train.values] # convert text data to strings
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True) # create tokenizer object
tokenizer.fit_on_texts(text_data) # make dictionary

X_train_vect = tokenizer.texts_to_sequences(text_data) # vectorize dataset

from keras.preprocessing import sequence

# Max number of words in each sequence
MAX_SEQUENCE_LENGTH = 400

X_train_vect = sequence.pad_sequences(X_train_vect, maxlen=MAX_SEQUENCE_LENGTH)


In [9]:
#Defining and training our LSTM using Keras

from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam

model = Sequential()



EMBEDDING_DIM = 100
model.add(Embedding(MAX_NB_WORDS+1, EMBEDDING_DIM, input_length=X_train_vect.shape[1]))



model.add(LSTM(80))

model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))


model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])



EPOCHS = 2
BATCH_SIZE = 64

history = model.fit(X_train_vect, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.15)



Train on 36798 samples, validate on 6494 samples
Epoch 1/2
Epoch 2/2


In [10]:

# Prediction from a text
def bigModelEvaluate(X):
    text_data = [str(txt) for txt in X.values] # convert text data to strings
    X_vect = tokenizer.texts_to_sequences(text_data) # vectorize dataset
# Max number of words in each sequence
    MAX_SEQUENCE_LENGTH = 400
    X_vect = sequence.pad_sequences(X_vect, maxlen=MAX_SEQUENCE_LENGTH)
    Y_pred = model.predict(X_vect)
    Y_pred = [round(x[0]) for x in Y_pred]
    return Y_pred

In [11]:
Y_pred = bigModelEvaluate(X_test)

Metrics:
- Confusion matrix ( good guesses and type I/II error)
- ROC AUC (Area under receiver operator curve)
- Accuracy score

In [12]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_pred, Y_test,labels=[0,1])

array([[4365,  656],
       [1638, 4165]])

In [13]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

roc_score = round(roc_auc_score(Y_pred,Y_test)*100,2)
acc_score=round(accuracy_score(Y_pred,Y_test)*100,2)
print('The ROC AUC Score is', roc_score, '%')
print('The Accuracy Score is', acc_score, '%')

The ROC AUC Score is 79.35 %
The Accuracy Score is 78.81 %
