In [1]:
import pandas as pd

In [2]:
# Read the training file from the working directory and preview its first rows.
df = pd.read_csv('train.csv')
print(df.head(5))

                    qid                                      question_text  \
0  00002165364db923c7e6  How did Quebec nationalists see their province...   
1  000032939017120e6e44  Do you have an adopted dog, how would you enco...   
2  0000412ca6e4628ce2cf  Why does velocity affect time? Does velocity a...   
3  000042bf85aa498cd78e  How did Otto von Guericke used the Magdeburg h...   
4  0000455dfa3e01eae3af  Can I convert montra helicon D to a mountain b...   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [None]:
import re

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK data used further down: WordNet backs WordNetLemmatizer, and
# the stop-word corpus is read by preprocess(). Without the second download a
# fresh environment fails with LookupError when the stop words are first loaded.
nltk.download('wordnet')
nltk.download('stopwords')

In [3]:
# Class balance of the label column (number of rows per target value).
target_counts = df['target'].value_counts()
target_counts

0    1225312
1      80810
Name: target, dtype: int64

In [5]:
# Model input is the raw question text; the label is the binary target column.
X, y = df['question_text'], df['target']
print(X.head(5))

0    How did Quebec nationalists see their province...
1    Do you have an adopted dog, how would you enco...
2    Why does velocity affect time? Does velocity a...
3    How did Otto von Guericke used the Magdeburg h...
4    Can I convert montra helicon D to a mountain b...
Name: question_text, dtype: object


In [None]:
# defining a function to preprocess text
from nltk.stem import LancasterStemmer, WordNetLemmatizer
lemma=WordNetLemmatizer()

# Loaded once and stored as a set: re-reading the stop-word corpus and scanning
# a list for every token of every document is needlessly slow on a large corpus.
STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))

# Things shaped like an HTML tag: '<' or '</' immediately followed by a letter.
# Requiring the letter keeps plain comparisons such as "x < 5 and y > 3" intact.
HTML_TAG_RE = re.compile(r'</?[a-zA-Z][^<>]*>')

def preprocess(doc):
  """Clean one document and return its lemmatized, stop-word-free tokens.

  Steps, in order:
    1. Remove HTML-like tags. This has to run before punctuation is
       stripped; once '<' and '>' have been replaced the tags can no longer
       be recognised.
    2. Replace every non-word character with a space and lowercase the text.
    3. Drop a leading standalone "b" (presumably the residue of a bytes
       literal such as b'...'; note it also drops a genuine leading "b").
    4. Replace everything that is not an ASCII letter (digits, underscores,
       accented letters, ...) with a space.
    5. Split on whitespace, discard stop words, lemmatize, and discard any
       token whose lemma is a stop word.

  Stop words are filtered before lemmatizing as well as after because the
  lemmatizer mangles some of them (e.g. "was" becomes "wa"), which would
  otherwise let them slip past the filter.

  Args:
    doc: The document to clean. It is converted with str() first.

  Returns:
    A list of lowercase token strings (possibly empty).
  """
  text = HTML_TAG_RE.sub(' ', str(doc))
  text = re.sub(r'\W', ' ', text)
  text = text.lower()                 # Converting to lowercase
  text = re.sub(r'^b\s+', ' ', text)
  # Removing special characters and digits
  text = re.sub('[^a-zA-Z]', ' ', text)

  tokens = []
  for word in text.split():
    if word in STOP_WORDS:
      continue
    #Lemmatization
    base = lemma.lemmatize(word)
    if base not in STOP_WORDS:
      tokens.append(base)

  return tokens

In [None]:
# preprocessing the text
# X is a Series of question strings, so its values are iterated directly.
# Indexing it with 'question_text' is a label lookup on the Series index and
# raises KeyError; iterating also does not depend on the index being 0..n-1.
messages = X.copy()
corpus = [' '.join(preprocess(text)) for text in messages]


In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [None]:
# Longest document in the corpus, measured in space-separated pieces.
# (An empty document still counts as 1 because ''.split(' ') == [''].)
max_length = max(len(sentence.split(' ')) for sentence in corpus)
max_length

In [None]:
# Collect the distinct space-separated pieces seen anywhere in the corpus.
# An empty document contributes the empty string, since ''.split(' ') == [''].
vocab = set()
vocab.update(piece for sentence in corpus for piece in sentence.split(' '))

In [None]:
# Number of distinct tokens; also used as the size of the index space below.
vocab_size = len(vocab)

# Encode every document with Keras' one_hot, which yields one list of integer
# word indices per text (bounded by vocab_size -- see the Keras documentation).
one_hot_encoding = []
for sentence in corpus:
    one_hot_encoding.append(one_hot(sentence, vocab_size))

In [None]:
# Bring every encoded document to max_length entries; padding='pre' places the
# padding in front of the encoded words.
padded_sentences = pad_sequences(
    one_hot_encoding,
    maxlen=max_length,
    padding='pre',
)

print(padded_sentences)

In [None]:
# Benchmark network: embedding -> single LSTM -> sigmoid output unit.
embedding_vector_features=40
model = Sequential([
    Embedding(vocab_size, embedding_vector_features, input_length=max_length),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
print(model.summary())

In [None]:
# Second network for comparison: embedding -> bidirectional LSTM -> dropout
# -> sigmoid output unit.
model1 = Sequential([
    Embedding(vocab_size, embedding_vector_features, input_length=max_length),
    Bidirectional(LSTM(100)),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
print(model1.summary())

In [None]:
# creating stacked LSTM model for comparison
model2=Sequential()
model2.add(Embedding(vocab_size, embedding_vector_features, input_length = max_length))
# The first two LSTM layers return the full sequence so the next LSTM layer
# receives one vector per time step; the last one returns only its final state.
model2.add(LSTM(50,return_sequences=True))
model2.add(LSTM(50,return_sequences=True))
model2.add(LSTM(50))
# binary_crossentropy (with its default from_logits=False) expects values in
# [0, 1], so the output unit needs a sigmoid, as in the other two models.
# A bare Dense(1) would feed unbounded values into the loss.
model2.add(Dense(1,activation='sigmoid'))
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model2.summary())

In [None]:
import numpy as np

# Materialise features and labels as NumPy arrays for training.
X_final, y_final = np.array(padded_sentences), np.array(y)

# Show both shapes as a sanity check.
(X_final.shape, y_final.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: 80% of the rows go to training, then the remaining 20% is
# halved into validation and test (10% of the data each). X_test / y_test hold
# the 20% remainder only until the second call overwrites them.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=123,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    random_state=123,
)

In [None]:
# Fit the stacked-LSTM model (model2) on the training split, reporting loss and
# accuracy on the validation split after each epoch.
history = model2.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt

# plotting the model history
print(history.history.keys())

# One figure per tracked quantity (accuracy first, then loss), each comparing
# the training curve with the validation curve across epochs.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

In [None]:
# making predictions on the test set
# Sequential.predict_classes no longer exists in recent TensorFlow releases.
# For a single-output binary model it was equivalent to thresholding predict()
# at 0.5, so this yields the same int32 labels with the same (n, 1) shape and
# works on old and new versions alike.
y_pred1 = (model2.predict(X_test) > 0.5).astype('int32')

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred1)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred1)