# LSTM



In [1]:
# importing libraries
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import pickle
#from transformers import 
import numpy as np
from numpy import zeros, newaxis
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tqdm import tqdm
import random as rn
import tensorflow 

from wordcloud import WordCloud, STOPWORDS 

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')

import tensorflow as tf
from tensorflow.keras.layers import Conv1D,AveragePooling1D,MaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Input,concatenate,Activation,Dropout,BatchNormalization,LSTM
from tensorflow.keras import regularizers,Model
from tensorflow.keras.regularizers import l1,l2
from tensorflow.keras.callbacks import TensorBoard, Callback, EarlyStopping, ModelCheckpoint,LearningRateScheduler


from matplotlib_venn import venn2, venn2_unweighted
from matplotlib_venn import venn3, venn3_unweighted
%matplotlib inline

In [2]:
# https://stackoverflow.com/a/47091490/4084039
import re

def preprocess_text(phrase):
    '''
    Normalise raw text into lower-case alphanumeric tokens separated by single spaces.

    Steps, in order: convert to str and lower-case, expand English contractions,
    blank out literal backslash escape sequences, replace every character that is
    not a letter or digit with a space, and collapse runs of whitespace.

    Parameters
    ----------
    phrase : any
        Value to clean. It is passed through str() first, so non-string
        values are accepted.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    '''
    phrase = str(phrase).lower()

    # specific contractions (must run before the generic "n't" rule below)
    phrase = re.sub(r"won\'t", "will not", phrase)
    # The result of this substitution used to be assigned to a misspelt name
    # and thrown away. Word boundaries keep it from touching longer words.
    phrase = re.sub(r"\bwont\b", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    # NOTE(review): no word boundary, so "cant" is also rewritten inside longer
    # words (e.g. "significant"). Left untouched in case the saved tokenizers
    # were fitted on text cleaned this way - confirm before tightening.
    phrase = re.sub(r"cant", "can not", phrase)
    phrase = re.sub(r"cannot", "can not", phrase)
    phrase = re.sub(r"doesn't", "does not", phrase)

    # general contractions
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)

    # literal two-character escape sequences (a backslash followed by r, " or n)
    # and any remaining backslashes become spaces
    phrase = phrase.replace('\\r', ' ')
    phrase = phrase.replace('\\"', ' ')
    phrase = phrase.replace('\\n', ' ')
    phrase = phrase.replace('\\', ' ')

    #phrase = re.sub(r'http\S+', '', phrase)  # for removal all http link
    phrase = re.sub(r'[^A-Za-z0-9]', ' ',phrase) # remove everything except alphabets and numbers

    # split() drops all whitespace runs, so joining with single spaces both
    # collapses repeated spaces and trims the ends
    return ' '.join(phrase.split())
    


In [3]:
def get_sentiments(text_data):
    '''
    Score every text with the VADER analyzer and return the four polarity
    components as column vectors.

    Returns a dict with keys 'neg', 'neu', 'pos' and 'comp'. Each value is an
    array with one row per text and a single column; 'comp' holds the
    analyzer's 'compound' score.
    '''
    analyzer = SentimentIntensityAnalyzer()
    scores = [analyzer.polarity_scores(text) for text in text_data]

    # output key -> key in the analyzer's score dict
    key_map = (('neg', 'neg'), ('neu', 'neu'), ('pos', 'pos'), ('comp', 'compound'))
    return {
        out_key: np.array([score[vader_key] for score in scores])[:, newaxis]
        for out_key, vader_key in key_map
    }

In [4]:
def countVectorizer(df_train,df_cv,column):
    '''
    Binary bag-of-words encoding of one column, fitted on the training frame only.

    Returns (encoded_train, encoded_cv, vect): the dense encodings of
    df_train[column] and df_cv[column], followed by the fitted CountVectorizer.
    '''
    vect = CountVectorizer(binary=True)  # binary=True: presence flags instead of counts
    vect.fit(df_train[column])

    # the vocabulary learnt from the training frame is applied to both frames
    encoded_train, encoded_cv = [
        vect.transform(frame[column]).todense() for frame in (df_train, df_cv)
    ]
    return encoded_train, encoded_cv, vect

In [5]:
from scipy.stats import spearmanr
def spearman(y_true,y_pred):
    '''
    Mean column-wise Spearman rank correlation between targets and predictions.

    Parameters
    ----------
    y_true : 2-D array-like, one column per class-label
    y_pred : 2-D array-like with the same layout as y_true

    Returns
    -------
    float
        np.nanmean of the per-column correlations, so columns whose
        correlation is NaN are left out of the average.

    Notes
    -----
    The result is not bit-for-bit reproducible between calls because unseeded
    Gaussian noise (standard deviation 1e-7) is added to the predictions.
    '''
    # Cast to float so object-dtype inputs (e.g. slices of a mixed-type table)
    # are handled numerically and nan_policy='omit' can recognise NaNs.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    n_rows = y_pred.shape[0]
    # Score every label column that is present instead of assuming exactly 30.
    n_labels = y_true.shape[1]

    spearman_y = []
    for i in range(n_labels):
        # presumably the jitter breaks ties / avoids constant prediction
        # columns, for which the correlation is undefined - TODO confirm
        jitter = np.random.normal(0, 1e-7, n_rows)
        score = spearmanr(y_true[:, i], y_pred[:, i] + jitter, nan_policy='omit').correlation
        spearman_y.append(score)

    return np.nanmean(spearman_y)

In [14]:
def _load_pickle(path):
    '''Unpickle and return the object stored in the file at `path`.'''
    # NOTE: pickle.load can execute arbitrary code - only point this at trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def predict_LSTM(input_data):
    '''
    Predict with the saved LSTM model on raw, un-preprocessed datapoints.

    Parameters
    ----------
    input_data : 2-D list/array of datapoints, one row per datapoint.
        Columns are read by position: 0 = qa_id, 1 = question title,
        2 = question body, 5 = answer, 9 = category, 10 = host.

    Returns
    -------
    numpy.ndarray
        2-D array whose first column is column 0 of the input (the qa_id)
        and whose remaining columns are the output of model.predict.

    The fitted vectorizers, tokenizers, padding lengths, model architecture
    and weights are read from '../input/lstm-output/' on every call.
    '''

    #loading saved files
    category_vectorizer = _load_pickle('../input/lstm-output/category_vectorizer')
    host_vectorizer = _load_pickle('../input/lstm-output/host_vectorizer')
    tokenizer_title = _load_pickle('../input/lstm-output/tokenizer_title')
    tokenizer_body = _load_pickle('../input/lstm-output/tokenizer_body')
    tokenizer_answer = _load_pickle('../input/lstm-output/tokenizer_answer')
    # dict of padding lengths, indexed below with 'title', 'body' and 'answer'
    max_length = _load_pickle('../input/lstm-output/max_length')

    with open("../input/lstm-output/model.json", "r") as json_file:
        model_json = json_file.read()

    model = tf.keras.models.model_from_json(model_json)
    model.load_weights("../input/lstm-output/weights_lstm.best_copy.hdf5")

    # Converting input to a numpy array (if it is given as a list).
    # dtype=object keeps every cell's own type; without it numpy turns a list of
    # mixed text/number rows into an all-string array, qa_id included.
    input_data = np.array(input_data, dtype=object)

    #preprocessing host: second-to-last dot-separated part, lower-cased and stripped
    host = pd.Series(input_data[:,10]).apply(lambda x: x.split('.')[-2].lower().strip())
    #=================================================================================================
    # preprocessing 'category' to lower-case and stripping leading and trailing spaces
    category = pd.Series(input_data[:,9]).apply(lambda x: x.lower().strip())
    #=================================================================================================

    #encoding categorical data - 'category' and 'host'
    # toarray() gives plain ndarrays; todense() would give np.matrix objects,
    # which are meant to stay 2-D while a third axis is added further down
    category_encoded = category_vectorizer.transform(category).toarray()
    host_encoded = host_vectorizer.transform(host).toarray()

    #=================================================================================================

    # preprocessing question title, question body and answers
    question_title = pd.Series(input_data[:,1]).apply(preprocess_text)
    question_body = pd.Series(input_data[:,2]).apply(preprocess_text)
    answer = pd.Series(input_data[:,5]).apply(preprocess_text)

    #=================================================================================================

    # taking length (number of space-separated pieces) of question title, question body, answer
    def _token_counts(texts):
        return np.array(texts.apply(lambda x: len(x.split(' '))))[:,newaxis]

    q_title_length = _token_counts(question_title)
    q_body_length = _token_counts(question_body)
    answer_length = _token_counts(answer)

    #==================================================================================================

    #creating sentiments features

    title_sentiments = get_sentiments(question_title)
    body_sentiments = get_sentiments(question_body)
    answer_sentiments = get_sentiments(answer)

    # encoding text data

    # encoding question title to sequence and padding with zeros
    encoded_title_test = tokenizer_title.texts_to_sequences(question_title)
    padded_title_test = pad_sequences(encoded_title_test, maxlen = max_length['title'], padding = 'post')

    # encoding question body to sequence and padding with zeros
    encoded_body_test = tokenizer_body.texts_to_sequences(question_body)
    padded_body_test = pad_sequences(encoded_body_test, maxlen = max_length['body'], padding = 'post')

    # encoding answer to sequence and padding with zeros
    encoded_answer_test = tokenizer_answer.texts_to_sequences(answer)
    padded_answer_test = pad_sequences(encoded_answer_test, maxlen = max_length['answer'], padding = 'post')

    # categorical encodings, text lengths and sentiment scores side by side, one row per datapoint
    x_test_cat_num  =  np.hstack([category_encoded,host_encoded,
                        q_title_length,q_body_length,answer_length, title_sentiments['neg'],
                              title_sentiments['neu'], title_sentiments['pos'], title_sentiments['comp'],
                              body_sentiments['neg'],body_sentiments['neu'], body_sentiments['pos'], body_sentiments['comp'],
                              answer_sentiments['neg'],answer_sentiments['neu'], answer_sentiments['pos'], answer_sentiments['comp']
                             ])

    # trailing axis of length 1 added before the features are passed as the model's fourth input
    x_test_cat_num = x_test_cat_num[:,:,newaxis]

    #======================================================================================================

    prediction = model.predict([padded_title_test,padded_body_test,padded_answer_test,x_test_cat_num])

    prediction = np.hstack([input_data[:,0][:,newaxis],prediction]) #giving "qa_id" as 1st column to identify the predictions
    return prediction

In [9]:
# running the saved LSTM pipeline over the training set
train_data = np.array(pd.read_csv("../input/google-quest-challenge/train.csv"))  # read the csv, keep it as a numpy array
train_output = predict_LSTM(train_data)

In [10]:
# displaying the output returned by predict_LSTM (the first column is the qa_id)
train_output

array([[0, 0.9353364706039429, 0.6771138906478882, ...,
        0.1706579029560089, 0.6841882467269897, 0.904914140701294],
       [1, 0.9555881023406982, 0.7461124658584595, ...,
        0.0507732629776001, 0.7087625861167908, 0.9104382991790771],
       [2, 0.8828686475753784, 0.45747336745262146, ...,
        0.1713494062423706, 0.6756827235221863, 0.9028689861297607],
       ...,
       [9645, 0.882625162601471, 0.48956966400146484, ...,
        0.10914880037307739, 0.19130933284759521, 0.8575698137283325],
       [9646, 0.9379218816757202, 0.6494990587234497, ...,
        0.08006134629249573, 0.7052682638168335, 0.9089163541793823],
       [9647, 0.9585661888122559, 0.782730221748352, ...,
        0.053153425455093384, 0.7830450534820557, 0.9137704372406006]],
      dtype=object)

In [12]:
# getting actual values of the class-labels: every column of the train array from index 11 onwards
actual_train = train_data[:,11:]
actual_train.shape

(6079, 30)

In [13]:
# mean spearman's correlation on train data
# (column 0 of train_output is the qa_id, so only the columns after it are scored)
spearman(actual_train,train_output[:,1:])

0.2941428947186191