# CNN

In [1]:
# importing libraries
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import pickle
#from transformers import 
import numpy as np
from numpy import zeros, newaxis
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tqdm import tqdm
import random as rn
import tensorflow 

from wordcloud import WordCloud, STOPWORDS 

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')

import tensorflow as tf
from tensorflow.keras.layers import Conv1D,AveragePooling1D,MaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Input,concatenate,Activation,Dropout,BatchNormalization,LSTM
from tensorflow.keras import regularizers,Model
from tensorflow.keras.regularizers import l1,l2
from tensorflow.keras.callbacks import TensorBoard, Callback, EarlyStopping, ModelCheckpoint,LearningRateScheduler


from matplotlib_venn import venn2, venn2_unweighted
from matplotlib_venn import venn3, venn3_unweighted
%matplotlib inline

In [2]:
# https://stackoverflow.com/a/47091490/4084039
import re

def preprocess_text(phrase):
    # specific
    phrase = str(phrase)
    phrase = phrase.lower()
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrade = re.sub(r"wont", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"cant", "can not", phrase)
    phrase = re.sub(r"cannot", "can not", phrase)
    phrase = re.sub(r"doesn't", "does not", phrase)
    
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    
    phrase = phrase.replace('\\r', ' ')
    phrase = phrase.replace('\\"', ' ')
    phrase = phrase.replace('\\n', ' ')
    phrase = phrase.replace('\\', ' ')
    
    #phrase = re.sub(r'http\S+', '', phrase)  # for removal all http link
    phrase = re.sub(r'[^A-Za-z0-9]', ' ',phrase) # remove everything except alphaets and numbers
    
    sent = ''
    for val in phrase.split():          # removing multiple spaces between words
        sent = sent + ' ' + val
    sent = sent.strip()
    
    return sent
    


In [4]:
def get_sentiments(text_data):
    '''
    function to generate sentiments from text data
    '''
        
    sid = SentimentIntensityAnalyzer()
    neg=[]
    neu=[]
    pos=[]
    comp=[]
    
    for txt in text_data:
        ss = sid.polarity_scores(txt)
        neg.append(ss['neg'])
        neu.append(ss['neu'])
        pos.append(ss['pos'])
        comp.append(ss['compound'])
        
    sentiment_ = dict()
    sentiment_['neg'] = np.array(neg)[:,newaxis]
    sentiment_['neu'] = np.array(neu)[:,newaxis]
    sentiment_['pos'] = np.array(pos)[:,newaxis]
    sentiment_['comp'] = np.array(comp)[:,newaxis]

    return sentiment_

In [5]:
def cnn_model(train_vect_text, train_vect_cat_num):
    '''
    function to create CNN Model architecture
    '''

    #fixing numpy RS
    np.random.seed(42)

    #fixing tensorflow RS
    tensorflow.random.set_seed(32)
    
    #python RS
    rn.seed(12)

    #input 1
    input1 = Input(shape=(train_vect_text.shape[1],1), name = 'input_1')
    conv = Conv1D(filters = 32, kernel_size = 11,strides = 3, activation='relu',
                  kernel_initializer=tensorflow.keras.initializers.he_normal(seed=43),
                  kernel_regularizer=tensorflow.keras.regularizers.l2(0.1),
                  input_shape=(None,train_vect_text.shape[1],1))(input1)
    conv = MaxPooling1D()(conv)
    conv = Conv1D(32, 11, activation='relu',kernel_initializer=tensorflow.keras.initializers.he_normal(seed=43))(conv)
    conv = MaxPooling1D()(conv)
    flat_text = Flatten()(conv)
    out_1 =  Dense(30,activation='sigmoid')(flat_text)
    #====================================================================================================================

    #input 2
    input2 = Input(shape=(train_vect_cat_num.shape[1],1), name = 'input_2')
    conv = Conv1D(filters = 64, kernel_size = 3,strides = 2, activation='relu',
                  kernel_initializer=tensorflow.keras.initializers.he_normal(seed=43),
                  kernel_regularizer=tensorflow.keras.regularizers.l2(0.1),
                  input_shape=(None,train_vect_cat_num.shape[1],1))(input2)

    conv = MaxPooling1D()(conv)
    conv = Conv1D(32, 3,  activation='relu',kernel_initializer=tensorflow.keras.initializers.he_normal(seed=43))(conv)
    conv = MaxPooling1D()(conv)
    flat_rem = Flatten()(conv)
    out_2 =  Dense(30,activation='sigmoid')(flat_rem)
    #====================================================================================================================

    final_data = concatenate([out_1,out_2])

    #====================================================================================================================

    output = Dense(30,activation='sigmoid',kernel_initializer=tensorflow.keras.initializers.glorot_uniform(seed=45))(final_data)

    # create model with 2 inputs
    model = Model([input1,input2], output)

    model.compile(loss=tensorflow.keras.losses.binary_crossentropy,
              optimizer=tensorflow.keras.optimizers.Adam(0.001),
              metrics=['mae'])
    
    return model

In [6]:
# Reading glove vectors in python: https://stackoverflow.com/a/38230349/4084039
def loadGloveModel(gloveFile):
    print ("Loading Glove Model")
    f = open(gloveFile,'r', encoding="utf8")
    model = {}
    for line in tqdm(f):
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print ("Done.",len(model)," words loaded!")
    return model


In [7]:
def countVectorizer(df_train,df_cv,column):
    '''
    function to vectorize categorical data
    '''
    # one-hot encoding 'category' feature
    vect = CountVectorizer(binary=True)
    vect.fit(df_train[column])
    encoded_train = vect.transform(df_train[column]).todense()
    encoded_cv = vect.transform(df_cv[column]).todense()
    
    return encoded_train, encoded_cv, vect

In [9]:
def get_average_w2v(text_data,glove_vector,glove_words):
    '''
    function to vectorize text data into average word2vec
    '''    
    # computing average word2vec for each question title
    avg_w2v_vectors = []; # the avg-w2v for each question title is stored in this list
    for sentence in tqdm(text_data): # for each question title
        vector = np.zeros(300) # as word vectors are of zero length
        cnt_words = 0; # num of words with a valid vector in the question title
        for word in sentence.split(): # for each word in a question title
            if word in glove_words:
                vector += glove_vector[word]
                cnt_words += 1
        if cnt_words != 0:
            vector /= cnt_words
        avg_w2v_vectors.append(vector)
        
    return avg_w2v_vectors


In [10]:
from scipy.stats import spearmanr
def spearman(y_true,y_pred):
    '''
    function to calculate mean spearman correlation of all 30 class-labels
    '''
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    spearman_y = []
    for i in range(30):
        score = spearmanr(y_true[:,i], y_pred[:,i] + np.random.normal(0, 1e-7, y_pred.shape[0]) , 
                                                                        nan_policy='omit').correlation
                                                                        

        spearman_y.append(score)
    mean_score = np.nanmean(spearman_y)
    return mean_score

In [37]:
# List/array input
def predict_CNN(input_data):
    '''
    function to predict on raw data input
    input type should be an list/array of datapoints
    
    '''
    
    #loading saved files 
    # reading glove_vectors pickle file
    with open('glove_vectors', 'rb') as f:
        glove_vector = pickle.load(f)
        glove_words =  set(glove_vector.keys())
    
    with open('category_vectorizer', 'rb') as f:
        category_vectorizer = pickle.load(f)
        
    with open('host_vectorizer', 'rb') as f:
        host_vectorizer = pickle.load(f)
        
    with open("model.json", "r") as json_file:
        model_json = json_file.read()
        
    model = tf.keras.models.model_from_json(model_json)
    model.load_weights("weights_cnn.best_copy.hdf5")
    
    input_data = np.array(input_data) # converting input to a numpy array (if it is given as a list)

    #preprocessing host
    host = pd.Series(input_data[:,10]).apply(lambda x: x.split('.')[-2])
    host = host.apply(lambda x: x.lower())
    host = host.apply(lambda x: x.strip())
    #=================================================================================================
    # preprocessing 'category'  to lower-case and stripping leading and tailing spaces 
    category = pd.Series(input_data[:,9]).apply(lambda x: x.lower())
    category = category.apply(lambda x: x.strip())
    #=================================================================================================
    
    #encoding categorical data - 'category' and 'host'
    category_encoded = category_vectorizer.transform(category).todense()
    host_encoded = host_vectorizer.transform(host).todense()

    #=================================================================================================
    
    # preprocessing question title, question body and answers
    question_title = pd.Series(input_data[:,1]).apply(lambda x: preprocess_text(x))
    question_body = pd.Series(input_data[:,2]).apply(lambda x: preprocess_text(x))
    answer = pd.Series(input_data[:,5]).apply(lambda x: preprocess_text(x))

    #=================================================================================================
    
    # taking length of question title, question body, answer
    q_title_length = np.array(question_title.apply(lambda x: len(x.split(' '))))[:,newaxis]
    q_body_length = np.array(question_body.apply(lambda x: len(x.split(' '))))[:,newaxis]
    answer_length = np.array(answer.apply(lambda x: len(x.split(' '))))[:,newaxis]
    
    #==================================================================================================
    
    #creating sentiments features
    
    title_sentiments = get_sentiments(question_title)
    body_sentiments = get_sentiments(question_body)
    answer_sentiments = get_sentiments(answer)
    
    # encoding text data
    avg_w2v_vectors_title = get_average_w2v(question_title,glove_vector,glove_words)
    avg_w2v_vectors_body = get_average_w2v(question_body,glove_vector,glove_words)
    avg_w2v_vectors_answer = get_average_w2v(answer,glove_vector,glove_words)

   #=========================================================================================

    #concatenating final encoded data
    
    vect_text = np.hstack([avg_w2v_vectors_title,avg_w2v_vectors_body,
                        avg_w2v_vectors_answer])

    vect_cat_num = np.hstack([category_encoded,host_encoded,
                        q_title_length,q_body_length,answer_length, title_sentiments['neg'],
                              title_sentiments['neu'], title_sentiments['pos'], title_sentiments['comp'], 
                              body_sentiments['neg'],body_sentiments['neu'], body_sentiments['pos'], body_sentiments['comp'], 
                              answer_sentiments['neg'],answer_sentiments['neu'], answer_sentiments['pos'], answer_sentiments['comp']
                             ])
    
    
    # reshaping data for convolution layer
    vect_text = vect_text[:,:,newaxis]
    vect_cat_num = vect_cat_num[:,:,newaxis]
    
    #====================================================================================================
    
    #predicting on input data 
    prediction = model.predict([vect_text,vect_cat_num])
    prediction = np.hstack([input_data[:,0][:,newaxis],prediction]) #giving "qa_id" as 1st column to identify the predictions
    return prediction

In [38]:
train_data = pd.read_csv("google-quest-challenge/train.csv")
train_data = np.array(train_data)
train_output = predict_CNN(train_data)

100%|███████████████████████████████████████████████████████████████████████████| 6079/6079 [00:00<00:00, 15873.94it/s]
100%|████████████████████████████████████████████████████████████████████████████| 6079/6079 [00:03<00:00, 1534.66it/s]
100%|████████████████████████████████████████████████████████████████████████████| 6079/6079 [00:04<00:00, 1455.14it/s]


In [39]:
# printing output from predict_CNN function
train_output

array([[0, 0.9206799864768982, 0.6444732546806335, ...,
        0.12324012070894241, 0.7039588093757629, 0.9255793690681458],
       [1, 0.9051656126976013, 0.6512964963912964, ...,
        0.10090433806180954, 0.5737784504890442, 0.9125009179115295],
       [2, 0.880452573299408, 0.5040979981422424, ...,
        0.14553439617156982, 0.5804992914199829, 0.921263575553894],
       ...,
       [9645, 0.857827365398407, 0.47633975744247437, ...,
        0.0990651324391365, 0.30216357111930847, 0.8806634545326233],
       [9646, 0.921221911907196, 0.6582242846488953, ...,
        0.09240725636482239, 0.708876371383667, 0.9302746057510376],
       [9647, 0.9378359913825989, 0.7525654435157776, ...,
        0.09269876778125763, 0.7716101408004761, 0.9221950769424438]],
      dtype=object)

In [40]:
# getting actual value of class-labels
actual_train = train_data[:,11:]
actual_train.shape

(6079, 30)

In [41]:
# spearman's correlation on train data
spearman(actual_train,train_output[:,1:])

0.35904301041451253