In [1]:
# importing libraries
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import pickle
#from transformers import 
import numpy as np
from numpy import zeros, newaxis
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tqdm import tqdm
import random as rn
from tqdm import tqdm
import tensorflow 

from wordcloud import WordCloud, STOPWORDS 

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')

import tensorflow as tf
from tensorflow.keras.layers import Conv1D,AveragePooling1D,MaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Input,concatenate,Activation,Dropout,BatchNormalization,LSTM
from tensorflow.keras import regularizers,Model
from tensorflow.keras.regularizers import l1,l2
from tensorflow.keras.callbacks import TensorBoard, Callback, EarlyStopping, ModelCheckpoint,LearningRateScheduler
import tensorflow_hub as hub

from transformers import BertTokenizer



In [2]:
def get_sentiments(text_data, analyzer=None):
    '''
    function to generate sentiment scores from text data

    Parameters
    ----------
    text_data : iterable of str
        texts to score, one datapoint per element
    analyzer : object, optional
        any object exposing ``polarity_scores(text)`` that returns a mapping with the
        keys 'neg', 'neu', 'pos' and 'compound'. When omitted, a new
        ``SentimentIntensityAnalyzer`` is created for this call (the original behaviour).
        Passing one in lets a caller reuse a single analyzer across several calls.

    Returns
    -------
    dict
        keys 'neg', 'neu', 'pos' and 'comp' (the analyzer's 'compound' score);
        each value is a numpy array of shape (number of texts, 1)
    '''
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    # score every text once, then split the per-text dicts into one column per score
    scores = [analyzer.polarity_scores(txt) for txt in text_data]

    # output key -> key used by the analyzer ('compound' is exposed as 'comp')
    key_map = (('neg', 'neg'), ('neu', 'neu'), ('pos', 'pos'), ('comp', 'compound'))

    sentiment_ = dict()
    for out_key, score_key in key_map:
        column = np.array([score[score_key] for score in scores])
        # add a trailing axis so the column can be stacked horizontally with other features
        sentiment_[out_key] = column[:, np.newaxis]

    return sentiment_

In [7]:
from scipy.stats import spearmanr
def spearman(y_true,y_pred):
    '''
    function to calculate mean spearman correlation over all class-label columns

    Parameters
    ----------
    y_true : 2-D array-like, shape (n_samples, n_labels)
        actual label values
    y_pred : 2-D array-like, shape (n_samples, n_labels)
        predicted label values

    Returns
    -------
    float
        mean of the per-column spearman correlations; columns whose correlation
        is NaN are ignored by the mean
    '''
    # cast to float so object-dtype inputs (e.g. slices of a mixed-type array) are handled
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # use the real number of label columns instead of a hard-coded 30
    n_labels = y_true.shape[1]
    n_samples = y_pred.shape[0]

    spearman_y = []
    for i in range(n_labels):
        # tiny gaussian noise is added to the predictions before ranking
        # NOTE(review): presumably this breaks ties / constant prediction columns so the
        # correlation is defined - confirm; it also makes the score very slightly random
        jittered_pred = y_pred[:, i] + np.random.normal(0, 1e-7, n_samples)
        score = spearmanr(y_true[:, i], jittered_pred, nan_policy='omit').correlation
        spearman_y.append(score)

    mean_score = np.nanmean(spearman_y)
    return mean_score

#=====================================================================================================================

# BERT tokenizer (pretrained), built from the vocabulary file shipped with the saved BERT model;
# this module-level object is the one encode_text uses
BERT_VOCAB_FILE = '../input/bert-base-uncased3/bert_en_uncased_L-12_H-768_A-12_2/assets/vocab.txt'
tokenizer = BertTokenizer.from_pretrained(BERT_VOCAB_FILE)

def encode_text(text,max_len):
    '''
    function to encode a single text with the module-level BERT tokenizer

    The text is encoded on its own (no second sequence), special tokens are added
    and padding up to `max_len` is requested from the tokenizer.

    Returns the object produced by `tokenizer.encode_plus`; callers in this file read
    its 'input_ids', 'attention_mask' and 'token_type_ids' entries.

    NOTE(review): no explicit truncation argument is passed, so how texts longer than
    `max_len` are handled depends on the installed transformers version - confirm.
    '''
    return tokenizer.encode_plus(
        text,
        None,
        max_length=max_len,
        pad_to_max_length=True,
        add_special_tokens=True,
    )
    
#================================================================================

def _encode_texts(texts, max_len):
    # encode every text and split the tokenizer output into ids / masks / segments lists
    input_ids = []
    masks = []
    segments = []
    for text in texts:
        encoded_dict = encode_text(text, max_len)
        input_ids.append(encoded_dict['input_ids'])
        masks.append(encoded_dict['attention_mask'])
        segments.append(encoded_dict['token_type_ids'])
    return input_ids, masks, segments


def get_encoded_bert_inputs(title,body,answer,max_len):
    '''
    function to encode text data into BERT input form

    Parameters
    ----------
    title, body, answer : sequence of str (list, numpy array or pandas Series)
        question titles, question bodies and answers; `title` and `body` are paired
        by position
    max_len : int
        length passed on to `encode_text` for every text

    Returns
    -------
    tuple of six lists
        (q_input_ids, q_masks, q_segments, a_input_ids, a_masks, a_segments), where the
        question entries come from title + " [SEP] " + body and the answer entries from
        the answer text alone

    Raises
    ------
    IndexError
        if `body` has fewer elements than `title`
    '''
    # materialise to plain lists so that indexing is positional; `series[i]` on a pandas
    # Series is label-based and would fail or mis-pair rows with a non-default index
    titles = list(title)
    bodies = list(body)
    answers = list(answer)

    # question encoding: title and body are joined into one text per datapoint
    question_texts = [titles[i] + " [SEP] " + bodies[i] for i in range(len(titles))]
    q_input_ids, q_masks, q_segments = _encode_texts(question_texts, max_len)

    # answer encoding
    a_input_ids, a_masks, a_segments = _encode_texts(answers, max_len)

    return q_input_ids, q_masks, q_segments, a_input_ids, a_masks, a_segments


In [8]:
def get_sum_of_word_vec(input_vec):
    '''
    function to get sum of vectors for each word in a question/answer

    Parameters
    ----------
    input_vec : 3-D array/tensor indexed as [datapoint, word, feature]
        only the first datapoint (index 0 on the first axis) is read

    Returns
    -------
    the sum over the word axis, with shape (1, number of features)
    '''
    # take the vector width from the input instead of hard-coding 768
    vec_size = int(input_vec.shape[-1])
    v = np.zeros(shape=(1, vec_size))
    num_of_words = input_vec.shape[1]
    # accumulate with `+` so the result keeps whatever array/tensor type the input's `+` yields
    for j in range(num_of_words):
        v = v + input_vec[0,j,:]
    return v

#=========================================================================

def get_text_vector(bert_layer,input_ids,mask,seg):
    '''
    function to get BERT output that represents whole input sequence
    function takes the encoded output for each word and then sums up to get the final vector for each datapoint

    Parameters
    ----------
    bert_layer : callable
        called with [input_ids, mask, seg] for one datapoint; must return a pair whose
        second element is the per-word output
    input_ids, mask, seg : sequences of equal length
        encoded inputs, one entry per datapoint

    Returns
    -------
    the per-datapoint summed vectors stacked along axis 0 (one row per datapoint)

    Raises
    ------
    ValueError
        if `input_ids` is empty
    '''
    if len(input_ids) == 0:
        raise ValueError("input_ids is empty; there is nothing to encode")

    # collect the rows and concatenate once at the end; concatenating inside the loop
    # re-copies all previous rows on every iteration
    text_vectors = []
    for i in tqdm(range(len(input_ids))):
        # the i:i+1 slices keep a leading axis of length 1 (one datapoint per call)
        _pool, seq = bert_layer([input_ids[i:i+1],mask[i:i+1],seg[i:i+1]])
        text_vectors.append(get_sum_of_word_vec(seq))

    return tf.concat(text_vectors, axis=0)

    

In [9]:
def predict(input_data):
    '''
    function to predict on raw data input
    input type should be an list/array of datapoints

    Each datapoint is read by position: column 0 is "qa_id", 1 the question title,
    2 the question body, 5 the answer, 9 the category and 10 the host.

    The saved vectorizers, the model architecture/weights and the BERT layer are loaded
    from disk on every call.

    Returns
    -------
    numpy array with "qa_id" as 1st column followed by the model predictions
    '''
    # positions of the fields read from each datapoint
    QA_ID_COL = 0
    TITLE_COL = 1
    BODY_COL = 2
    ANSWER_COL = 5
    CATEGORY_COL = 9
    HOST_COL = 10
    MAX_LEN = 450  # length every question / answer is encoded to

    #loading saved files
    # NOTE(review): pickle.load can execute arbitrary code from the file - only load
    # vectorizer files that come from a trusted source
    with open('../input/bert-files/category_vectorizer', 'rb') as f:
        category_vectorizer = pickle.load(f)

    with open('../input/bert-files/host_vectorizer', 'rb') as f:
        host_vectorizer = pickle.load(f)

    with open("../input/bert-files/model.json", "r") as json_file:
        model_json = json_file.read()

    model = tf.keras.models.model_from_json(model_json)
    model.load_weights("../input/bert-files/weights_final4.best_copy.hdf5")

    # no tokenizer is loaded here: the text is encoded through get_encoded_bert_inputs /
    # encode_text, which use the module-level tokenizer
    bert_layer = hub.KerasLayer("../input/bert-base-uncased3/bert_en_uncased_L-12_H-768_A-12_2", trainable=False)

    input_data = np.array(input_data)  # converting input to a numpy array (if it is given as a list)

    #=================================================================================================
    # preprocessing host: keep the second-to-last dot-separated part, lower-cased and stripped
    host = pd.Series(input_data[:,HOST_COL]).apply(lambda x: x.split('.')[-2].lower().strip())

    # preprocessing 'category' to lower-case and stripping leading and trailing spaces
    category = pd.Series(input_data[:,CATEGORY_COL]).apply(lambda x: x.lower().strip())

    #=================================================================================================
    # encoding categorical data - 'category' and 'host'
    # np.asarray turns the np.matrix returned by .todense() into a plain ndarray, so the
    # stacking and the 3-D reshape below work on ordinary arrays
    category_encoded = np.asarray(category_vectorizer.transform(category).todense())
    host_encoded = np.asarray(host_vectorizer.transform(host).todense())

    #=================================================================================================
    # getting question title, question body and answers from input array
    question_title = pd.Series(input_data[:,TITLE_COL])
    question_body = pd.Series(input_data[:,BODY_COL])
    answer = pd.Series(input_data[:,ANSWER_COL])

    #=================================================================================================
    # taking length (number of space-separated pieces) of question title, question body, answer
    q_title_length = np.array(question_title.apply(lambda x: len(x.split(' '))))[:,np.newaxis]
    q_body_length = np.array(question_body.apply(lambda x: len(x.split(' '))))[:,np.newaxis]
    answer_length = np.array(answer.apply(lambda x: len(x.split(' '))))[:,np.newaxis]

    #creating sentiments features
    title_sentiments = get_sentiments(question_title)
    body_sentiments = get_sentiments(question_body)
    answer_sentiments = get_sentiments(answer)

    #=================================================================================================
    # getting inputs for BERT
    q_input_ids_test,q_mask_test,q_seg_test,a_input_ids_test,a_mask_test,a_seg_test = get_encoded_bert_inputs(
        question_title,question_body,answer,max_len=MAX_LEN)

    # getting vector output (representing whole input sequence) of (question_title+question_body) from BERT
    question_vect_test = get_text_vector(bert_layer,q_input_ids_test,q_mask_test,q_seg_test)

    # getting vector output (representing whole input sequence) of answer from BERT
    answer_vect_test = get_text_vector(bert_layer,a_input_ids_test,a_mask_test,a_seg_test)

    # concatenating question and answer output from BERT
    final_text_vector = tf.keras.layers.concatenate([question_vect_test,answer_vect_test])

    #=================================================================================================
    # sentiment columns in the order: title, body, answer - each as neg, neu, pos, comp
    sentiment_columns = [
        sentiments[key]
        for sentiments in (title_sentiments, body_sentiments, answer_sentiments)
        for key in ('neg', 'neu', 'pos', 'comp')
    ]

    vect_cat_num = np.hstack(
        [category_encoded, host_encoded, q_title_length, q_body_length, answer_length]
        + sentiment_columns
    )

    # reshaping data for convolution layer
    vect_cat_num = vect_cat_num[:,:,np.newaxis]

    #==============================================================================================================

    prediction = model.predict([final_text_vector,vect_cat_num])
    prediction = np.hstack([input_data[:,QA_ID_COL][:,np.newaxis],prediction]) #giving "qa_id" as 1st column to identify the predictions
    return prediction

In [10]:
# prediction on train data: read the csv, convert it to a plain array (predict reads
# its fields by column position) and run the prediction function on it
train_frame = pd.read_csv("../input/google-quest-challenge/train.csv")
train_data = np.array(train_frame)
train_output = predict(train_data)

100%|██████████| 6079/6079 [13:05<00:00,  7.74it/s]
100%|██████████| 6079/6079 [12:50<00:00,  7.89it/s]


In [11]:
# getting actual value of class-labels: every column from position 11 onwards
LABEL_START_COL = 11
actual_train = train_data[:, LABEL_START_COL:]
actual_train.shape

(6079, 30)

In [12]:
# spearman's correlation on train data
# (the first column of train_output is "qa_id", so it is dropped before scoring)
predicted_train = train_output[:, 1:]
spearman(actual_train, predicted_train)

0.4021423208910162