# ALBERT + CNN

In [1]:
# importing libraries
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import pickle
#from transformers import 
import numpy as np
from numpy import zeros, newaxis
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tqdm import tqdm
import random as rn
import tensorflow 

from wordcloud import WordCloud, STOPWORDS 

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')

import tensorflow as tf
from tensorflow.keras.layers import Conv1D,AveragePooling1D,MaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Input,concatenate,Activation,Dropout,BatchNormalization,LSTM
from tensorflow.keras import regularizers,Model
from tensorflow.keras.regularizers import l1,l2
from tensorflow.keras.callbacks import TensorBoard, Callback, EarlyStopping, ModelCheckpoint,LearningRateScheduler
import tensorflow_hub as hub

from transformers import AlbertTokenizer, BertTokenizer



In [2]:
# Load the Google QUEST training data from the Kaggle input directory.
train = pd.read_csv("../input/google-quest-challenge/train.csv")

In [3]:

def get_sentiments(df,column_list):
    '''
    Add VADER sentiment scores for every text column named in `column_list`.

    For each column `col`, four new columns are written to `df`:
    `col_neg`, `col_neu`, `col_pos` and `col_comp`, holding the 'neg', 'neu',
    'pos' and 'compound' entries returned by
    `SentimentIntensityAnalyzer.polarity_scores` for each row's text.

    Parameters
    ----------
    df : DataFrame containing the columns in `column_list`. It is modified in place.
    column_list : iterable of column names whose values are text.

    Returns
    -------
    The same `df` object, with the score columns added.
    '''
    # The analyzer does not depend on the column being scored, so create it
    # once instead of once per column.
    sid = SentimentIntensityAnalyzer()

    for col in column_list:
        scores = [sid.polarity_scores(txt) for txt in df[col]]

        df[col+"_neg"] = [s['neg'] for s in scores]
        df[col+"_neu"] = [s['neu'] for s in scores]
        df[col+"_pos"] = [s['pos'] for s in scores]
        df[col+"_comp"] = [s['compound'] for s in scores]

    return df

#==================================================

def countVectorizer(df_train,df_cv,column):
    '''
    Binary bag-of-words encoding of a single column.

    The vocabulary is learned from `df_train[column]` only and is then applied
    to both frames.

    Returns
    -------
    (encoded_train, encoded_cv, vectorizer) : the two dense encodings and the
    fitted CountVectorizer, so the same vocabulary can be re-used later.
    '''
    # binary=True records presence/absence of each token rather than a count
    vectorizer = CountVectorizer(binary=True).fit(df_train[column])

    dense_train, dense_cv = [
        vectorizer.transform(frame[column]).todense() for frame in (df_train, df_cv)
    ]
    return dense_train, dense_cv, vectorizer

In [4]:
from scipy.stats import spearmanr
def spearman(y_true,y_pred):
    '''
    Mean column-wise Spearman rank correlation between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : 2-D array-likes (rows = samples, columns = labels) with
        the same number of columns. Any number of label columns is accepted.

    Returns
    -------
    The mean of the per-column correlations, ignoring columns whose
    correlation is NaN.
    '''
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    n_rows = y_pred.shape[0]
    spearman_y = []
    # Iterate over however many label columns are present instead of assuming 30.
    for i in range(y_true.shape[1]):
        # Tiny Gaussian jitter on the predictions; presumably added to break
        # ties so a constant prediction column does not yield an undefined score.
        jitter = np.random.normal(0, 1e-7, n_rows)
        score = spearmanr(y_true[:,i], y_pred[:,i] + jitter, nan_policy='omit').correlation
        spearman_y.append(score)

    mean_score = np.nanmean(spearman_y)
    return mean_score

In [5]:
#=================================PREPROCESSING===================================

# host: keep the second-to-last dot-separated part, lower-cased and stripped
train['host'] = train['host'].apply(lambda host: host.split('.')[-2].lower().strip())

# category: lower-case and strip surrounding whitespace
train['category'] = train['category'].apply(lambda cat: cat.lower().strip())

#=================================FEATURE ENGINEERING=============================

# number of space-separated pieces in the question title, question body and answer
for feature_name, text_col in [('q_title_length', 'question_title'),
                               ('q_body_length', 'question_body'),
                               ('answer_length', 'answer')]:
    train[feature_name] = train[text_col].apply(lambda text: len(text.split(' ')))

# sentiment score columns for the three text fields
train = get_sentiments(train, ['question_title', 'question_body', 'answer'])

In [6]:
# The 30 target columns: 21 question-level labels followed by 9 answer-level labels.
class_labels = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

# Features are everything except the targets; targets keep the order above.
y = train[class_labels]
X = train.drop(columns=class_labels)

print(X.shape, y.shape)

(6079, 26) (6079, 30)


In [7]:
# Hold out 5% of the rows as a cross-validation set, stratified on 'category'.
# random_state is fixed so the split (and every score computed from it) is the
# same after a kernel restart.
x_train,x_cv,y_train,y_cv = train_test_split(X,y,test_size=0.05,stratify = X['category'],
                                             random_state=42)
print("Train shape: ", x_train.shape,y_train.shape)
print("Test shape: ", x_cv.shape,y_cv.shape)

Train shape:  (5775, 26) (5775, 30)
Test shape:  (304, 26) (304, 30)


In [8]:
# Encoding categorical data
# 'category' and 'host' are each turned into binary bag-of-words columns.
# countVectorizer fits on the training split only; the fitted vectorizers are
# kept so the test set can be encoded with the same vocabulary later.
category_encoded_train, category_encoded_cv, category_vectorizer = countVectorizer(x_train,x_cv,'category')
host_encoded_train, host_encoded_cv, host_vectorizer = countVectorizer(x_train,x_cv,'host')

# quick check of the encoded widths
print(category_encoded_train.shape)
print(host_encoded_train.shape)

(5775, 5)
(5775, 6)


In [10]:
# BERT tokenizer (pretrained)
# Built from the vocab.txt stored in the assets folder of the same BERT model
# directory that is later loaded with hub.KerasLayer.
tokenizer = BertTokenizer.from_pretrained('../input/bert-base-uncased3/bert_en_uncased_L-12_H-768_A-12_2/assets/vocab.txt')


def encode_text(text,max_len):
    '''
    Tokenize `text` with the module-level BERT `tokenizer`.

    Special tokens are added and the result is padded to `max_len` tokens.

    Returns the dict produced by `tokenizer.encode_plus`; callers in this
    notebook read its 'input_ids', 'attention_mask' and 'token_type_ids' entries.
    '''
    # NOTE(review): `pad_to_max_length` is deprecated in newer transformers
    # releases in favour of `padding=`/`truncation=` -- confirm against the
    # installed version before upgrading.
    encode_options = dict(max_length=max_len, pad_to_max_length=True, add_special_tokens=True)
    return tokenizer.encode_plus(text, None, **encode_options)
    
#================================================================================

def get_encoded_bert_inputs(df,max_len):
    '''
    Encode the question and answer text of every row of `df` into BERT inputs.

    The question text is "question_title [SEP] question_body"; the answer text
    is the 'answer' column. Each text is encoded with `encode_text(text, max_len)`.

    Returns
    -------
    Six lists, one entry per row, in this order:
    question input ids, question attention masks, question token type ids,
    answer input ids, answer attention masks, answer token type ids.
    '''
    def _encode_all(texts):
        # Encode every text and split the resulting dicts into three parallel lists.
        ids, masks, segments = [], [], []
        for text in texts:
            encoded = encode_text(text, max_len)
            ids.append(encoded['input_ids'])
            masks.append(encoded['attention_mask'])
            segments.append(encoded['token_type_ids'])
        return ids, masks, segments

    question_texts = [df['question_title'][row] + " [SEP] " + df['question_body'][row]
                      for row in df.index]
    answer_texts = [df['answer'][row] for row in df.index]

    # questions are encoded first, then answers
    q_input_ids, q_masks, q_segments = _encode_all(question_texts)
    a_input_ids, a_masks, a_segments = _encode_all(answer_texts)

    return q_input_ids, q_masks, q_segments, a_input_ids, a_masks, a_segments


In [11]:
# getting inputs for BERT
# Token ids, attention masks and segment ids for questions and answers,
# each padded to 450 tokens, for the train split and then the CV split.
q_input_ids,q_mask,q_seg,a_input_ids,a_mask,a_seg = get_encoded_bert_inputs(x_train,max_len=450)
q_input_ids_cv,q_mask_cv,q_seg_cv,a_input_ids_cv,a_mask_cv,a_seg_cv = get_encoded_bert_inputs(x_cv,max_len=450)

In [12]:
# defining base BERT layer
# Loaded from the local copy of the model directory; trainable=False keeps the
# BERT weights frozen, so it is used purely as a feature extractor.
bert_layer = hub.KerasLayer("../input/bert-base-uncased3/bert_en_uncased_L-12_H-768_A-12_2",
                              trainable=False)

In [13]:
def get_sum_of_word_vec(input_vec):
    '''
    Sum the per-token vectors of the first sequence in `input_vec`.

    Parameters
    ----------
    input_vec : rank-3 array/tensor indexed as [sequence, token, feature].
        Only sequence 0 is read.

    Returns
    -------
    An array/tensor of shape (1, feature_size) holding the sum over all token
    positions. Every position is included; no mask is applied here.
    '''
    # Take the feature width from the input instead of assuming 768, so
    # encoders with a different hidden size also work.
    hidden_size = input_vec.shape[-1]
    v = np.zeros(shape=(1,hidden_size))
    num_of_words = input_vec.shape[1]
    for j in range(num_of_words):
        v = v + input_vec[0,j,:]
    return v

#=========================================================================

from tqdm import tqdm

def get_text_vector(albert_layer,input_ids,mask,seg):
    '''
    Turn encoded sequences into one fixed-size vector per sequence.

    Each sequence is passed through `albert_layer` on its own (as a batch of
    one). The layer is expected to return `(pooled_output, sequence_output)`;
    the per-token sequence output is summed over tokens with
    `get_sum_of_word_vec`, and the per-row results are stacked.

    Parameters
    ----------
    albert_layer : callable encoder layer taking `[input_ids, mask, seg]`.
    input_ids, mask, seg : equally long sequences of encoded inputs.

    Returns
    -------
    A tensor with one row per input sequence.

    Raises
    ------
    ValueError if `input_ids` is empty.
    '''
    if len(input_ids) == 0:
        raise ValueError("get_text_vector() needs at least one encoded sequence")

    # Collect the per-row vectors and concatenate once at the end; growing the
    # result inside the loop re-copies everything accumulated so far.
    row_vectors = []
    for i in tqdm(range(len(input_ids))):
        _pool, seq = albert_layer([input_ids[i:i+1],mask[i:i+1],seg[i:i+1]])
        row_vectors.append(get_sum_of_word_vec(seq))

    return tf.concat(row_vectors, axis=0)
    

    

In [14]:
# Question vectors for the train split: one frozen-BERT forward pass per row,
# with the token outputs summed into a single vector (see get_text_vector).
question_vect = get_text_vector(bert_layer,q_input_ids,q_mask,q_seg)

100%|██████████| 5775/5775 [12:15<00:00,  7.86it/s]


In [17]:
# Answer vectors for the train split: one frozen-BERT forward pass per row,
# with the token outputs summed into a single vector (see get_text_vector).
answer_vect = get_text_vector(bert_layer,a_input_ids,a_mask,a_seg)

100%|██████████| 5775/5775 [12:08<00:00,  7.93it/s]


In [19]:
# Question vectors for the cross-validation split, computed the same way as
# for the train split.
question_vect_cv = get_text_vector(bert_layer,q_input_ids_cv,q_mask_cv,q_seg_cv)

100%|██████████| 304/304 [00:38<00:00,  7.90it/s]


In [21]:
# Answer vectors for the cross-validation split, computed the same way as
# for the train split.
answer_vect_cv = get_text_vector(bert_layer,a_input_ids_cv,a_mask_cv,a_seg_cv)

100%|██████████| 304/304 [00:38<00:00,  7.86it/s]


In [23]:
# Concatenate the question and answer vectors from BERT, so each row's text
# representation is [question vector | answer vector].
final_text_vector_train = tf.keras.layers.concatenate([question_vect,answer_vect])
final_text_vector_cv = tf.keras.layers.concatenate([question_vect_cv,answer_vect_cv])

# sanity check: row counts should match the train / CV splits
print("Train text vector shape = ",final_text_vector_train.shape)
print("CV text vector shape = ",final_text_vector_cv.shape)

Train text vector shape =  (5775, 1536)
CV text vector shape =  (304, 1536)


In [25]:
# Dense non-text features for train and cross-validation: encoded 'category'
# and 'host' followed by the length and sentiment columns, with a trailing
# axis of size 1 added so they match the (features, 1) input of the CNN.
cat_num_columns = ['q_title_length', 'q_body_length', 'answer_length',
                   'question_title_neg', 'question_title_neu', 'question_title_pos', 'question_title_comp',
                   'question_body_neg', 'question_body_neu', 'question_body_pos', 'question_body_comp',
                   'answer_neg', 'answer_neu', 'answer_pos', 'answer_comp']

train_vect_cat_num = np.hstack([category_encoded_train, host_encoded_train, x_train[cat_num_columns]])
cv_vect_cat_num = np.hstack([category_encoded_cv, host_encoded_cv, x_cv[cat_num_columns]])

train_vect_cat_num = train_vect_cat_num[:, :, newaxis]
cv_vect_cat_num = cv_vect_cat_num[:, :, newaxis]
print(train_vect_cat_num.shape, cv_vect_cat_num.shape)

(5775, 26, 1) (304, 26, 1)


In [77]:
def cnn_model(train_vect_text, train_vect_cat_num):
    '''
    Build and compile the two-input 1-D CNN.

    Only the second dimension of each argument is used, to size the inputs:
      * 'input_1' has shape (train_vect_text.shape[1], 1)      -- text vector
      * 'input_2' has shape (train_vect_cat_num.shape[1], 1)   -- other features

    Each input goes through Conv1D -> MaxPooling1D -> Conv1D -> MaxPooling1D ->
    Flatten -> Dense(sigmoid). The two branch outputs are concatenated and fed
    to a 30-unit sigmoid output layer.

    Returns the model compiled with binary cross-entropy loss, Adam(0.001) and
    the 'mae' metric.
    '''
    keras = tensorflow.keras

    # Fix the NumPy, TensorFlow and Python RNG seeds before any layer is built.
    np.random.seed(42)
    tensorflow.random.set_seed(32)
    rn.seed(12)

    def he_init():
        # a fresh He-normal initializer per layer, always with the same seed
        return keras.initializers.he_normal(seed=43)

    def conv_tower(inputs, width, filters, kernel, l2_first, l2_second, dense_units):
        # Conv -> pool -> conv -> pool -> flatten -> dense for one input branch.
        # l2_second=None means the second convolution has no kernel regularizer.
        x = Conv1D(filters=filters[0], kernel_size=kernel, strides=1, activation='relu',
                   kernel_initializer=he_init(),
                   kernel_regularizer=keras.regularizers.l2(l2_first),
                   input_shape=(None, width, 1))(inputs)
        x = MaxPooling1D()(x)

        second_conv_options = {}
        if l2_second is not None:
            second_conv_options['kernel_regularizer'] = keras.regularizers.l2(l2_second)
        x = Conv1D(filters[1], kernel, activation='relu',
                   kernel_initializer=he_init(), **second_conv_options)(x)
        x = MaxPooling1D()(x)

        flat = Flatten()(x)
        return Dense(dense_units, activation='sigmoid')(flat)

    # branch 1: text vector
    text_width = train_vect_text.shape[1]
    input1 = Input(shape=(text_width, 1), name='input_1')
    out_1 = conv_tower(input1, text_width, (32, 16), 7, 0.6, 0.6, 768)

    # branch 2: categorical + numerical features
    cat_num_width = train_vect_cat_num.shape[1]
    input2 = Input(shape=(cat_num_width, 1), name='input_2')
    out_2 = conv_tower(input2, cat_num_width, (64, 32), 3, 0.2, None, 48)

    # merge both branches and predict the 30 labels
    merged = concatenate([out_1, out_2])
    output = Dense(30, activation='sigmoid',
                   kernel_initializer=keras.initializers.glorot_uniform(seed=45))(merged)

    model = Model([input1, input2], output)
    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer=keras.optimizers.Adam(0.001),
                  metrics=['mae'])

    return model

In [78]:
# creating a CNN model
# The two arrays are passed only so cnn_model can read their second dimension
# and size the two Input layers.
model = cnn_model(final_text_vector_train,train_vect_cat_num)
model.summary()

Model: "model_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1536, 1)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 26, 1)]      0                                            
__________________________________________________________________________________________________
conv1d_52 (Conv1D)              (None, 1530, 32)     256         input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_54 (Conv1D)              (None, 24, 64)       256         input_2[0][0]                    
___________________________________________________________________________________________

In [79]:
# checkpoint model for best weights
# save_best_only with monitor='val_loss' / mode='min': the file is overwritten
# only when the validation loss improves.
filepath="weights_zn.best_copy.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, mode='min')

# fitting model on train data
# validation_split=0.05 carves the monitored validation set out of the training
# arrays; the separate x_cv / y_cv hold-out is not used during fitting.
history_model = model.fit([final_text_vector_train,train_vect_cat_num], y_train, 
                                 epochs=35,
                                 verbose=2,batch_size=32,
                                 callbacks = [checkpoint],
                                 validation_split = 0.05)



Epoch 1/35
172/172 - 1s - loss: 56.2657 - mae: 0.1750 - val_loss: 40.4854 - val_mae: 0.1730
Epoch 2/35
172/172 - 1s - loss: 32.5068 - mae: 0.1722 - val_loss: 25.8240 - val_mae: 0.1725
Epoch 3/35
172/172 - 1s - loss: 21.0699 - mae: 0.1730 - val_loss: 16.8953 - val_mae: 0.1741
Epoch 4/35
172/172 - 1s - loss: 13.8182 - mae: 0.1717 - val_loss: 11.1115 - val_mae: 0.1699
Epoch 5/35
172/172 - 1s - loss: 9.1021 - mae: 0.1607 - val_loss: 7.3264 - val_mae: 0.1523
Epoch 6/35
172/172 - 1s - loss: 6.0271 - mae: 0.1507 - val_loss: 4.8744 - val_mae: 0.1510
Epoch 7/35
172/172 - 1s - loss: 4.0209 - mae: 0.1477 - val_loss: 3.2739 - val_mae: 0.1518
Epoch 8/35
172/172 - 1s - loss: 2.7177 - mae: 0.1453 - val_loss: 2.2325 - val_mae: 0.1489
Epoch 9/35
172/172 - 1s - loss: 1.8706 - mae: 0.1435 - val_loss: 1.5621 - val_mae: 0.1489
Epoch 10/35
172/172 - 1s - loss: 1.3238 - mae: 0.1426 - val_loss: 1.1288 - val_mae: 0.1519
Epoch 11/35
172/172 - 1s - loss: 0.9711 - mae: 0.1415 - val_loss: 0.8489 - val_mae: 0.1483


In [80]:
# loading best weights for prediction
# (the checkpoint file written by the ModelCheckpoint callback during fit)
model.load_weights("weights_zn.best_copy.hdf5")
    
# predicting on train data and calculating spearman correlation 
y_tr_pred = model.predict([final_text_vector_train,train_vect_cat_num])
print("Spearman's correlation on train data = ",spearman(y_train,y_tr_pred))
    
# predicting on cross-validation data and calculating spearman correlation 
# (x_cv / y_cv were held out by train_test_split and not passed to fit)
y_cv_pred = model.predict([final_text_vector_cv,cv_vect_cat_num])
print("Spearman's correlation on cross-validation data = ",spearman(y_cv,y_cv_pred))
    


Spearman's correlation on train data =  0.4076806249634812
Spearman's correlation on cross-validation data =  0.3618157991884826


# TEST

In [None]:
# Load the Google QUEST test data from the Kaggle input directory.
test = pd.read_csv("../input/google-quest-challenge/test.csv")

In [None]:
#=================================PREPROCESSING===================================

# host: keep the second-to-last dot-separated part, lower-cased and stripped
test['host'] = test['host'].apply(lambda host: host.split('.')[-2].lower().strip())

# category: lower-case and strip surrounding whitespace
test['category'] = test['category'].apply(lambda cat: cat.lower().strip())

#=================================FEATURE ENGINEERING=============================

# number of space-separated pieces in the question title, question body and answer
for feature_name, text_col in [('q_title_length', 'question_title'),
                               ('q_body_length', 'question_body'),
                               ('answer_length', 'answer')]:
    test[feature_name] = test[text_col].apply(lambda text: len(text.split(' ')))

# sentiment score columns for the three text fields
test = get_sentiments(test, ['question_title', 'question_body', 'answer'])

In [None]:
# Encoding categorical data - 'category' and 'host'.
# The vectorizers fitted on the training split are re-used (transform only),
# so the test columns line up with the training columns.
category_encoded_test = category_vectorizer.transform(test['category']).todense()
host_encoded_test = host_vectorizer.transform(test['host']).todense()


In [None]:
# getting inputs for BERT
# Same max_len (450) as used for the train and cross-validation splits.
q_input_ids_test,q_mask_test,q_seg_test,a_input_ids_test,a_mask_test,a_seg_test = get_encoded_bert_inputs(test,max_len=450)

In [None]:
# Question vectors for the test data: one frozen-BERT forward pass per row,
# with the token outputs summed into a single vector (see get_text_vector).
question_vect_test = get_text_vector(bert_layer,q_input_ids_test,q_mask_test,q_seg_test)

In [None]:
# Answer vectors for the test data: one frozen-BERT forward pass per row,
# with the token outputs summed into a single vector (see get_text_vector).
answer_vect_test = get_text_vector(bert_layer,a_input_ids_test,a_mask_test,a_seg_test)

In [None]:
# Concatenate the question and answer vectors from BERT, in the same
# [question | answer] order used for the training data.
final_text_vector_test = tf.keras.layers.concatenate([question_vect_test,answer_vect_test])

print("Test text vector shape = ",final_text_vector_test.shape)


In [None]:
# Dense non-text features for the test data: encoded 'category' and 'host'
# followed by the length and sentiment columns, with a trailing axis of size 1
# added so they match the (features, 1) input of the CNN.
test_cat_num_columns = ['q_title_length', 'q_body_length', 'answer_length',
                        'question_title_neg', 'question_title_neu', 'question_title_pos', 'question_title_comp',
                        'question_body_neg', 'question_body_neu', 'question_body_pos', 'question_body_comp',
                        'answer_neg', 'answer_neu', 'answer_pos', 'answer_comp']

test_vect_cat_num = np.hstack([category_encoded_test, host_encoded_test, test[test_cat_num_columns]])
test_vect_cat_num = test_vect_cat_num[:, :, newaxis]

print(test_vect_cat_num.shape)

In [None]:
# Predict the 30 label scores for every test row; inputs are passed in the
# same [text vector, categorical/numerical features] order the model was built with.
y_test_pred = model.predict([final_text_vector_test,test_vect_cat_num])

In [None]:
# Submission frame: qa_id first, then one column per label, in class_labels order.
submission = pd.DataFrame({'qa_id': test['qa_id']})
for position, label in enumerate(class_labels):
    # column `position` of the prediction matrix belongs to `label`
    submission[label] = y_test_pred[:, position]
submission.shape

In [None]:
# Write the submission file to the working directory; index=False keeps the
# DataFrame index out of the CSV.
submission.to_csv("submission.csv",index=False)