In [148]:
# Importing libraries to be needed
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import hstack
from keras.layers import Dense
from gensim.models import Word2Vec
from keras.models import Sequential
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [149]:
# Load data from csv file
data = pd.read_csv('./data/train.csv')

In [150]:
data.columns

Index(['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfa

In [151]:
# Finding if any null value present.
data.isna().any()

qa_id                                    False
question_title                           False
question_body                            False
question_user_name                       False
question_user_page                       False
answer                                   False
answer_user_name                         False
answer_user_page                         False
url                                      False
category                                 False
host                                     False
question_asker_intent_understanding      False
question_body_critical                   False
question_conversational                  False
question_expect_short_answer             False
question_fact_seeking                    False
question_has_commonly_accepted_answer    False
question_interestingness_others          False
question_interestingness_self            False
question_multi_intent                    False
question_not_really_a_question           False
question_opin

In [152]:
# Following are the target of this model.
target=['question_asker_intent_understanding','question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking','question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self','question_multi_intent',
       'question_not_really_a_question','question_opinion_seeking', 'question_type_choice',
       'question_type_compare','question_type_consequence','question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure','question_type_reason_explanation',
       'question_type_spelling','question_well_written',
       'answer_helpful','answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions','answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written']

# These columns needs to be dropped from the dataset because it is not needed.
drop_cols=['qa_id','question_user_name','question_user_page','answer_user_name','answer_user_page','url']

In [153]:
# Drop irrelevant columns from dataset.
data.drop(drop_cols,inplace=True,axis=1)
data['question']=data['question_title']+data['question_body']
data.drop(['question_title','question_body'],axis=1,inplace=True)
data.columns

Index(['answer', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions',
       'answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written', 'question'],
    

In [154]:
data=data[['question','answer', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions',
       'answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written']]

In [155]:
# Processing qustions and answers removing all special characters.
import re
questions=[]
answers=[]
for sent in data['question']:
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    questions.append(sent.strip())
for sent in data['answer']:
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    answers.append(sent.strip())
    
data['questions']=questions
data['answers']=answers
data.head(2)

Unnamed: 0,question,answer,category,host,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,...,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written,questions,answers
0,What am I losing when using extension tubes in...,"I just got extension tubes, so here's the skin...",LIFE_ARTS,photo.stackexchange.com,1.0,0.333333,0.0,0.0,0.0,0.0,...,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0,What am I losing when using extension tubes in...,I just got extension tubes so here s the skinn...
1,What is the distinction between a city and a s...,It might be helpful to look into the definitio...,CULTURE,rpg.stackexchange.com,1.0,1.0,0.0,0.5,1.0,1.0,...,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889,What is the distinction between a city and a s...,It might be helpful to look into the definitio...


In [156]:
cols=['question','answer', 'category', 'host']

In [157]:
# Getting target out of base dataset into seperate dataset Y.
Y=data[target]

# Removing target from base dataset.
data.drop(target,axis=1,inplace=True)

# Splitting dataset into test and train data for future use.
X_train,X_test,y_train,y_test=train_test_split(data,Y,test_size=0.20)

# Furthur splitting training data into training and validation data for better results.
X_train,X_cv,y_train,y_cv=train_test_split(X_train,y_train,test_size=0.20)

In [158]:
# Getting list of unique hosts (platforms) where quesitons were asked from train data.
host_list=list(X_train['host'].value_counts().index)

# Appending index of host from host list to another array for future use using train data.
train_host_encoded=[]
for i in X_train['host']:
    if i in host_list:
        train_host_encoded.append(host_list.index(i))
    else:
        train_host_encoded.append(-1)

# Appending index of host from host list to another array for future use using test data.
test_host_encoded=[]
for i in X_test['host']:
    if i in host_list:
        test_host_encoded.append(host_list.index(i))
    else:
        test_host_encoded.append(-1)

# Appending index of host from host list to another array for future use using validation data.
cv_host_encoded=[]
for i in X_cv['host']:
    if i in host_list:
        cv_host_encoded.append(host_list.index(i))
    else:
        cv_host_encoded.append(-1)

In [159]:
# Getting list of unique categories of questions from train data.
cat_list=list(X_train['category'].value_counts().index)

# Appending index of categories from caegories list to another array for future use using train data.
train_cat_encoded=[]
for i in X_train['category']:
    if i in cat_list:
        train_cat_encoded.append(cat_list.index(i))
    else:
        train_cat_encoded.append(-1)

# Appending index of categories from caegories list to another array for future use using test data.
test_cat_encoded=[]
for i in X_test['category']:
    if i in cat_list:
        test_cat_encoded.append(cat_list.index(i))
    else:
        test_cat_encoded.append(-1)

# Appending index of categories from caegories list to another array for future use using validation data.
cv_cat_encoded=[]
for i in X_cv['category']:
    if i in cat_list:
        cv_cat_encoded.append(cat_list.index(i))
    else:
        cv_cat_encoded.append(-1)

In [160]:
# Making a word corpus using words used questions and answers  
word_corpus=[]
for sentence in data['question']:
    sent=[]
    for w in sentence.split():
        sent.append(w)
    word_corpus.append(sent)
for sentence in data['answer']:
    sent=[]
    for w in sentence.split():
        sent.append(w)
    word_corpus.append(sent)

In [161]:
# Word2vec is a technique for natural language processing. 
# The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text.
# min_count -> frequecy of word occurace
# size -> the dimensionality of the vector.
w2vmodel=Word2Vec(word_corpus, min_count=4,size=300)

In [162]:
# tqdm -> Instantly make your loops show a smart progress meter - just wrap any iterable with
# Using Word2vec to make vector of word association for every question in training data
w2v_question_train = []
for sentence in tqdm(X_train['question'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_question_train.append(vector)

print(len(w2v_question_train))
print(len(w2v_question_train[0]))
print(type(w2v_question_train))

100%|██████████| 3890/3890 [00:07<00:00, 509.77it/s]

3890
300
<class 'list'>





In [163]:
# Using Word2vec to make vector of word association for every question in test data
w2v_question_test = []
for sentence in tqdm(X_test['question'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_question_test.append(vector)

print(len(w2v_question_test))
print(len(w2v_question_test[0]))
print(type(w2v_question_test))

100%|██████████| 1216/1216 [00:03<00:00, 382.98it/s]

1216
300
<class 'list'>





In [164]:
# Using Word2vec to make vector of word association for every question in validation data
w2v_question_cv = []
for sentence in tqdm(X_cv['question'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_question_cv.append(vector)

print(len(w2v_question_cv))
print(len(w2v_question_cv[0]))
print(type(w2v_question_cv))

100%|██████████| 973/973 [00:01<00:00, 534.64it/s]

973
300
<class 'list'>





In [165]:
# Using Word2vec to make vector of word association for every answer in train data
w2v_answer_train = []
for sentence in tqdm(X_train['answer'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_answer_train.append(vector)

print(len(w2v_answer_train))
print(len(w2v_answer_train[0]))
print(type(w2v_answer_train))

100%|██████████| 3890/3890 [00:06<00:00, 563.46it/s]

3890
300
<class 'list'>





In [166]:
# Using Word2vec to make vector of word association for every answer in test data
w2v_answer_test = []
for sentence in tqdm(X_test['answer'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_answer_test.append(vector)

print(len(w2v_answer_test))
print(len(w2v_answer_test[0]))
print(type(w2v_answer_test))

100%|██████████| 1216/1216 [00:02<00:00, 541.12it/s]

1216
300
<class 'list'>





In [167]:
# Using Word2vec to make vector of word association for every answer in validation data
w2v_answer_cv = []
for sentence in tqdm(X_cv['answer'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_answer_cv.append(vector)

print(len(w2v_answer_cv))
print(len(w2v_answer_cv[0]))
print(type(w2v_answer_cv))

100%|██████████| 973/973 [00:01<00:00, 551.28it/s]

973
300
<class 'list'>





In [168]:
# Making multidimensional array using above calculated data for train, test and validation data
# The data used in multidimensional array is Word2vec output for question and answer and encoded host and category

X_tr=np.hstack((w2v_question_train,w2v_answer_train,np.array(train_host_encoded).reshape(-1,1),np.array(train_cat_encoded).reshape(-1,1)))

X_te=np.hstack((w2v_question_test,w2v_answer_test,np.array(test_host_encoded).reshape(-1,1),np.array(test_cat_encoded).reshape(-1,1)))

X_cv=np.hstack((w2v_question_cv,w2v_answer_cv,np.array(cv_host_encoded).reshape(-1,1),np.array(cv_cat_encoded).reshape(-1,1)))

In [23]:
y_train=y_train.values
y_test=y_test.values
y_cv=y_cv.values

In [24]:
# Creating and training model also validating model with validation data
model = Sequential()

model.add(Dense(12, activation='relu', input_shape=(602,)))

model.add(Dense(8, activation='relu'))

model.add(Dense(30, activation='softmax'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.fit(X_tr, y_train,epochs=20, batch_size=1, verbose=1,validation_data=(X_cv,y_cv))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f72ce18e250>

In [25]:
# Evaluating model
model.evaluate(X_te,y_test)



[0.42164620757102966, 0.0008223684271797538]

In [26]:
# Loading new unseen data for testing
test=pd.read_csv('./data/test.csv')
test.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,Will leaving corpses lying around upset my pri...,I see questions/information online about how t...,Dylan,https://gaming.stackexchange.com/users/64471,There is no consequence for leaving corpses an...,Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,CULTURE,gaming.stackexchange.com
1,46,Url link to feature image in the portfolio,I am new to Wordpress. i have issue with Featu...,Anu,https://wordpress.stackexchange.com/users/72927,I think it is possible with custom fields.\n\n...,Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,wordpress.stackexchange.com
2,70,"Is accuracy, recoil or bullet spread affected ...","To experiment I started a bot game, toggled in...",Konsta,https://gaming.stackexchange.com/users/37545,You do not have armour in the screenshots. Thi...,Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,CULTURE,gaming.stackexchange.com
3,132,Suddenly got an I/O error from my external HDD,I have used my Raspberry Pi as a torrent-serve...,robbannn,https://raspberrypi.stackexchange.com/users/17341,Your Western Digital hard drive is disappearin...,HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,TECHNOLOGY,raspberrypi.stackexchange.com
4,200,Passenger Name - Flight Booking Passenger only...,I have bought Delhi-London return flights for ...,Amit,https://travel.stackexchange.com/users/29089,I called two persons who work for Saudia (tick...,Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,CULTURE,travel.stackexchange.com


In [27]:
target=['question_asker_intent_understanding','question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking','question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self','question_multi_intent',
       'question_not_really_a_question','question_opinion_seeking', 'question_type_choice',
       'question_type_compare','question_type_consequence','question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure','question_type_reason_explanation',
       'question_type_spelling','question_well_written',
       'answer_helpful','answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions','answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written']

In [28]:
# Dropping irrelevant columns from data
drop_cols=['question_user_name','question_user_page','answer_user_name','answer_user_page','url']
test.drop(drop_cols,inplace=True,axis=1)
test['question']=test['question_title']+test['question_body']
test.drop(['question_title','question_body'],axis=1,inplace=True)

In [29]:
# Processing qustions and answers removing all special characters.
import re
questions=[]
answers=[]
for sent in test['question']:
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    questions.append(sent.strip())
for sent in test['answer']:
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    answers.append(sent.strip())

test['questions']=questions
test['answers']=answers

In [31]:
cols=['question','answer', 'category', 'host']

# Appending index of host from host list to another array for future use using test data.
test_host_encoded=[]
for i in test['host']:
    if i in host_list:
        test_host_encoded.append(host_list.index(i))
    else:
        test_host_encoded.append(-1)

# Appending index of category from category list to another array for future use using test data.
test_cat_encoded=[]
for i in test['category']:
    if i in cat_list:
        test_cat_encoded.append(cat_list.index(i))

In [32]:
# Using Word2vec to make vector of word association for every question in test data.
w2v_question_test = []
for sentence in tqdm(test['question'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_question_test.append(vector)

print(len(w2v_question_test))
print(len(w2v_question_test[0]))
print(type(w2v_question_test))

100%|██████████| 476/476 [00:00<00:00, 615.79it/s]

476
300
<class 'list'>





In [33]:
# Using Word2vec to make vector of word association for every answer in test data.
w2v_answer_test = []
for sentence in tqdm(test['answer'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_answer_test.append(vector)

print(len(w2v_answer_test))
print(len(w2v_answer_test[0]))
print(type(w2v_answer_test))

100%|██████████| 476/476 [00:00<00:00, 518.66it/s]

476
300
<class 'list'>





In [34]:
# Configuring input data
X_te=np.hstack((w2v_question_test,w2v_answer_test,np.array(test_host_encoded).reshape(-1,1),np.array(test_cat_encoded).reshape(-1,1)))

In [35]:
# Make predicion using test data.
y_pred=model.predict(X_te)
y_pred

array([[0.07252132, 0.01276929, 0.00051476, ..., 0.00138883, 0.00871465,
        0.08666196],
       [0.07252132, 0.01276929, 0.00051476, ..., 0.00138883, 0.00871465,
        0.08666196],
       [0.07252132, 0.01276929, 0.00051476, ..., 0.00138883, 0.00871465,
        0.08666196],
       ...,
       [0.07252132, 0.01276929, 0.00051476, ..., 0.00138883, 0.00871465,
        0.08666196],
       [0.07252132, 0.01276929, 0.00051476, ..., 0.00138883, 0.00871465,
        0.08666196],
       [0.07252132, 0.01276929, 0.00051476, ..., 0.00138883, 0.00871465,
        0.08666196]], dtype=float32)

In [36]:
df = pd.DataFrame(y_pred,columns=target)
df['qa_id']=test['qa_id']

df=df[['qa_id','question_asker_intent_understanding', 'question_body_critical',
       'question_conversational', 'question_expect_short_answer',
       'question_fact_seeking', 'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions',
       'answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written']]

In [44]:
# Output of model.
df

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.072521,0.012769,0.000515,0.020633,0.030843,0.033643,0.012498,0.008941,0.002703,...,0.035954,0.103163,0.016429,0.199131,0.267494,0.050539,0.008262,0.001389,0.008715,0.086662
1,46,0.072521,0.012769,0.000515,0.020633,0.030843,0.033643,0.012498,0.008941,0.002703,...,0.035954,0.103163,0.016429,0.199131,0.267494,0.050539,0.008262,0.001389,0.008715,0.086662
2,70,0.072521,0.012769,0.000515,0.020633,0.030843,0.033643,0.012498,0.008941,0.002703,...,0.035954,0.103163,0.016429,0.199131,0.267494,0.050539,0.008262,0.001389,0.008715,0.086662
3,132,0.072521,0.012769,0.000515,0.020633,0.030843,0.033643,0.012498,0.008941,0.002703,...,0.035954,0.103163,0.016429,0.199131,0.267494,0.050539,0.008262,0.001389,0.008715,0.086662
4,200,0.072521,0.012769,0.000515,0.020633,0.030843,0.033643,0.012498,0.008941,0.002703,...,0.035954,0.103163,0.016429,0.199131,0.267494,0.050539,0.008262,0.001389,0.008715,0.086662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,9569,0.072521,0.012769,0.000515,0.020633,0.030843,0.033643,0.012498,0.008941,0.002703,...,0.035954,0.103163,0.016429,0.199131,0.267494,0.050539,0.008262,0.001389,0.008715,0.086662
472,9590,0.072521,0.012769,0.000515,0.020633,0.030843,0.033643,0.012498,0.008941,0.002703,...,0.035954,0.103163,0.016429,0.199131,0.267494,0.050539,0.008262,0.001389,0.008715,0.086662
473,9597,0.072521,0.012769,0.000515,0.020633,0.030843,0.033643,0.012498,0.008941,0.002703,...,0.035954,0.103163,0.016429,0.199131,0.267494,0.050539,0.008262,0.001389,0.008715,0.086662
474,9623,0.072521,0.012769,0.000515,0.020633,0.030843,0.033643,0.012498,0.008941,0.002703,...,0.035954,0.103163,0.016429,0.199131,0.267494,0.050539,0.008262,0.001389,0.008715,0.086662
