In [122]:
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import hstack
from keras.layers import Dense
from gensim.models import Word2Vec
from keras.models import Sequential
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [123]:
# Load data from csv file
data = pd.read_csv('./data/train.csv')

In [124]:
data.columns

Index(['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfa

In [125]:
data.isna().any()

qa_id                                    False
question_title                           False
question_body                            False
question_user_name                       False
question_user_page                       False
answer                                   False
answer_user_name                         False
answer_user_page                         False
url                                      False
category                                 False
host                                     False
question_asker_intent_understanding      False
question_body_critical                   False
question_conversational                  False
question_expect_short_answer             False
question_fact_seeking                    False
question_has_commonly_accepted_answer    False
question_interestingness_others          False
question_interestingness_self            False
question_multi_intent                    False
question_not_really_a_question           False
question_opin

In [126]:
target=['question_asker_intent_understanding','question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking','question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self','question_multi_intent',
       'question_not_really_a_question','question_opinion_seeking', 'question_type_choice',
       'question_type_compare','question_type_consequence','question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure','question_type_reason_explanation',
       'question_type_spelling','question_well_written',
       'answer_helpful','answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions','answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written']

drop_cols=['qa_id','question_user_name','question_user_page','answer_user_name','answer_user_page','url']

In [127]:
data.drop(drop_cols,inplace=True,axis=1)
data['question']=data['question_title']+data['question_body']
data.drop(['question_title','question_body'],axis=1,inplace=True)
data.columns

Index(['answer', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions',
       'answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written', 'question'],
    

In [128]:
data=data[['question','answer', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions',
       'answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written']]

In [129]:
import re
questions=[]
answers=[]
for sent in data['question']:
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    questions.append(sent.strip())
for sent in data['answer']:
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    answers.append(sent.strip())

In [130]:
data['questions']=questions
data['answers']=answers
data.head(2)

Unnamed: 0,question,answer,category,host,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,...,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written,questions,answers
0,What am I losing when using extension tubes in...,"I just got extension tubes, so here's the skin...",LIFE_ARTS,photo.stackexchange.com,1.0,0.333333,0.0,0.0,0.0,0.0,...,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0,What am I losing when using extension tubes in...,I just got extension tubes so here s the skinn...
1,What is the distinction between a city and a s...,It might be helpful to look into the definitio...,CULTURE,rpg.stackexchange.com,1.0,1.0,0.0,0.5,1.0,1.0,...,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889,What is the distinction between a city and a s...,It might be helpful to look into the definitio...


In [131]:
cols=['question','answer', 'category', 'host']

In [132]:
Y=data[target]
data.drop(target,axis=1,inplace=True)
X_train,X_test,y_train,y_test=train_test_split(data,Y,test_size=0.20)
X_train,X_cv,y_train,y_cv=train_test_split(X_train,y_train,test_size=0.20)
host_list=list(X_train['host'].value_counts().index)
len(host_list)

63

In [133]:
train_host_encoded=[]
for i in X_train['host']:
    if i in host_list:
        train_host_encoded.append(host_list.index(i))
    else:
        train_host_encoded.append(-1)
        
test_host_encoded=[]
for i in X_test['host']:
    if i in host_list:
        test_host_encoded.append(host_list.index(i))
    else:
        test_host_encoded.append(-1)

cv_host_encoded=[]
for i in X_cv['host']:
    if i in host_list:
        cv_host_encoded.append(host_list.index(i))
    else:
        cv_host_encoded.append(-1)
        
cat_list=list(X_train['category'].value_counts().index)
len(cat_list)

5

In [134]:
train_cat_encoded=[]
for i in X_train['category']:
    if i in cat_list:
        train_cat_encoded.append(cat_list.index(i))
    else:
        train_cat_encoded.append(-1)

test_cat_encoded=[]
for i in X_test['category']:
    if i in cat_list:
        test_cat_encoded.append(cat_list.index(i))
    else:
        test_cat_encoded.append(-1)

cv_cat_encoded=[]
for i in X_cv['category']:
    if i in cat_list:
        cv_cat_encoded.append(cat_list.index(i))
    else:
        cv_cat_encoded.append(-1)

In [135]:
word_corpus=[]
for sentence in data['question']:
    sent=[]
    for w in sentence.split():
        sent.append(w)
    word_corpus.append(sent)
for sentence in data['answer']:
    sent=[]
    for w in sentence.split():
        sent.append(w)
    word_corpus.append(sent)

len(word_corpus)

12158

In [136]:
w2vmodel=Word2Vec(word_corpus, min_count=4,size=300)

In [137]:
w2v_question_train = []
for sentence in tqdm(X_train['question'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_question_train.append(vector)

print(len(w2v_question_train))
print(len(w2v_question_train[0]))
print(type(w2v_question_train))

100%|██████████| 3890/3890 [00:07<00:00, 551.74it/s]

3890
300
<class 'list'>





In [138]:
w2v_question_test = []
for sentence in tqdm(X_test['question'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_question_test.append(vector)

print(len(w2v_question_test))
print(len(w2v_question_test[0]))
print(type(w2v_question_test))

100%|██████████| 1216/1216 [00:02<00:00, 547.16it/s]

1216
300
<class 'list'>





In [139]:
w2v_question_cv = []
for sentence in tqdm(X_cv['question'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_question_cv.append(vector)

print(len(w2v_question_cv))
print(len(w2v_question_cv[0]))
print(type(w2v_question_cv))

100%|██████████| 973/973 [00:01<00:00, 563.07it/s]

973
300
<class 'list'>





In [140]:
w2v_answer_train = []
for sentence in tqdm(X_train['answer'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_answer_train.append(vector)

print(len(w2v_answer_train))
print(len(w2v_answer_train[0]))
print(type(w2v_answer_train))

100%|██████████| 3890/3890 [00:06<00:00, 563.87it/s]

3890
300
<class 'list'>





In [141]:
w2v_answer_test = []
for sentence in tqdm(X_test['answer'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_answer_test.append(vector)

print(len(w2v_answer_test))
print(len(w2v_answer_test[0]))
print(type(w2v_answer_test))

100%|██████████| 1216/1216 [00:02<00:00, 508.84it/s]

1216
300
<class 'list'>





In [142]:
w2v_answer_cv = []
for sentence in tqdm(X_cv['answer'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_answer_cv.append(vector)

print(len(w2v_answer_cv))
print(len(w2v_answer_cv[0]))
print(type(w2v_answer_cv))

100%|██████████| 973/973 [00:01<00:00, 540.94it/s]

973
300
<class 'list'>





In [143]:
X_tr=np.hstack((w2v_question_train,w2v_answer_train,np.array(train_host_encoded).reshape(-1,1),np.array(train_cat_encoded).reshape(-1,1)))

X_te=np.hstack((w2v_question_test,w2v_answer_test,np.array(test_host_encoded).reshape(-1,1),np.array(test_cat_encoded).reshape(-1,1)))

X_cv=np.hstack((w2v_question_cv,w2v_answer_cv,np.array(cv_host_encoded).reshape(-1,1),np.array(cv_cat_encoded).reshape(-1,1)))

In [144]:
y_train=y_train.values
y_test=y_test.values
y_cv=y_cv.values

In [145]:
model = Sequential()

model.add(Dense(12, activation='relu', input_shape=(602,)))

model.add(Dense(8, activation='relu'))

model.add(Dense(30, activation='softmax'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.fit(X_tr, y_train,epochs=20, batch_size=1, verbose=1,validation_data=(X_cv,y_cv))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fbd4b83b190>

In [146]:
model.evaluate(X_te,y_test)



[0.42595189809799194, 0.003289473708719015]

In [147]:
test=pd.read_csv('./data/test.csv')
test.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,Will leaving corpses lying around upset my pri...,I see questions/information online about how t...,Dylan,https://gaming.stackexchange.com/users/64471,There is no consequence for leaving corpses an...,Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,CULTURE,gaming.stackexchange.com
1,46,Url link to feature image in the portfolio,I am new to Wordpress. i have issue with Featu...,Anu,https://wordpress.stackexchange.com/users/72927,I think it is possible with custom fields.\n\n...,Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,wordpress.stackexchange.com
2,70,"Is accuracy, recoil or bullet spread affected ...","To experiment I started a bot game, toggled in...",Konsta,https://gaming.stackexchange.com/users/37545,You do not have armour in the screenshots. Thi...,Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,CULTURE,gaming.stackexchange.com
3,132,Suddenly got an I/O error from my external HDD,I have used my Raspberry Pi as a torrent-serve...,robbannn,https://raspberrypi.stackexchange.com/users/17341,Your Western Digital hard drive is disappearin...,HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,TECHNOLOGY,raspberrypi.stackexchange.com
4,200,Passenger Name - Flight Booking Passenger only...,I have bought Delhi-London return flights for ...,Amit,https://travel.stackexchange.com/users/29089,I called two persons who work for Saudia (tick...,Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,CULTURE,travel.stackexchange.com


In [148]:
target=['question_asker_intent_understanding','question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking','question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self','question_multi_intent',
       'question_not_really_a_question','question_opinion_seeking', 'question_type_choice',
       'question_type_compare','question_type_consequence','question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure','question_type_reason_explanation',
       'question_type_spelling','question_well_written',
       'answer_helpful','answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions','answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written']

In [149]:
drop_cols=['question_user_name','question_user_page','answer_user_name','answer_user_page','url']
test.drop(drop_cols,inplace=True,axis=1)
test['question']=test['question_title']+test['question_body']
test.drop(['question_title','question_body'],axis=1,inplace=True)

In [150]:
import re
questions=[]
answers=[]
for sent in test['question']:
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    questions.append(sent.strip())
for sent in test['answer']:
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    answers.append(sent.strip())

In [151]:
test['questions']=questions
test['answers']=answers

In [152]:
cols=['question','answer', 'category', 'host']

test_host_encoded=[]
for i in test['host']:
    if i in host_list:
        test_host_encoded.append(host_list.index(i))
    else:
        test_host_encoded.append(-1)

test_cat_encoded=[]
for i in test['category']:
    if i in cat_list:
        test_cat_encoded.append(cat_list.index(i))

In [153]:
w2v_question_test = []
for sentence in tqdm(test['question'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_question_test.append(vector)

print(len(w2v_question_test))
print(len(w2v_question_test[0]))
print(type(w2v_question_test))

100%|██████████| 476/476 [00:01<00:00, 450.29it/s]

476
300
<class 'list'>





In [154]:
w2v_answer_test = []
for sentence in tqdm(test['answer'].values):
    vector = np.zeros(300)
    for word in sentence.split():
        if word in w2vmodel:
            vector += w2vmodel[word]
    w2v_answer_test.append(vector)

print(len(w2v_answer_test))
print(len(w2v_answer_test[0]))
print(type(w2v_answer_test))

100%|██████████| 476/476 [00:01<00:00, 387.00it/s]

476
300
<class 'list'>





In [155]:
X_te=np.hstack((w2v_question_test,w2v_answer_test,np.array(test_host_encoded).reshape(-1,1),np.array(test_cat_encoded).reshape(-1,1)))

In [156]:
y_pred=model.predict(X_te)
y_pred

array([[0.07298533, 0.0128782 , 0.00048958, ..., 0.0012996 , 0.00836576,
        0.08417019],
       [0.07298533, 0.0128782 , 0.00048958, ..., 0.0012996 , 0.00836576,
        0.08417019],
       [0.07298533, 0.0128782 , 0.00048958, ..., 0.0012996 , 0.00836576,
        0.08417019],
       ...,
       [0.07298533, 0.0128782 , 0.00048958, ..., 0.0012996 , 0.00836576,
        0.08417019],
       [0.07298533, 0.0128782 , 0.00048958, ..., 0.0012996 , 0.00836576,
        0.08417019],
       [0.07298533, 0.0128782 , 0.00048958, ..., 0.0012996 , 0.00836576,
        0.08417019]], dtype=float32)

In [157]:
df = pd.DataFrame(y_pred,columns=target)
df['qa_id']=test['qa_id']

df=df[['qa_id','question_asker_intent_understanding', 'question_body_critical',
       'question_conversational', 'question_expect_short_answer',
       'question_fact_seeking', 'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions',
       'answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written']]

In [158]:
df

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.072985,0.012878,0.00049,0.019576,0.029299,0.032117,0.01207,0.008669,0.00255,...,0.034411,0.106505,0.015846,0.21203,0.262869,0.049896,0.007693,0.0013,0.008366,0.08417
1,46,0.072985,0.012878,0.00049,0.019576,0.029299,0.032117,0.01207,0.008669,0.00255,...,0.034411,0.106505,0.015846,0.21203,0.262869,0.049896,0.007693,0.0013,0.008366,0.08417
2,70,0.072985,0.012878,0.00049,0.019576,0.029299,0.032117,0.01207,0.008669,0.00255,...,0.034411,0.106505,0.015846,0.21203,0.262869,0.049896,0.007693,0.0013,0.008366,0.08417
3,132,0.072985,0.012878,0.00049,0.019576,0.029299,0.032117,0.01207,0.008669,0.00255,...,0.034411,0.106505,0.015846,0.21203,0.262869,0.049896,0.007693,0.0013,0.008366,0.08417
4,200,0.072985,0.012878,0.00049,0.019576,0.029299,0.032117,0.01207,0.008669,0.00255,...,0.034411,0.106505,0.015846,0.21203,0.262869,0.049896,0.007693,0.0013,0.008366,0.08417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,9569,0.072985,0.012878,0.00049,0.019576,0.029299,0.032117,0.01207,0.008669,0.00255,...,0.034411,0.106505,0.015846,0.21203,0.262869,0.049896,0.007693,0.0013,0.008366,0.08417
472,9590,0.072985,0.012878,0.00049,0.019576,0.029299,0.032117,0.01207,0.008669,0.00255,...,0.034411,0.106505,0.015846,0.21203,0.262869,0.049896,0.007693,0.0013,0.008366,0.08417
473,9597,0.072985,0.012878,0.00049,0.019576,0.029299,0.032117,0.01207,0.008669,0.00255,...,0.034411,0.106505,0.015846,0.21203,0.262869,0.049896,0.007693,0.0013,0.008366,0.08417
474,9623,0.072985,0.012878,0.00049,0.019576,0.029299,0.032117,0.01207,0.008669,0.00255,...,0.034411,0.106505,0.015846,0.21203,0.262869,0.049896,0.007693,0.0013,0.008366,0.08417
