In [13]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import sys
import copy

question_path = "bytecup2016data/question_info.txt"
user_path = "bytecup2016data/user_info.txt"
invited_info_path = "bytecup2016data/invited_info_train.txt"

q_column_names = ['q_id', 'q_tag', 'q_word_seq', 'q_char_seq', 'q_no_upvotes', 'q_no_answers', 'q_no_quality_answers']
u_column_names = ['u_id','e_expert_tags', 'e_desc_word_seq', 'e_desc_char_seq']
train_info_column_names = ['q_id','u_id','answered']

question_dataframe = pd.read_csv(question_path, names=q_column_names, sep = '\t')
user_dataframe = pd.read_csv(user_path, names = u_column_names, sep = '\t')
train_info_dataframe = pd.read_csv(invited_info_path, names = train_info_column_names, sep = '\t')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

all_word_desc_list = question_dataframe['q_word_seq'].tolist() + user_dataframe['e_desc_word_seq'].tolist()
all_char_desc_list = question_dataframe['q_char_seq'].tolist() + user_dataframe['e_desc_char_seq'].tolist()
all_topics_list = question_dataframe['q_tag'].tolist() + user_dataframe['e_expert_tags'].tolist()

word_vocabulary = set([word for sent in all_word_desc_list for word in str(sent).split('/')])
#possible inconsistency in the data ---- char seq np.nan found !
char_vocabulary = set([char for sent in all_char_desc_list for char in str(sent).split('/')])
topic_vocabulary = set([char for sent in all_topics_list for char in str(sent).split('/')])
print "Size of the word vocabulary :", len(word_vocabulary)
print "Size of the char vocabulary :", len(char_vocabulary)
print "Number of topics : ", len(topic_vocabulary)

cv_word = CountVectorizer(vocabulary=word_vocabulary, token_pattern=u'(?u)\\b\\w+\\b')
cv_char= CountVectorizer(vocabulary=char_vocabulary, token_pattern=u'(?u)\\b\\w+\\b')
cv_topic = CountVectorizer(vocabulary=topic_vocabulary, token_pattern=u'(?u)\\b\\w+\\b')

test_question_word_seq = '284 21 285 286 323223'
test_question_char_seq = '373 155 33 34 374 25 113 73'
print cv_word.fit_transform([test_question_word_seq]).shape
print cv_char.fit_transform([test_question_char_seq]).shape

Size of the word vocabulary : 37811
Size of the char vocabulary : 4023
Number of topics :  143
(1, 37811)
(1, 4023)


In [15]:
from scipy.sparse import csr_matrix, hstack, vstack

In [16]:
q_dict = dict()

In [17]:
for idx, entry in question_dataframe.iterrows():
    q_dict[entry['q_id']] = csr_matrix(hstack([cv_word.fit_transform([" ".join(entry['q_word_seq'].split('/'))]), [entry['q_tag']]]))

In [18]:
u_dict = dict()

In [19]:
for idx, entry in user_dataframe.iterrows():
    u_dict[entry['u_id']] = csr_matrix(hstack([cv_word.fit_transform([" ".join(entry['e_desc_word_seq'].split('/'))]), cv_topic.fit_transform([" ".join(entry['e_expert_tags'].split('/'))])]))

In [20]:
len(u_dict)

28763

In [21]:
X = list()
tempX = list()
y = list()

In [22]:
for idx, entry in train_info_dataframe.iterrows():
    tempX.append(csr_matrix(hstack([q_dict[entry['q_id']], u_dict[entry['u_id']]])))
    y.append(entry['answered'])

In [23]:
X = csr_matrix(vstack(tempX))

In [24]:
X

<245752x75766 sparse matrix of type '<type 'numpy.int64'>'
	with 4700037 stored elements in Compressed Sparse Row format>

In [25]:
from sklearn.naive_bayes import GaussianNB

In [26]:
gnb = GaussianNB()

In [None]:
gnb_model = gnb.fit(csr_matrix(X).toarray(), y)