In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk
%matplotlib inline

In [12]:
# Data directory (relative to the notebook) and the individual input files.
BASE  = './bytecup2016data'
IINFO = BASE + '/invited_info_train.txt'
QINFO = BASE + '/question_info.txt'
UINFO = BASE + '/user_info.txt'
VAL   = BASE + '/validate_nolabel.txt'

# The three training files are read as whitespace-delimited with no header row,
# so the column names are supplied explicitly.
# `sep=r"\s+"` is the documented equivalent of `delim_whitespace=True`, which is
# deprecated in recent pandas releases.
invdata = pd.read_csv(IINFO, sep=r"\s+", header=None, names=["qid", "uid", "answered"])
qdata   = pd.read_csv(QINFO, sep=r"\s+", header=None, names=["qid", "qtag", "wseq", "cseq", "nvotes", "nans", "ntqans"])
udata   = pd.read_csv(UINFO, sep=r"\s+", header=None, names=["uid", "exptag", "wseq", "cseq"])
# The validation file is read with pandas defaults (comma-separated, first row is the header).
valdata = pd.read_csv(VAL)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Process the qdata

def tokenize(text):
    """Split a "/"-delimited sequence string into its individual tokens.

    Used as the ``tokenizer`` callable of the ``CountVectorizer`` instances
    below. Empty fields are kept, e.g. ``"/"`` yields two empty-string tokens.
    """
    tokens = text.split("/")
    return tokens

# Convert the character sequence column into a bag of words kind of vector
# Refer: http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
cseq_vec = CountVectorizer(tokenizer=tokenize)
cseq_matrix = cseq_vec.fit_transform(qdata.cseq).toarray()

# Do 1-of-K encoding for tags (tags are cast to str so the vectorizer can tokenize them)
qtags = qdata["qtag"].apply(str)
qtag_vec = CountVectorizer(tokenizer=tokenize)
qtag_matrix = qtag_vec.fit_transform(qtags).toarray()

# Convert the numpy arrays to dataframes.
# Each block gets its own prefixed, string-typed column names: with the default
# integer labels both frames would be numbered 0..k, so the merged table would
# contain duplicate labels of mixed int/str type, which makes column selection
# ambiguous and is rejected as feature names by recent scikit-learn releases.
# The index is taken from qdata so the column-wise concat below aligns row by row.
cseq_pd = pd.DataFrame(
    cseq_matrix,
    index=qdata.index,
    columns=["cseq_%d" % i for i in range(cseq_matrix.shape[1])],
)
qtag_pd = pd.DataFrame(
    qtag_matrix,
    index=qdata.index,
    columns=["qtag_%d" % i for i in range(qtag_matrix.shape[1])],
)

# Merge: question id, both encoded blocks, then the three numeric question statistics
proc_qdata = pd.concat([qdata.qid, cseq_pd, qtag_pd, qdata.nvotes, qdata.nans, qdata.ntqans], axis = 1)

In [15]:
# Insert a column in valdata to store the predicted label probabilities.
# DataFrame.insert raises ValueError if the column already exists, so re-running
# this cell (or loading a file that already has a "label" column) would fail;
# in that case the existing column is reset to 0.0 instead.
if "label" in valdata.columns:
    valdata["label"] = 0.0
else:
    valdata.insert(2, "label", value = 0.0)

In [16]:
def prepare_training_data_for_user(uid):
    """Build the training features and labels for a single user.

    Selects the rows of the module-level ``invdata`` whose ``uid`` matches,
    inner-joins them with the module-level ``proc_qdata`` on ``qid``, and
    splits the result into a feature frame and the ``answered`` label column.

    Parameters
    ----------
    uid
        User id to select from ``invdata.uid``.

    Returns
    -------
    (user_train_data, user_train_labels)
        ``user_train_data`` is the merged frame without the identifier columns
        and without ``answered``; ``user_train_labels`` is the ``answered``
        column. Both are empty when the user has no matching rows.
    """
    # Get entries for the user from invited data
    user_invdata = invdata[invdata.uid == uid]
    # Merge with processed qdata to get the training data for the user
    user_data = user_invdata.merge(proc_qdata, on="qid", how="inner")
    # Drop only the non-feature columns that are actually present: "wseq" is not
    # one of the columns named for invdata, and DataFrame.drop raises KeyError
    # for a missing label by default.
    non_feature_cols = [col for col in ("qid", "uid", "wseq") if col in user_data.columns]
    user_data = user_data.drop(non_feature_cols, axis = 1)
    user_train_labels = user_data.answered
    user_train_data = user_data.drop(["answered"], axis = 1)
    return user_train_data, user_train_labels

In [17]:
def get_val_data_for_user(uid):
    """Return the validation rows of one user joined with the question features.

    Selects the rows of the module-level ``valdata`` whose ``uid`` matches and
    inner-joins them with the module-level ``proc_qdata`` on ``qid``. The
    ``qid`` and ``uid`` columns are kept; the ``label`` placeholder (and a
    ``wseq`` column, if one is present) is removed.

    Parameters
    ----------
    uid
        User id to select from ``valdata.uid``.

    Returns
    -------
    pandas.DataFrame
        One row per matching (validation row, question) pair.
    """
    user_valdata = valdata[valdata.uid == uid]
    user_valdata = user_valdata.merge(proc_qdata, on="qid", how="inner")
    # Drop only the columns that exist: DataFrame.drop raises KeyError for a
    # missing label by default, and "wseq" is not a column named for valdata.
    unwanted_cols = [col for col in ("wseq", "label") if col in user_valdata.columns]
    user_valdata = user_valdata.drop(unwanted_cols, axis = 1)
    return user_valdata

In [18]:
from sklearn import linear_model

In [None]:
%%timeit -n 1
# For every user in the validation set: fit a per-user logistic regression when
# the user's invitation history contains more than one distinct label, otherwise
# fall back to a constant prediction, and store the result in valdata.label.
for uid in np.unique(valdata.uid):
    user_rows = valdata.uid == uid
    # Distinct labels this user produced in the invitation log (sorted by np.unique).
    user_unique_labels = np.unique(invdata[invdata.uid == uid].answered)

    # A classifier can only be trained when the user has more than one distinct
    # label and at least one training row; `regr` stays None otherwise, so a
    # model fitted for a previous user can never be reused by accident.
    regr = None
    if len(user_unique_labels) > 1:
        user_train_data, user_train_labels = prepare_training_data_for_user(uid)
        if user_train_data.shape[0] > 0:
            regr = linear_model.LogisticRegression()
            regr.fit(user_train_data, user_train_labels)

    user_val_data = get_val_data_for_user(uid)

    if regr is not None and user_val_data.shape[0] > 0:
        user_val_trimmed_data = user_val_data.drop(["qid", "uid"], axis = 1)
        # Column 1 of predict_proba is taken as the probability of label 1;
        # this assumes the labels are 0/1 -- TODO confirm against the data.
        # The array assignment assumes the merge kept exactly one row per
        # validation row of this user.
        valdata.loc[user_rows, 'label'] = regr.predict_proba(user_val_trimmed_data)[:, 1]
    else:
        # No usable model: predict 1.0 only when the first (smallest) label seen
        # for this user is 1, and 0.0 otherwise (including users with no
        # history). A scalar is broadcast to all of the user's rows, which also
        # covers the case of zero merged validation rows.
        always_answers = len(user_unique_labels) > 0 and user_unique_labels[0] == 1
        valdata.loc[user_rows, 'label'] = 1.0 if always_answers else 0.0
    
    
    

In [10]:
# Write output as CSV.
# index=False keeps the file to the data columns only; the default would also
# write the auto-generated row index as an extra unnamed first column.
# NOTE(review): assumes the consumer of attempt1.csv expects only the valdata
# columns -- confirm against the required submission format.
valdata.to_csv("attempt1.csv", index=False)