In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import random
import pickle
from tqdm import tqdm
from multiprocessing import Pool
# This uses the Answers.csv file from the 10% Stack Overflow data
answer_file = "../Answers.csv"
# This edge list is the intermediate file used for graph building
edges_list_file = "../answer_edges.txt"

In [3]:
benchmark_1_data_file = "output/benchmark_1.txt"
benchmark_2_data_file = "output/benchmark_2.txt"
benchmark_3_data_file = "output/benchmark_3.txt"

question_header = 'q_'
user_header = 'u_'
tag_header = 't_'


## Pre-processing

In [4]:
# loads data with pands, it eats up memory, but parsing with pyspark is much more work
df = pd.read_csv("../Answers.csv", encoding="ISO-8859-1")
df.head(5)

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61.0,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,124,26.0,2008-08-01T16:09:47Z,80,12,<p>I wound up using this. It is a kind of a ha...
2,199,50.0,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...
3,269,91.0,2008-08-01T23:49:57Z,260,4,"<p>Yes, I thought about that, but I soon figur..."
4,307,49.0,2008-08-02T01:49:46Z,260,28,"<p><a href=""http://www.codeproject.com/Article..."


In [5]:
df.shape

(2014516, 6)

In [6]:
# Question_ids and user_ids may overlap, but that does not mean questions are users!!!
# Soln: each question_id += max_user_id
max_user_id = df[['OwnerUserId']].max()
max_user_id

OwnerUserId    7045028.0
dtype: float64

In [7]:
edge_df = df[['OwnerUserId', 'ParentId']]
# 1. drop null values
edge_df = edge_df.dropna()
# 2. make parentIds unique
edge_df = edge_df.assign(newParentId=lambda x: x.ParentId + max(max_user_id))
edge_df = edge_df.drop(['ParentId'], axis=1)
# 3. add weights to edges
edge_df['EdgeWeight'] = 1
# 4. cast the datafraem to int type
edge_df = edge_df.astype('int32')
edge_df.head(2)

Unnamed: 0,OwnerUserId,newParentId,EdgeWeight
0,61,7045118,1
1,26,7045108,1


## Benchmark on similar edges

In [8]:
import random
from collections import Counter
# parameters
n_test_edge = 1000
n_steps = 1000
teleportation_alpha = 0.3
origin_teleport_alpha = 0.7
early_stop_threshold = 20

def load_benchmark_data(benchmark_file, print_first_5_data=True):
    benchmark_data = []
    with open(benchmark_file, 'r') as input_file:
        for line in input_file:
            benchmark_data.append(line.strip().split())
    print(np.array(benchmark_data).shape)
    if print_first_5_data:
        print(benchmark_data[:5])
    return benchmark_data



In [9]:
# load benchmark file
b1_data = load_benchmark_data(benchmark_1_data_file)
b2_data = load_benchmark_data(benchmark_2_data_file)
b3_data = load_benchmark_data(benchmark_3_data_file)

(3000, 2)
[['q_16446430', 'u_901048'], ['q_6469070', 'u_6782'], ['q_16391600', 'u_84206'], ['q_35926750', 'u_4564247'], ['q_30911550', 'u_1501794']]
(3000, 2)
[['q_728360', 'u_161515'], ['q_3662410', 'u_107612'], ['q_880230', 'u_1059268'], ['q_832620', 'u_547185'], ['q_58190', 'u_3833113']]
(1925, 2)
[['q_6130', 'u_267'], ['q_8050', 'u_905'], ['q_9750', 'u_267'], ['q_9750', 'u_13'], ['q_9750', 'u_13']]


## Baseline in time

In [None]:
df["CreationDate"] = pd.to_datetime(df["CreationDate"])
df["ParentId"] = df["ParentId"] + 7053078
df.head(2)

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61.0,2008-08-01 14:45:37,7053168,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,124,26.0,2008-08-01 16:09:47,7053158,12,<p>I wound up using this. It is a kind of a ha...


In [None]:
for idx, curr_bench_data in enumerate([b1_data, b2_data, b3_data]):
    result = []
    df_copy = df.copy()
    for question, user in tqdm(curr_bench_data):
        suggestions = []
        question_temp = int(question[2:]) + 7053078
        user_temp = int(user[2:])

        df_user = df[(df.OwnerUserId == user_temp)].sort_values(by=['CreationDate'], ascending = False)
        curr_df = df[(df.OwnerUserId == user_temp) & (df.ParentId == question_temp)]
        curr_time = list(curr_df["CreationDate"])[0]

        df_user["filter_time"] = curr_time
        df_user = df_user[df_user.filter_time > df_user.CreationDate]
        if df_user.shape[0] >= 1:
            pre_question = list(df_user["ParentId"])[0]
            pre_time = list(df_user["CreationDate"])[0]
            
            df_copy["filter_time"] = pre_time 
            df_copy = df_copy[df_copy.CreationDate > df_copy.filter_time]
            df_copy["diff_time"] = df_copy["CreationDate"]  - df_copy["filter_time"]
            df_result = df_copy.drop_duplicates(["ParentId"])
            df_result = df_result.sort_values(by=["diff_time"])
            suggestions = list(df_result["ParentId"])[1:101]

        result.append((user, question, suggestions))
        
    save_to = "time_based_baseline" + str(idx +1) + ".pk1"
    with open(save_to, 'wb') as handle:
        pickle.dump(result, handle, protocol=pickle.HIGHEST_PROTOCOL)

 55%|█████▍    | 1644/3000 [00:37<00:24, 54.32it/s]

In [16]:
count = 0
for e in result:
    if int(e[1][2:]) + 7053078 in e[2]:
        count += 1

In [19]:
count/1000

0.4072398190045249

In [18]:
result[4][2] == result[5][2] 

True

In [None]:
result 