# Data preprocessing 

In [1]:
import numpy as np
import pandas as pd
import time

## Try different preprocessing methods

In [30]:
def preprocess1(file_path):
    """Read a tab-separated link table and relabel page ids (variant 1).

    Only the 'page_id_from' and 'page_id_to' columns are kept.  Every
    distinct id found in either column is placed in a pandas Series (in
    the iteration order of a Python set), and each id in the table is
    replaced by its position in that Series.

    The position of each id is found with a linear scan of the Series
    for every row; this is the baseline approach in the timing
    comparison.

    Parameters
    ----------
    file_path : path of the tab-separated input file.

    Returns
    -------
    DataFrame with the two id columns relabelled to 0..n-1.
    """
    edges = pd.read_csv(file_path, sep='\t')
    edges = edges[['page_id_from', 'page_id_to']]  # keep only the id columns

    # Lookup table: one entry per distinct page id from both columns.
    targets = edges['page_id_to'].unique()
    sources = edges['page_id_from'].unique()
    distinct_ids = list(set(np.concatenate((targets, sources), axis=None)))
    id_table = pd.Series(distinct_ids)

    def position_of(page_id):
        # First (and only) index at which the id occurs in the table.
        return np.where(id_table == page_id)[0][0]

    relabeled_from = np.array(
        [position_of(page_id) for page_id in edges['page_id_from'].tolist()]
    )
    relabeled_to = np.array(
        [position_of(page_id) for page_id in edges['page_id_to'].tolist()]
    )
    edges['page_id_from'] = relabeled_from
    edges['page_id_to'] = relabeled_to
    return edges

def preprocess2(file_path):
    """Read a tab-separated link table and relabel page ids (variant 2).

    Only the 'page_id_from' and 'page_id_to' columns are kept.  The id
    table is a NumPy array holding the distinct 'page_id_from' values in
    order of first appearance, followed by the distinct 'page_id_to'
    values that never occur as a source.  Each id is replaced by its
    position in that array, found with a linear scan per row.

    Parameters
    ----------
    file_path : path of the tab-separated input file.

    Returns
    -------
    DataFrame with the two id columns relabelled to 0..n-1.
    """
    edges = pd.read_csv(file_path, sep='\t')
    edges = edges[['page_id_from', 'page_id_to']]

    # Distinct sources first, then targets that are not also sources.
    source_ids = edges['page_id_from'].unique()
    target_col = edges['page_id_to']
    target_only_ids = target_col[~target_col.isin(source_ids)].unique()
    id_table = np.concatenate((source_ids, target_only_ids), axis=None)

    def position_of(page_id):
        # Index of the id inside the lookup array.
        return np.where(id_table == page_id)[0][0]

    relabeled_from = np.array(
        [position_of(page_id) for page_id in edges['page_id_from'].tolist()]
    )
    relabeled_to = np.array(
        [position_of(page_id) for page_id in edges['page_id_to'].tolist()]
    )
    edges['page_id_from'] = relabeled_from
    edges['page_id_to'] = relabeled_to
    return edges

def preprocess3(file_path):
    """Read a tab-separated link table and relabel page ids (variant 3).

    Only the 'page_id_from' and 'page_id_to' columns are kept.  The id
    table is a NumPy array holding the distinct 'page_id_from' values in
    order of first appearance, followed by the distinct 'page_id_to'
    values that never occur as a source.  The relabelling is applied
    with ``Series.map``, scanning the array once per row.

    Parameters
    ----------
    file_path : path of the tab-separated input file.

    Returns
    -------
    DataFrame with the two id columns relabelled to 0..n-1.
    """
    edges = pd.read_csv(file_path, sep='\t')
    edges = edges[['page_id_from', 'page_id_to']]

    # Distinct sources first, then targets that are not also sources.
    source_ids = edges['page_id_from'].unique()
    is_new_target = ~edges['page_id_to'].isin(source_ids)
    target_only_ids = edges['page_id_to'][is_new_target].unique()
    id_table = np.concatenate((source_ids, target_only_ids), axis=None)

    def position_of(page_id):
        # Index of the id inside the lookup array.
        return np.where(id_table == page_id)[0][0]

    for column in ('page_id_from', 'page_id_to'):
        edges[column] = edges[column].map(position_of)
    return edges

def preprocess4(file_path):
    """Read a tab-separated link table and relabel page ids (variant 4).

    Only the 'page_id_from' and 'page_id_to' columns are kept.  New ids
    are assigned in this order: the distinct 'page_id_from' values in
    order of first appearance, followed by the distinct 'page_id_to'
    values that never occur as a source.  The relabelling uses a
    dictionary from original id to new id instead of scanning an array
    for every row.

    Parameters
    ----------
    file_path : path of the tab-separated input file.

    Returns
    -------
    DataFrame with the two id columns relabelled to 0..n-1.
    """
    df = pd.read_csv(file_path, sep='\t')
    df = df[['page_id_from', 'page_id_to']]

    # Distinct sources first, then targets that are not also sources.
    uniqueIds1 = df['page_id_from'].unique()
    uniqueIds2 = df['page_id_to'][~df['page_id_to'].isin(uniqueIds1)].unique()
    uniqueIds = np.concatenate((uniqueIds1, uniqueIds2), axis=None)

    # Original id -> consecutive integer.
    dic = {v: k for k, v in enumerate(uniqueIds)}

    # Hand the dict straight to Series.map: pandas performs the lookup
    # itself instead of calling a Python lambda once per row.  Every id
    # in both columns is a key of `dic` by construction.
    df['page_id_from'] = df['page_id_from'].map(dic)
    df['page_id_to'] = df['page_id_to'].map(dic)
    return df

In [2]:
# Input file passed to the preprocessing functions in the timing cells below.
file_path = "wikilink_graph.2002-03-01.csv"

In [20]:
# Time preprocess1 together with writing its result to 'graph_small.csv'.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
df = preprocess1(file_path)
df.to_csv('graph_small.csv', index=False)
print(time.perf_counter() - t0)  # elapsed seconds

0.5937988758087158


In [21]:
# Time preprocess2 together with writing its result to 'graph_small.csv'.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
df = preprocess2(file_path)
df.to_csv('graph_small.csv', index=False)
print(time.perf_counter() - t0)  # elapsed seconds

0.06493592262268066


In [22]:
# Time preprocess3 together with writing its result to 'graph_small.csv'.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
df = preprocess3(file_path)
df.to_csv('graph_small.csv', index=False)
print(time.perf_counter() - t0)  # elapsed seconds

0.06723690032958984


In [23]:
# Time preprocess4 together with writing its result to 'graph_small.csv'.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump when the system clock is adjusted.
t0 = time.perf_counter()
df = preprocess4(file_path)
df.to_csv('graph_small.csv', index=False)
print(time.perf_counter() - t0)  # elapsed seconds

0.05719113349914551


**After applying the different methods to the small dataset, we will use the 4th method, which takes the least amount of time.**

## Save the prepared data to disk

In [24]:
# Input file for the final run; its preprocessed form is saved in the next cell.
file_path = "wikilink_graph.2004-03-01.csv"

In [33]:
# Relabel the page ids with the dictionary-based method and write the
# resulting two-column table to 'graph.csv' without the row index.
df = preprocess4(file_path)
df.to_csv(path_or_buf='graph.csv', index=False)