## 0b - Make Outlink Inclusion Data Frame
This notebook builds a one-hot encoded dataframe of outlink inclusion (one row per outlink, one column per revision date) to work with in later notebooks.

import:

    00_en_as_rev_outlinks_m.pkl
    00_en_as_rev_time_m.pkl
    00_ar_as_rev_outlinks_m.pkl
    00_ar_as_rev_time_m.pkl
    
output:

    00_outlinkInclusiondf_en.pkl
    00_outlinkInclusiondf_ar.pkl

The Plan:
- get data
- make a unique outlinks dict
- make a list of all of the unique outlinks used throughout time
- go through and create a new dictionary of ones and zeros, one value per entry of everyOutlink
- make a df with everything
- clean up timestamp information and use it as the index
- get rid of revision ids and transpose, switching the axes makes the heat map easier to read
- save data

1. English
2. Arabic

In [10]:
import pandas as pd
import numpy as np
import pickle
import os
import itertools

In [11]:
#function for the code to merge multiple lists

def merge(*lists):
    """Concatenate any number of iterables into a single new list.

    Parameters
    ----------
    *lists : iterable
        Zero or more iterables; their items are taken in order.

    Returns
    -------
    list
        A new list holding every item of every argument, left to right.
        An empty list is returned when no arguments are given.
    """
    return [item for seq in lists for item in seq]

### 1. Make English Dataframe

In [19]:
#get data

#move to Data folder (the pickles below, and the files saved later, use paths relative to it)
os.chdir("../Data")

#load the English revid:outlinks dict
with open("00_en_as_rev_outlinks_m.pkl", 'rb') as outlinksFile:
    outlinks = pickle.load(outlinksFile)

#load the English time/revid object (a later cell reads its .index for the timestamps)
with open("00_en_as_rev_time_m.pkl", 'rb') as timesFile:
    en_freq_revids = pickle.load(timesFile)

In [9]:
#make a unique outlinks dict
#each revision id maps to the sorted, de-duplicated list of its outlinks

keys = outlinks.keys()
outlinksUnique = {rev: np.unique(outlinks[rev]).tolist() for rev in keys}

In [13]:
#make list of all of the unique outlinks used throughout time
#this is only a list of outlinks to be later used for the df and heat map

#flatten every revision's outlinks into one list, then keep the sorted unique values
everyOutlink = np.unique(
    list(itertools.chain.from_iterable(outlinks.values()))
).tolist()

In [15]:
#go through and create a new dictionary of ones and zeros correlating to everyOutlink
#each revision id maps to a list aligned with everyOutlink:
#1 where the revision contains that outlink, 0 where it does not

binaryRevOutlinkDict = {}

for rev, revLinks in outlinksUnique.items():
    #a set gives constant-time membership checks; scanning the list once per
    #entry of everyOutlink made this step needlessly slow on long histories
    revLinkSet = set(revLinks)
    binaryRevOutlinkDict[rev] = [1 if link in revLinkSet else 0 for link in everyOutlink]

In [16]:
#make a df with everything

#make a df: one row per revision id, one 0/1 column per outlink
df = pd.DataFrame.from_dict(binaryRevOutlinkDict, orient='index', columns=everyOutlink )

#make the revision ids a column called 'revs_id'
#the index is named before it is reset, instead of renaming the default 'index'
#column afterwards: when an outlink is itself called 'index', reset_index stores
#the ids under 'level_0' and the old rename relabelled the outlink column instead
df = df.rename_axis('revs_id').reset_index()

In [21]:
#clean up timestamp information
#take just the date part of the en_freq_revids index, attach it as a 'timestamp'
#column, convert that column with pd.to_datetime and make it the index
#NOTE(review): the dates are attached by position, so this assumes the rows of df
#are in the same order as en_freq_revids - confirm against the notebook that builds them
df = (
    df.assign(timestamp=en_freq_revids.index.date)
      .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
      .set_index("timestamp", drop=True)
)


In [None]:
# df.head()

In [22]:
#get rid of revision ids and transpose, switching the axes makes the heat map easier to read
#the first column (revs_id) is skipped by position, then rows and columns are swapped
data = df.iloc[: , 1:].T
data.index.name = None #leave the row axis unnamed

In [24]:
# data.head()

In [25]:
#save data frame
#written relative to the current working directory
with open('00_outlinkInclusiondf_en.pkl', 'wb') as outFile:
    pickle.dump(data, outFile)

### 2. Make Arabic Dataframe

In [26]:
#get data

#no chdir here: the working directory is expected to already be the Data folder,
#which the English section sets with os.chdir("../Data")

#load the Arabic revid:outlinks dict
with open("00_ar_as_rev_outlinks_m.pkl", 'rb') as outlinksFile:
    ar_outlinks = pickle.load(outlinksFile)

#load the Arabic time/revid object (a later cell reads its .index for the timestamps)
with open("00_ar_as_rev_time_m.pkl", 'rb') as timesFile:
    ar_freq_revids = pickle.load(timesFile)

In [27]:
#make a unique outlinks dict
#each revision id maps to the sorted, de-duplicated list of its outlinks

keys = ar_outlinks.keys()
ar_outlinksUnique = {rev: np.unique(ar_outlinks[rev]).tolist() for rev in keys}

In [28]:
#make list of all of the unique outlinks used throughout time
#this is only a list of outlinks to be later used for the df and heat map

#flatten every revision's outlinks into one list, then keep the sorted unique values
ar_everyOutlink = np.unique(
    list(itertools.chain.from_iterable(ar_outlinks.values()))
).tolist()

# len(ar_everyOutlink)

In [29]:
#go through and create a new dictionary of ones and zeros correlating to ar_everyOutlink
#each revision id maps to a list aligned with ar_everyOutlink:
#1 where the revision contains that outlink, 0 where it does not

ar_binaryRevOutlinkDict = {}

for rev, revLinks in ar_outlinksUnique.items():
    #a set gives constant-time membership checks; scanning the list once per
    #entry of ar_everyOutlink made this step needlessly slow on long histories
    revLinkSet = set(revLinks)
    ar_binaryRevOutlinkDict[rev] = [1 if link in revLinkSet else 0 for link in ar_everyOutlink]


In [30]:
#make a df with everything

#make a df: one row per revision id, one 0/1 column per outlink
ar_df = pd.DataFrame.from_dict(ar_binaryRevOutlinkDict, orient='index', columns=ar_everyOutlink )

#make the revision ids a column called 'revs_id'
#the index is named before it is reset, instead of renaming the default 'index'
#column afterwards: when an outlink is itself called 'index', reset_index stores
#the ids under 'level_0' and the old rename relabelled the outlink column instead
ar_df = ar_df.rename_axis('revs_id').reset_index()

# ar_df.head()

In [31]:
#clean up timestamp information
#take just the date part of the ar_freq_revids index, attach it as a 'timestamp'
#column, convert that column with pd.to_datetime and make it the index
#NOTE(review): the dates are attached by position, so this assumes the rows of ar_df
#are in the same order as ar_freq_revids - confirm against the notebook that builds them
ar_df = (
    ar_df.assign(timestamp=ar_freq_revids.index.date)
         .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
         .set_index("timestamp", drop=True)
)


In [34]:
#get rid of revision ids and transpose, switching the axes makes the heat map easier to read
#the first column (revs_id) is skipped by position, then rows and columns are swapped
ar_data = ar_df.iloc[: , 1:].T
ar_data.index.name = None #leave the row axis unnamed


In [35]:
# ar_data.head()

In [33]:
#save data
#written relative to the current working directory
with open('00_outlinkInclusiondf_ar.pkl', 'wb') as outFile:
    pickle.dump(ar_data, outFile)