# Load in data 
+ The data comes as a tar file.
+ I opted to load the files directly into pandas instead of unpacking the tar file in Python.
+ One file, facebook.csv, contains all of the edges.
+ What we need to do is combine the .feat files with the .featnames files.
+ Below you can see what each of these files looks like.

In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Preview one raw .feat file: space-separated values with no header row.
example = pd.read_csv("0.feat", sep=" ", header=None)
print(example.shape)
example.head()


(347, 225)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,215,216,217,218,219,220,221,222,223,224
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Preview the matching .featnames file, read with the same space separator.
example = pd.read_csv("0.featnames", sep=" ", header=None)
print(example.shape)
example.head()

(224, 4)


Unnamed: 0,0,1,2,3
0,0,birthday;anonymized,feature,0
1,1,birthday;anonymized,feature,1
2,2,birthday;anonymized,feature,2
3,3,birthday;anonymized,feature,3
4,4,birthday;anonymized,feature,4


# structure of our data

+  The feature columns of the .feat files (225 columns) correspond, by position, to the rows of the .featnames files (224 rows)
    + The difference of 1 in the shape is because the first column of the .feat file holds the node id rather than a feature value
+ The first goal is to attach the names from .featnames to the columns of the .feat files


### Build names lists
+ Our files are named with an arbitrary integer followed by the extension
    + e.g. '0.featnames' and '0.feat'
     + Below we use list comprehensions to build the file names


In [4]:
# Integer prefixes shared by each "<prefix>.feat" / "<prefix>.featnames" pair.
names = [
    "0", "107", "1684", "1912", "3437",
    "348", "3980", "414", "686", "698",
]
feat_file_names = [f"{prefix}.feat" for prefix in names]
featnames_filenames = [f"{prefix}.featnames" for prefix in names]

In [5]:
# check names
featnames_filenames

['0.featnames',
 '107.featnames',
 '1684.featnames',
 '1912.featnames',
 '3437.featnames',
 '348.featnames',
 '3980.featnames',
 '414.featnames',
 '686.featnames',
 '698.featnames']

### Load files into pandas 

In [6]:
# Read every .featnames / .feat pair; the two result lists stay aligned by
# position because both name lists were built from the same prefixes.
list_feat_names = []
list_features = []
for names_path, feat_path in zip(featnames_filenames, feat_file_names):
    list_feat_names.append(pd.read_csv(names_path, sep=" ", header=None))
    list_features.append(pd.read_csv(feat_path, sep=" ", header=None))

### Check shape matches earlier shape 

In [7]:
list_feat_names[0].shape

(224, 4)

In [8]:
list_features[0].shape

(347, 225)

### Practice on one set of each document

In [9]:
# Trial run on the first file pair: label the feature columns with their
# names and keep the node id as a trailing "node" column.
raw = list_features[0]
indexes = raw.iloc[:, 0]  # first column holds the node id, not a feature
# Take an explicit copy: adding a column to a bare iloc slice is chained
# assignment and triggers pandas' SettingWithCopyWarning.
df = raw.iloc[:, 1:].copy()
df.columns = list_feat_names[0][1]
df["node"] = indexes
df.head()


1,birthday;anonymized,birthday;anonymized.1,birthday;anonymized.2,birthday;anonymized.3,birthday;anonymized.4,birthday;anonymized.5,birthday;anonymized.6,birthday;anonymized.7,education;classes;id;anonymized,education;classes;id;anonymized.1,...,work;start_date;anonymized,work;start_date;anonymized.1,work;start_date;anonymized.2,work;start_date;anonymized.3,work;start_date;anonymized.4,work;start_date;anonymized.5,work;start_date;anonymized.6,work;start_date;anonymized.7,work;with;id;anonymized,node
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


# Build df's as list

In [10]:
# Build one labelled frame per file pair: columns are "<name>_<id>" taken
# from the .featnames frame, and the node id (first raw column) becomes the
# row index rather than a separate column.
final_product = []
for features, feat_names in zip(list_features, list_feat_names):
    node_ids = features.iloc[:, 0]
    labelled = features.iloc[:, 1:]
    labelled.columns = feat_names[1].astype(str) + "_" + feat_names[3].astype(str)
    labelled.index = node_ids
    final_product.append(labelled)
    
    

## check shape of list of df

In [11]:
# Print the (rows, columns) shape of each labelled frame, one per line.
for frame in final_product:
    print(frame.shape)

(347, 224)
(1045, 576)
(792, 319)
(755, 480)
(547, 262)
(227, 161)
(59, 42)
(159, 105)
(170, 63)
(66, 48)


## Build dictionary from df

In [12]:
# Map every node id in 1..4035 to the list of feature rows found for it,
# in the order the frames appear in final_product.
# NOTE(review): ids outside 1..4035 are ignored, exactly as before - confirm
# this range matches the ids actually present in the data.
my_nodes = {node: [] for node in range(1, 4036)}

for frame in final_product:
    # Walk each frame's rows once and look the id up in the dict, instead of
    # scanning the frame's whole index for each of the 4035 candidate ids.
    for node, row in frame.iterrows():
        if node in my_nodes:
            my_nodes[node].append(row)
                

#my_nodes[node].append({x.loc[node,][0].index: x.loc[node,][0].values})

## Check for duplicate values

In [13]:
# Count node ids that occur more than once across all labelled frames.
index_list = []        # distinct ids in first-seen order
duplicates_index = []  # one entry for every repeat occurrence of an id
total_rows = 0
seen = set()           # O(1) membership test; index_list alone needs a linear scan
for frame in final_product:
    for index in frame.index:
        total_rows += 1
        if index in seen:
            duplicates_index.append(index)
        else:
            seen.add(index)
            index_list.append(index)


len(duplicates_index)

132

In [14]:
import os
os.getcwd()

'E:\\facebook'

In [15]:
# Collect the distinct attribute categories (e.g. "education school id")
# named in the .featnames files, in first-seen order.
var = []
var1 = []  # every raw feature name, across all files
for path in featnames_filenames:
    # With the default comma separator each whole line lands in column 0,
    # so the line is split on spaces by hand below.
    lines = pd.read_csv(path, header=None)[0].tolist()
    # Drop the leading row number and the trailing two tokens, keeping the
    # ";"-separated feature name.
    file_names = [" ".join(line.split(" ")[1:-2]) for line in lines]
    var1.extend(file_names)
    # Strip the final ";"-component and join the rest with spaces. Only this
    # file's names are processed; previously the whole accumulated var1 was
    # re-processed for every file.
    var.extend(" ".join(name.split(";")[:-1]) for name in file_names)
var = pd.Series(var).unique().tolist()
var

['birthday',
 'education classes id',
 'education concentration id',
 'education degree id',
 'education school id',
 'education type',
 'education with id',
 'education year id',
 'first_name',
 'gender',
 'hometown id',
 'languages id',
 'last_name',
 'locale',
 'location id',
 'work employer id',
 'work end_date',
 'work location id',
 'work position id',
 'work start_date',
 'work with id',
 'middle_name',
 'work from id',
 'work projects id',
 'religion',
 'name',
 'political']

# Build dictionary of Attribute category: categorical answer
+ Some nodes are not present in the dataset, so nodes without any data have to be skipped
+ It's not exactly in the form we need yet, but it should be only a step or two away

In [16]:
# For every node id in 1..4035, list the features whose value is 1. Each
# entry is the column name split on "anonymized_", i.e. a
# [category prefix, feature id] pair such as ['gender;', '77'].
blank_nodes = {node: [] for node in range(1, 4036)}

for node in range(1, 4036):
    rows = my_nodes.get(node)
    if not rows:
        # No feature row was collected for this node; leave its list empty.
        # This explicit check replaces a bare `except` that hid every error.
        continue
    first_row = rows[0]  # only the first collected row for a node is used
    for col_name, value in zip(first_row.index, first_row.values):
        if value == 1:
            blank_nodes[node].append(col_name.split("anonymized_"))
            

In [21]:
blank_nodes



{1: [['gender;', '77'], ['locale;', '127']],
 2: [['education;school;id;', '35'],
  ['education;type;', '53'],
  ['education;type;', '55'],
  ['education;year;id;', '57'],
  ['gender;', '78'],
  ['languages;id;', '92'],
  ['languages;id;', '98'],
  ['last_name;', '114'],
  ['locale;', '126'],
  ['location;id;', '135']],
 3: [['birthday;', '7'],
  ['education;concentration;id;', '14'],
  ['education;school;id;', '34'],
  ['education;school;id;', '50'],
  ['education;type;', '53'],
  ['education;type;', '55'],
  ['education;year;id;', '59'],
  ['education;year;id;', '65'],
  ['gender;', '78'],
  ['languages;id;', '92'],
  ['locale;', '127'],
  ['location;id;', '137'],
  ['work;end_date;', '168'],
  ['work;end_date;', '170'],
  ['work;location;id;', '137'],
  ['work;start_date;', '164'],
  ['work;start_date;', '202']],
 4: [['education;school;id;', '50'],
  ['education;type;', '53'],
  ['education;type;', '55'],
  ['education;with;id;', '56'],
  ['gender;', '78'],
  ['locale;', '127']],
 

# Build blank df of NAN

## Prove col names are same

In [18]:
# my_list=[x for x in my_nodes[1][0].index]
# my_list2= [x for x in df.iloc[0,:].index]

# for x in my_list:
#     if x in my_list2:
#         print(x)

## Build blank df 


## trying to build df's w concat

In [19]:
#df = pd.DataFrame(np.nan, index=np.arange(1,4040), columns=col_list)

# new_df=pd.DataFrame(data = [my_nodes[1][0].values] * len(my_nodes[1][0]), columns = my_nodes[1][0].index)
# df.merge(new_df, left_index=True, right_index=True)
         

# df.merge(my_nodes[1][0].to_frame().T, left_index=True, right_index=True)
# my_nodes[1][0].to_frame().T
# new_df= pd.concat([final_product[0],final_product[1],final_product[2],final_product[3],final_product[4],final_product[5],final_product[6],final_product[7],final_product[8],final_product[9]])
# df_9=pd.concat([final_product[0],final_product[1],final_product[2]])