# Network Science Final Project
Roberto & Gacera

## Data Wrangling
### Conversion to nodes and edges of the 2019 election city-level results in Region I (Ilocos)
 - Network: bipartite (directed)
 - Nodes: cities and politicians
 - Edges: an edge exists between a city and a politician if the city's normalized vote for that politician meets a given threshold
 - Weight: normalized votes (the candidate's vote percentage in a city divided by the highest vote percentage for the same position in that city)

In [1]:
import pandas as pd
import pickle
import re

# Load the city-level 2019 vote table from a local pickle.
# NOTE: unpickling can execute arbitrary code; only load a trusted file.
with open('city_votes_2019.pkl', mode='rb') as votes_file:
    df = pickle.load(votes_file)

### Filter to Ilocos region

In [2]:
# Keep only the Region I (Ilocos) rows. Take an explicit copy so that the
# column assignments in later cells write to an independent frame rather
# than to a slice of the loaded table (avoids SettingWithCopyWarning).
df = df.loc[df['region'] == 'REGION I'].copy()

#### Retrieve party from candidate official election name

In [3]:
def _party_from_name(name):
    """Return the text between the first '(' and the last ')' in `name`, or '' if there is none."""
    found = re.search(r'\((.*)\)', name)
    return found.group(1) if found else ''


# Names without a (non-empty) parenthesised tag are labelled 'PARTY LIST'.
extracted_party = df['candidate'].map(_party_from_name)
df['party'] = extracted_party.where(extracted_party != '', 'PARTY LIST')

In [4]:
# Peek at the first two rows after adding the `party` column.
df.iloc[:2]

Unnamed: 0,region,province,city,position,candidate,candidate_party,votes,total_votes,percentage,party
0,REGION I,ILOCOS NORTE,ADAMS,SENATOR,"ABEJO, VANGIE (IND)",LGBTQ PARTY,6,5366,0.0011,IND
1,REGION I,ILOCOS NORTE,BACARRA,SENATOR,"ABEJO, VANGIE (IND)",LGBTQ PARTY,89,87217,0.001,IND


#### Determine winning candidates by getting the maximum percentage value per position per city

In [5]:
# Highest vote percentage per (region, province, city, position), kept in a
# column named `maxs`.
group_keys = ['region', 'province', 'city', 'position']
maxs = (
    df.groupby(group_keys)['percentage']
    .max()
    .rename('maxs')
    .reset_index()
)

In [6]:
# Attach each group's top percentage to every candidate row of that group.
# The join keys are spelled out; they are the columns the two frames share.
df = df.merge(maxs, how='inner', on=['region', 'province', 'city', 'position'])

#### Compute the edge weight: candidate's vote percentage / vote percentage of the winning candidate

In [7]:
# Normalized vote: percentage relative to the top percentage for the same
# city and position (1.0 for the top candidate).
df['per_sc'] = df['percentage'].div(df['maxs'])

In [8]:
# Peek at the first two rows with the new `maxs` and `per_sc` columns.
df.iloc[:2]

Unnamed: 0,region,province,city,position,candidate,candidate_party,votes,total_votes,percentage,party,maxs,per_sc
0,REGION I,ILOCOS NORTE,ADAMS,SENATOR,"ABEJO, VANGIE (IND)",LGBTQ PARTY,6,5366,0.0011,IND,0.1731,0.006355
1,REGION I,ILOCOS NORTE,ADAMS,SENATOR,"AFUANG, ABNER (WPP)",LABOR PARTY PHILIPPINES,5,5366,0.0009,WPP,0.1731,0.005199


#### Filter to the top N candidates per position
 - senator: top 30
 - House of Representatives: top 5
 - mayor: top 5
 - vice mayor: top 5
 - party list: top 51
 - governor: all since there were only a few candidates
 - vice governor: all since there were only a few candidates

In [9]:
# Alphabetical list of the distinct positions contested.
sorted(df['position'].drop_duplicates())

['MAYOR',
 'MEMBER, HOUSE OF REPRESENTATIVES',
 'PARTY LIST',
 'PROVINCIAL GOVERNOR',
 'PROVINCIAL VICE-GOVERNOR',
 'SENATOR',
 'VICE-MAYOR']

In [10]:
# Sort descending on every key so that, within each city/position group, the
# rows with the highest percentage come first; then split by position.
sort_columns = ['region', 'province', 'city', 'position', 'percentage']
df = df.sort_values(by=sort_columns, ascending=False)

position = df['position']
sen = df.loc[position == 'SENATOR']
rep = df.loc[position == 'MEMBER, HOUSE OF REPRESENTATIVES']
mayor = df.loc[position == 'MAYOR']
vmayor = df.loc[position == 'VICE-MAYOR']
plist = df.loc[position == 'PARTY LIST']
gov = df.loc[position == 'PROVINCIAL GOVERNOR']
vgov = df.loc[position == 'PROVINCIAL VICE-GOVERNOR']

In [11]:
# Keep the first N rows of every city/position group. The frames are expected
# to be ordered by descending percentage already, so these are the top N.
city_position = ['region', 'province', 'city', 'position']
sens, reps, mayors, vmayors, plists = (
    frame.groupby(city_position).head(limit)
    for frame, limit in (
        (sen, 30),
        (rep, 5),
        (mayor, 5),
        (vmayor, 5),
        (plist, 51),
    )
)

In [12]:
# Stack the per-position selections (governor and vice-governor unfiltered)
# and discard the helper column that is no longer needed.
selected = [sens, reps, mayors, vmayors, plists, gov, vgov]
df = pd.concat(selected).drop('maxs', axis=1)

In [13]:
# Peek at the first two rows of the filtered table.
df.iloc[:2]

Unnamed: 0,region,province,city,position,candidate,candidate_party,votes,total_votes,percentage,party,per_sc
7687,REGION I,PANGASINAN,VILLASIS,SENATOR,"VILLAR, CYNTHIA (NP)",NACIONALISTA PARTY,21830,277610,0.0786,NP,1.0
7680,REGION I,PANGASINAN,VILLASIS,SENATOR,"POE, GRACE (IND)",LGBTQ PARTY,21041,277610,0.0757,IND,0.963104


### Create nodes table

#### Retrieve unique cities
Since some cities have the same name, get the unique cities using `province` and `city` columns.

In [14]:
# One row per distinct (province, city) pair. The first reset renumbers the
# rows from 0; the second turns that numbering into a regular column.
cities = (
    df.loc[:, ['province', 'city']]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
)

#### Set node type to CITY

In [15]:
# Tag every row of the city table as a CITY node.
cities = cities.assign(Node_Type='CITY')

In [16]:
# The column produced by reset_index() becomes the node Id; the other
# columns keep their names.
cities = cities.rename(columns={'index': 'Id'})

In [17]:
# Node label is "PROVINCE/CITY", which disambiguates cities sharing a name.
cities = cities.assign(Label=cities['province'] + '/' + cities['city'])

In [18]:
# Keep only the node-table columns, in their final order.
cities = cities.loc[:, ['Id', 'Label', 'Node_Type']]

In [19]:
# Show the last five city nodes.
cities.iloc[-5:]

Unnamed: 0,Id,Label,Node_Type
120,120,ILOCOS NORTE/BANNA (ESPIRITU),CITY
121,121,ILOCOS NORTE/BANGUI,CITY
122,122,ILOCOS NORTE/BADOC,CITY
123,123,ILOCOS NORTE/BACARRA,CITY
124,124,ILOCOS NORTE/ADAMS,CITY


#### Retrieve unique politicians

In [20]:
# One row per distinct (candidate, party) pair, numbered from 0 in a regular
# column created by the second reset_index().
pols = (
    df.loc[:, ['candidate', 'party']]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
)

In [21]:
# Align the politician table with the node schema: the candidate name is the
# label and the party is stored as the node type.
pols = pols.rename(columns={'index': 'Id', 'candidate': 'Label', 'party': 'Node_Type'})

#### Change Id such that it continues from the city Id values

In [22]:
# Offset politician Ids by the number of city nodes so the two Id ranges do
# not overlap (city Ids run from 0 to len(cities) - 1). Deriving the offset
# from the city table avoids a hard-coded count that goes stale when the
# input data or filters change.
pols['Id'] = pols['Id'] + len(cities)

In [23]:
# Show the last five politician nodes.
pols.iloc[-5:]

Unnamed: 0,Id,Label,Node_Type
733,858,"AQUINO, JULIUS (KDP)",KDP
734,859,"SINGSON, JERRY (BILEG)",BILEG
735,860,"ZARAGOZA, ANICKA (PDPLBN)",PDPLBN
736,861,"MARCOS, MARIANO II (NP)",NP
737,862,"RAMONES, MICHAEL (PDPLBN)",PDPLBN


#### Combine cities and politicians into 1 table

In [24]:
# Single node table: city rows first, politician rows after, with a fresh
# 0..n-1 row index.
nodes = pd.concat([cities, pols], ignore_index=True)

In [25]:
# Persist the node table as a pickle for later notebooks.
with open('ILOCOS_nodes_2019.pkl', mode='wb') as nodes_file:
    pickle.dump(nodes, nodes_file)

In [26]:
# Also export the node table as CSV, without the row index.
nodes.to_csv(path_or_buf='ILOCOS_nodes_2019.csv', index=False)

### Keep only the rows with significant normalized votes (>= median)

In [27]:
# Summary statistics of the normalized vote; the 50% row is the median.
df['per_sc'].describe()

count    11480.000000
mean         0.221897
std          0.309492
min          0.000000
25%          0.014144
50%          0.060663
75%          0.316539
max          1.000000
Name: per_sc, dtype: float64

In [28]:
# (rows, columns) of the table before the normalized-vote filter.
df.shape

(11480, 11)

In [29]:
# Keep rows whose normalized vote is at least the median. The threshold is
# computed from the data instead of being copied by hand from the describe()
# output, so it stays correct if the input or earlier filters change.
# `.copy()` yields an independent frame, so the column added in the next
# cell is not written into a slice.
median_weight = df['per_sc'].median()
df = df.loc[df['per_sc'] >= median_weight].copy()

In [30]:
# (rows, columns) of the table after the normalized-vote filter.
df.shape

(5740, 11)

In [31]:
# Build the "PROVINCE/CITY" key used to match rows to city node labels.
# `assign` returns a new frame, so nothing is written into the filtered slice
# produced by the previous cell (which can emit SettingWithCopyWarning).
df = df.assign(prov_city=df['province'] + '/' + df['city'])

### Create combined table of main dataframe and node labels (for future reference)

In [33]:
# Look up the city node (Id, Label, Node_Type) for every row via its
# "PROVINCE/CITY" key.
df = df.merge(nodes, how='inner', left_on='prov_city', right_on='Label')

In [34]:
# Look up the politician node for every row via the candidate name. Node
# columns already present in `df` clash with the incoming ones, so pandas
# suffixes them with _x (existing) and _y (incoming).
df = df.merge(nodes, how='inner', left_on='candidate', right_on='Label')

In [35]:
# (rows, columns) after joining the node table on both the city and the candidate.
df.shape

(5740, 18)

In [36]:
# Peek at the first two rows of the combined table.
df.iloc[:2]

Unnamed: 0,region,province,city,position,candidate,candidate_party,votes,total_votes,percentage,party,per_sc,prov_city,Id_x,Label_x,Node_Type_x,Id_y,Label_y,Node_Type_y
0,REGION I,PANGASINAN,VILLASIS,SENATOR,"VILLAR, CYNTHIA (NP)",NACIONALISTA PARTY,21830,277610,0.0786,NP,1.0,PANGASINAN/VILLASIS,0,PANGASINAN/VILLASIS,CITY,125,"VILLAR, CYNTHIA (NP)",NP
1,REGION I,PANGASINAN,URDANETA CITY,SENATOR,"VILLAR, CYNTHIA (NP)",NACIONALISTA PARTY,42312,586456,0.0721,NP,0.990385,PANGASINAN/URDANETA CITY,1,PANGASINAN/URDANETA CITY,CITY,125,"VILLAR, CYNTHIA (NP)",NP


In [41]:
# Persist the combined table (votes plus node labels) for future reference.
with open('ILOCOS_df_2019.pkl', mode='wb') as combined_file:
    pickle.dump(df, combined_file)

### Create edges table

In [42]:
# Edge table source columns: city node Id, politician node Id, normalized
# vote, and the contested position.
edges = df.loc[:, ['Id_x', 'Id_y', 'per_sc', 'position']]

In [43]:
# Rename to the Source/Target/Weight edge-list headers.
edges = edges.rename(
    columns={
        'Id_x': 'Source',
        'Id_y': 'Target',
        'per_sc': 'Weight',
        'position': 'Position',
    }
)

#### Edge table:
- Source: city Id
- Target: politician Id
- Weight: normalized vote

In [44]:
# Peek at the first two edges.
edges.iloc[:2]

Unnamed: 0,Source,Target,Weight,Position
0,0,125,1.0,SENATOR
1,1,125,0.990385,SENATOR


In [47]:
# Persist the edge table as a pickle for later notebooks.
with open('ILOCOS_edges_2019.pkl', mode='wb') as edges_file:
    pickle.dump(edges, edges_file)

In [48]:
# Also export the edge table as CSV, without the row index.
edges.to_csv(path_or_buf='ILOCOS_edges_2019.csv', index=False)