Datasets:
*	ref_table_precinct_locations_PSGC.csv – lookup table for precincts
*	results_president.csv – precinct-level election results for the 2016 presidential race
*	results_vice-president.csv – precinct-level election results for the 2016 vice presidential race

Tasks:
1.	Create a denormalized table replacing precinct_code in the results_*.csv files with the columns: region, province, municipality, and barangay. 

2.	Create an interesting data visualization using this dataset.

Tools:
-	Code should be written in a Jupyter Notebook
-	For data transformation you can use Pandas
-	For visualization use matplotlib or seaborn
-	Put your code in a GitHub repository

Extra:
-	Bonus points if you use PySpark for data transformation
-	Bonus points if you use Tableau for visualization
 
Good luck and enjoy. 😊


# Denormalize Data Using PySpark

In [382]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import udf, first
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Reuse the active Spark context/session when one already exists so this cell
# can be re-run safely; a bare SparkContext() raises if a context is already
# running in the kernel.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [329]:
# Load the four input tables, each with a header row. The names are unpacked
# in the same order as the file paths listed below.
president, vicepres, mappings, psgc = (
    spark.read.csv(csv_path, header=True)
    for csv_path in (
        'results_president.csv',
        'results_vice-president.csv',
        'ref_table_precinct_locations.csv',
        'ref_table_precinct_locations_PSGC.csv',
    )
)

In [378]:
# Peek at one raw presidential-results row to see the available columns.
# No schema is supplied when the CSV is read, and the values shown are strings.
president.first()

Row(precinct_code='69140005', contest_code='199009', candidate_name='DUTERTE, RODY (PDPLBN)', party_code='114', votes='124', col5='3', ballots_cast='639', col7='13', col8='8', timestamp='05/09/2016 17:05:50', pct_votes='0.194053208138')

In [330]:
# Display labels for the raw income_class codes. Ordinal codes simply gain a
# " Class" suffix; '-' and the literal string 'None' are handled separately.
income_class_map = {
    ordinal: ordinal + ' Class'
    for ordinal in ('1st', '2nd', '3rd', '4th', '5th', '6th')
}
income_class_map.update({'-': 'Unknown', 'None': 'None'})

In [331]:
# Spark UDF that stringifies its input and strips non-breaking spaces (U+00A0).
# Because the value goes through str(), a null input becomes the text 'None'.
bytes_cleaner = udf(
    lambda raw_value: str(raw_value).replace('\xa0', u'')
)

In [332]:
# Enrich the precinct lookup with selected PSGC attributes. The left join keeps
# every row of `mappings`, whether or not a PSGC match exists.
mapper = (
    mappings
    .join(
        psgc.select(
            'precinct_code', 'is_city', 'income_class',
            'population', 'land_area', 'municipality_CM'
        ),
        on=(mappings['precinct_code'] == psgc['precinct_code']),
        how='left'
    )
    # Drop the join key by its PSGC column reference.
    .drop(psgc['precinct_code'])
)

In [None]:
# Preview the timestamp that first() picks for each precinct. This is the
# same aggregation used below when building the denormalized vote tables.
# (The cell previously held a truncated Scala snippet that is not valid
# Python and would abort a Restart & Run All.)
(president
    .groupby('precinct_code')
    .agg(first('timestamp'))
    .show())

+-------------+-----------------------+
|precinct_code|first(timestamp, false)|
+-------------+-----------------------+
|     10050007|    05/09/2016 18:28:46|
|     10050017|    05/09/2016 18:35:36|
|     10050070|    05/09/2016 19:43:57|
|     10080008|    05/09/2016 19:08:46|
|     10110019|    05/09/2016 18:38:53|
|     10120012|    05/09/2016 18:37:23|
|     10140018|    05/09/2016 18:34:55|
|     10170033|    05/09/2016 18:38:17|
|     10200026|    05/09/2016 17:32:22|
|     10210080|    05/09/2016 17:42:44|
|     10230061|    05/09/2016 19:50:44|
|     10280006|    05/09/2016 20:10:34|
|     10310148|    05/09/2016 20:05:54|
|     11120004|    05/09/2016 19:25:01|
|      1180010|    05/09/2016 19:37:38|
|     12110007|    05/09/2016 17:34:03|
|     12120029|    05/09/2016 22:14:17|
|     12130007|    05/09/2016 21:27:29|
|     12240008|    05/09/2016 22:14:09|
|     12380022|    05/09/2016 18:47:40|
+-------------+-----------------------+
only showing top 20 rows



In [391]:
# Denormalized presidential results: one row per precinct, one column per
# candidate (votes cast to int), enriched with the location/PSGC attributes
# from `mapper`, a voter-turnout ratio, a cleaned income class label and a
# per-precinct timestamp.
votes_president = (president.select('precinct_code',
                  'candidate_name', 
                  president.votes.astype('int'))
        # Pivot candidates into columns; max() collapses each
        # (precinct, candidate) group to a single vote count.
        .groupby('precinct_code')
        .pivot('candidate_name')
        .max('votes')
        # Left join so precincts without a lookup entry are still kept.
        .join(mapper, 
              on=president.precinct_code == mapper.precinct_code, 
              how='left')
        .drop(mapper.precinct_code)
        # NOTE(review): the CSVs are loaded without a schema, so this division
        # presumably relies on Spark casting the string columns — confirm.
        .withColumn('voter_turnout', mapper['ballots_cast'] / mapper['registered_voters'])
        # Strip non-breaking spaces, then map raw codes to display labels.
        .withColumn('income_class', bytes_cleaner(mapper.income_class))
        .replace(to_replace=income_class_map, subset=['income_class'])
        # Attach one timestamp per precinct. first() is used with no explicit
        # ordering, so which row supplies the value is not pinned down here.
        .join((president.select('precinct_code', 
                                'timestamp')
                    .groupby('precinct_code')
                    .agg(first('timestamp').alias('timestamp'))), on='precinct_code', how='left')
        # NOTE(review): appears intended to remove precinct_code from the final
        # table (the task replaces it with location columns) — confirm that the
        # reference still resolves after the name-based join above.
        .drop(president.precinct_code)
)

In [334]:
# Denormalized vice-presidential results: one row per precinct, one column per
# candidate (votes cast to int), enriched with the location/PSGC attributes
# from `mapper`, a voter-turnout ratio, a cleaned income class label and a
# per-precinct timestamp. Same pipeline shape as the presidential table.
votes_vice = (vicepres.select('precinct_code',
                  'candidate_name', 
                  vicepres.votes.astype('int'))
        # Pivot candidates into columns; max() collapses each
        # (precinct, candidate) group to a single vote count.
        .groupby('precinct_code')
        .pivot('candidate_name')
        .max('votes')
        # Left join so precincts without a lookup entry are still kept.
        .join(mapper, 
              on=vicepres.precinct_code == mapper.precinct_code, 
              how='left')
        .drop(mapper.precinct_code)
        # NOTE(review): the CSVs are loaded without a schema, so this division
        # presumably relies on Spark casting the string columns — confirm.
        .withColumn('voter_turnout', mapper['ballots_cast'] / mapper['registered_voters'])
        # Strip non-breaking spaces, then map raw codes to display labels.
        .withColumn('income_class', bytes_cleaner(mapper.income_class))
        .replace(to_replace=income_class_map, subset=['income_class'])
        # Attach one timestamp per precinct. first() is used with no explicit
        # ordering, so which row supplies the value is not pinned down here.
        .join((vicepres.select('precinct_code', 
                               'timestamp')
                    .groupby('precinct_code')
                    .agg(first('timestamp').alias('timestamp'))), on='precinct_code', how='left')
        # NOTE(review): appears intended to remove precinct_code from the final
        # table (the task replaces it with location columns) — confirm that the
        # reference still resolves after the name-based join above.
        .drop(vicepres.precinct_code)
)

In [335]:
# (votes_president.repartition(1)
#    .write.format("com.databricks.spark.csv")
#    .option("header", "true")
#    .save("president"))

# (votes_vice.repartition(1)
#    .write.format("com.databricks.spark.csv")
#    .option("header", "true")
#    .save("vice_president"))

In [324]:
# Peek at one raw vice-presidential-results row to see the available columns.
# No schema is supplied when the CSV is read, and the values shown are strings.
vicepres.first()

Row(precinct_code='10030074', contest_code='299009', candidate_name='ROBREDO, LENI DAANG MATUWID (LP)', party_code='85', votes='185', col5='5', ballots_cast='535', col7='25', col8='9', timestamp='05/09/2016 17:07:51', pct_votes='0.345794392523')

# Get Geospatial Matches

In [314]:
from fuzzywuzzy import fuzz
from bs4 import BeautifulSoup
import requests
from time import sleep
import geopandas as gpd

In [318]:
# Load the GADM shapefile for the Philippines into a GeoDataFrame.
ph3 = gpd.read_file('gadm36_PHL_3.shp')

'Batangas'

In [336]:
# Plain pandas table of the two GADM name columns, keeping ph3's row index.
# NAME_1 is what election provinces are fuzzy-matched against further down.
gadm = pd.DataFrame({
    'NAME_1': ph3['NAME_1'],
    'NAME_2': ph3['NAME_2'],
})

In [365]:
# Manual overrides from election-data province labels to the names used in
# GADM's NAME_1. These are checked before any fuzzy matching is attempted.
province_mapper = dict([
    ('DAVAO (DAVAO DEL NORTE)', 'Davao del Norte'),
    ('NATIONAL CAPITAL REGION - MANILA', 'Metropolitan Manila'),
    ('COTABATO (NORTH COT.)', 'North Cotabato'),
    ('NATIONAL CAPITAL REGION - SECOND DISTRICT', 'Metropolitan Manila'),
    ('NATIONAL CAPITAL REGION - THIRD DISTRICT', 'Metropolitan Manila'),
    ('NATIONAL CAPITAL REGION - FOURTH DISTRICT', 'Metropolitan Manila'),
    ('TAGUIG - PATEROS', 'Metropolitan Manila'),
    ('DAVAO OCCIDENTAL', 'Davao del Sur'),
    ('SAMAR (WESTERN SAMAR)', 'Samar'),
])
# These remaining labels are all collapsed into a single 'Overseas' bucket.
province_mapper.update(dict.fromkeys(
    ('EUROPE', 'MIDDLE EAST AND AFRICAS', 'NORTH AND LATIN AMERICA', 'ASIA'),
    'Overseas',
))

In [366]:
# Candidate province names, materialised once as a plain list so the UDF does
# not recompute `unique()` per row or capture the whole pandas frame.
gadm_province_names = gadm['NAME_1'].unique().tolist()


def _closest_province(province):
    """Map an election-data province label to a GADM NAME_1 value.

    Manual overrides in `province_mapper` take precedence. Otherwise the
    candidate with the highest `fuzz.ratio` against the lower-cased label is
    returned; on ties the earliest candidate wins, as with the previous
    sort-and-take-first approach.

    Returns None for a null label instead of fuzzy-matching the text "None".
    """
    if province is None:
        return None
    label = str(province)
    if label in province_mapper:
        return province_mapper[label]
    needle = label.lower()
    return max(
        gadm_province_names,
        key=lambda candidate: fuzz.ratio(needle, candidate.lower()),
    )


# Spark UDF wrapper around the matcher above.
province_NAME = udf(_closest_province)

In [367]:
# Add the GADM-aligned province name (NAME_1) to both denormalized tables.
votes_vice, votes_president = (
    frame.withColumn('NAME_1', province_NAME(frame['province']))
    for frame in (votes_vice, votes_president)
)

In [375]:
# Collect the vice-presidential table into a pandas DataFrame for inspection.
# NOTE(review): `really` is not a descriptive name; renaming it also requires
# updating the later cell that indexes it.
really = votes_vice.toPandas()

In [376]:
# Display the distinct (NAME_1, NAME_2) pairs. The result is not assigned, so
# `gadm` itself is left unchanged.
gadm.drop_duplicates()

Unnamed: 0,NAME_1,NAME_2
0,Abra,Bangued
31,Abra,Boliney
39,Abra,Bucay
60,Abra,Bucloc
64,Abra,Daguioman
...,...,...
41825,Zamboanga Sibugay,Roseller Lim
41851,Zamboanga Sibugay,Siay
41880,Zamboanga Sibugay,Talusan
41894,Zamboanga Sibugay,Titay


In [377]:
# Show the matched NAME_1 next to each row's municipality from the election data.
really[['NAME_1', 'municipality']]

Unnamed: 0,NAME_1,municipality
0,Batangas,BATANGAS CITY
1,Batangas,BATANGAS CITY
2,Batangas,BATANGAS CITY
3,Batangas,CALATAGAN
4,Batangas,LAUREL
...,...,...
90355,Compostela Valley,MAWAB
90356,Zamboanga Sibugay,BUUG
90357,Zamboanga Sibugay,KABASALAN
90358,Davao del Sur,SANTA MARIA


# Get Congress Data

In [311]:
# Fetch the House members page and parse its members table.
# 'lxml' is the HTML parser and belongs to BeautifulSoup. Passing it as the
# second positional argument of requests.get() sends it as query parameters.
request = requests.get('http://www.congress.gov.ph/members/', timeout=30)
# Fail loudly on an HTTP error instead of leaving `congress` undefined (or
# stale from an earlier run) for the cells below.
request.raise_for_status()
soup = BeautifulSoup(request.text, 'lxml')

member_tables = soup.select('body > div > div > div.col-md-8 > table')
if not member_tables:
    raise ValueError('Members table not found; the page layout may have changed.')
congress = pd.read_html(str(member_tables[0]))[0]

In [312]:
# Count members per municipality, taken as the first comma-separated token of
# the location column.
# Work on an explicit copy: assigning a column on a `.loc` slice triggers
# SettingWithCopyWarning, and the scraped frame is no longer renamed in place.
# NOTE: this cell rebinds `congress`, so re-run the scrape cell before
# re-running this one.
members = congress.loc[1:].copy()  # drops the row labelled 0 — presumably a header row parsed as data; confirm
members.columns = ['name', 'location']
members['municipality'] = members['location'].map(lambda place: str(place).split(',')[0])

congress = (
    members
    .groupby('municipality', as_index=False)['location']
    .count()
    .rename(columns={'location': 'count'})
)
# Exclude entries whose leading token contains 'Party' (presumably party-list
# seats rather than places — confirm).
congress = congress[~congress['municipality'].str.contains('Party')]