Datasets:
*	ref_table_precinct_locations_PSGC.csv – lookup table for precincts
*	results_president.csv – precinct-level election results for the 2016 presidential race
*	results_vice-president.csv – precinct-level election results for the 2016 vice presidential race

Tasks:
1.	Create a denormalized table by replacing the precinct_code column in each results_*.csv file with the columns region, province, municipality, and barangay.

2.	Create an interesting data visualization using this dataset.

Tools:
-	Code should be written in a Jupyter Notebook
-	For data transformation you can use Pandas
-	For visualization use matplotlib or seaborn
-	Put your code in a GitHub repository

Extra:
-	Bonus points if you use PySpark for data transformation
-	Bonus points if you use Tableau for visualization
 
Good luck and enjoy. 😊


In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import udf, explode
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Reuse an already-running context/session instead of constructing new ones:
# calling SparkContext() a second time (e.g. when this cell is re-run) fails
# because only one SparkContext may be active per process.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [3]:
# Load the four input tables; each file's first row is used as the column
# header. No schema is supplied or inferred here.
president = spark.read.option('header', True).csv('results_president.csv')
vicepres = spark.read.option('header', True).csv('results_vice-president.csv')
mappings = spark.read.option('header', True).csv('ref_table_precinct_locations.csv')
psgc = spark.read.option('header', True).csv('ref_table_precinct_locations_PSGC.csv')

In [4]:
# Display labels for the raw income-class values: ordinals get a " Class"
# suffix, '-' is labelled 'Unknown', and the literal text 'None' (what str()
# yields for a missing value in bytes_cleaner) is kept unchanged.
income_class_map = dict(
    [(rank, rank + ' Class') for rank in ('1st', '2nd', '3rd', '4th', '5th', '6th')]
    + [('-', 'Unknown'), ('None', 'None')]
)


def _strip_nbsp(value):
    """Return ``str(value)`` with every non-breaking space (U+00A0) removed."""
    return str(value).replace('\xa0', u'')


# Column-level wrapper so the cleaner can be applied inside DataFrame expressions.
bytes_cleaner = udf(_strip_nbsp)

In [5]:
# Precinct lookup table: every row of `mappings`, enriched with the selected
# PSGC attributes for the same precinct_code (left join, so precincts without
# a PSGC match are kept with nulls in the added columns).
psgc_attributes = psgc.select(
    'precinct_code',
    'is_city',
    'income_class',
    'population',
    'land_area',
    'municipality_CM',
)
same_precinct = mappings.precinct_code == psgc.precinct_code
mapper = (
    mappings
    .join(psgc_attributes, on=same_precinct, how='left')
    # Both sides carry precinct_code; keep only the copy from `mappings`.
    .drop(psgc.precinct_code)
)

In [6]:
# Stage 1: one row per precinct with one column per presidential candidate,
# holding the maximum integer vote count recorded for that candidate.
president_by_precinct = (
    president
    .select('precinct_code', 'candidate_name', president.votes.cast('int'))
    .groupBy('precinct_code')
    .pivot('candidate_name')
    .max('votes')
)

# Stage 2: attach the precinct attributes from `mapper` (left join keeps
# precincts that have no match) and drop mapper's duplicate join key.
president_with_location = president_by_precinct.join(
    mapper,
    president.precinct_code == mapper.precinct_code,
    'left',
).drop(mapper.precinct_code)

# Stage 3: add the turnout ratio, clean and relabel income_class, and finally
# remove precinct_code so only the descriptive location columns remain.
president_turnout = mapper['ballots_cast'] / mapper['registered_voters']
votes_president = (
    president_with_location
    .withColumn('vote_showup_rate', president_turnout)
    .withColumn('income_class', bytes_cleaner(mapper.income_class))
    .replace(income_class_map, subset=['income_class'])
    .drop(president.precinct_code)
)

IllegalArgumentException: 'Unsupported class file major version 57'

In [None]:
# Stage 1: one row per precinct with one column per vice-presidential
# candidate, holding the maximum integer vote count recorded for that candidate.
vice_by_precinct = (
    vicepres
    .select('precinct_code', 'candidate_name', vicepres.votes.cast('int'))
    .groupBy('precinct_code')
    .pivot('candidate_name')
    .max('votes')
)

# Stage 2: attach the precinct attributes from `mapper` (left join keeps
# precincts that have no match) and drop mapper's duplicate join key.
vice_with_location = vice_by_precinct.join(
    mapper,
    vicepres.precinct_code == mapper.precinct_code,
    'left',
).drop(mapper.precinct_code)

# Stage 3: add the turnout ratio, clean and relabel income_class, and finally
# remove precinct_code so only the descriptive location columns remain.
vice_turnout = mapper['ballots_cast'] / mapper['registered_voters']
votes_vice = (
    vice_with_location
    .withColumn('vote_showup_rate', vice_turnout)
    .withColumn('income_class', bytes_cleaner(mapper.income_class))
    .replace(income_class_map, subset=['income_class'])
    .drop(vicepres.precinct_code)
)