Datasets:
*	ref_table_precinct_locations_PSGC.csv – lookup table for precincts
*	results_president.csv – precinct-level election results for the 2016 presidential race
*	results_vice-president.csv – precinct-level election results for the 2016 vice presidential race

Tasks:
1.	Create a denormalized table replacing precinct_code in the results_*.csv files with the columns: region, province, municipality, and barangay. 

2.	Create an interesting data visualization using this dataset.

Tools:
-	Code should be written in a Jupyter Notebook
-	For data transformation you can use Pandas
-	For visualization use matplotlib or seaborn
-	Put your code in a Github repository

Extra:
-	Bonus points if you use PySpark for data transformation
-	Bonus points if you use Tableau for visualization
 
Good luck and enjoy. 😊


In [53]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import udf, explode
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
sc = SparkContext()
spark = SparkSession(sc)

In [49]:
president = spark.read.csv('results_president.csv', header=True)
mappings = spark.read.csv('ref_table_precinct_locations.csv', header=True)

In [50]:
president.first()

Row(precinct_code='69140005', contest_code='199009', candidate_name='DUTERTE, RODY (PDPLBN)', party_code='114', votes='124', col5='3', ballots_cast='639', col7='13', col8='8', timestamp='05/09/2016 17:05:50', pct_votes='0.194053208138')

In [37]:
mappings.createOrReplaceTempView('mapping')
president.createOrReplaceTempView('pres')

In [60]:
pres.head()

Unnamed: 0,precinct_code,contest_code,candidate_name,party_code,votes,col5,ballots_cast,col7,col8,timestamp,pct_votes
0,69140005,199009,"DUTERTE, RODY (PDPLBN)",114,124,3,639,13,8,05/09/2016 17:05:50,0.194053
1,69140005,199009,"DEFENSOR SANTIAGO, MIRIAM (PRP)",135,12,2,639,13,8,05/09/2016 17:05:50,0.018779
2,69140005,199009,"BINAY, JOJO (UNA)",163,94,1,639,13,8,05/09/2016 17:05:50,0.147105
3,69140005,199009,"ROXAS, MAR DAANG MATUWID (LP)",85,126,5,639,13,8,05/09/2016 17:05:50,0.197183
4,69140005,199009,"SEÑERES, ROY (WPPPMM)",165,1,6,639,13,8,05/09/2016 17:05:50,0.001565


In [66]:
president.groupby(president.precinct_code).pivot('candidate_name')

IllegalArgumentException: 'Unsupported class file major version 57'

In [61]:
query = '''
PIVOT(
    COUNT(precinct_code)
    FOR party_code IN (
    [114],
    [135]
    )
    )'''

In [62]:
spark.sql(query)

ParseException: "\nextraneous input 'PIVOT' expecting {'(', 'SELECT', 'FROM', 'ADD', 'DESC', 'WITH', 'VALUES', 'CREATE', 'TABLE', 'INSERT', 'DELETE', 'DESCRIBE', 'EXPLAIN', 'SHOW', 'USE', 'DROP', 'ALTER', 'MAP', 'SET', 'RESET', 'START', 'COMMIT', 'ROLLBACK', 'REDUCE', 'REFRESH', 'CLEAR', 'CACHE', 'UNCACHE', 'DFS', 'TRUNCATE', 'ANALYZE', 'LIST', 'REVOKE', 'GRANT', 'LOCK', 'UNLOCK', 'MSCK', 'EXPORT', 'IMPORT', 'LOAD'}(line 2, pos 0)\n\n== SQL ==\n\nPIVOT(\n^^^\n    COUNT(precinct_code)\n    FOR party_code IN (\n    [114],\n    [135]\n    )\n    )\n"

In [44]:
precincts = pd.read_csv('ref_table_precinct_locations.csv')
mapping = pd.read_csv('ref_table_precinct_locations_PSGC.csv')
pres = pd.read_csv('results_president.csv', nrows=100000)
vicepres = pd.read_csv('results_vice-president.csv', nrows=100000)

In [45]:
precincts

Unnamed: 0,precinct_code,region,province,municipality,barangay,registered_voters,ballots_cast
0,55170026,REGION I,PANGASINAN,CALASIAO,BUENLAG,592,503
1,55170027,REGION I,PANGASINAN,CALASIAO,BUENLAG,526,458
2,55170023,REGION I,PANGASINAN,CALASIAO,BUENLAG,723,649
3,55170022,REGION I,PANGASINAN,CALASIAO,BUENLAG,768,650
4,55170025,REGION I,PANGASINAN,CALASIAO,BUENLAG,787,662
...,...,...,...,...,...,...,...
90637,72080005,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,DILAND,327,284
90638,72080007,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,HEAD TIPAN,252,235
90639,72080018,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,SANTO TOMAS,628,529
90640,72080017,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,SANTO TOMAS,532,474


In [46]:
mapping

Unnamed: 0.1,Unnamed: 0,precinct_code,region,province,municipality,barangay,registered_voters,ballots_cast,PSGC_CM,is_city,income_class,population,land_area,province_CM,municipality_CM
0,0,55170026,REGION I,PANGASINAN,CALASIAO,BUENLAG,592,503,15517000,0.0,1st,91109.0,4836.0,PANGASINAN,CALASIAO
1,1,55170027,REGION I,PANGASINAN,CALASIAO,BUENLAG,526,458,15517000,0.0,1st,91109.0,4836.0,PANGASINAN,CALASIAO
2,2,55170023,REGION I,PANGASINAN,CALASIAO,BUENLAG,723,649,15517000,0.0,1st,91109.0,4836.0,PANGASINAN,CALASIAO
3,3,55170022,REGION I,PANGASINAN,CALASIAO,BUENLAG,768,650,15517000,0.0,1st,91109.0,4836.0,PANGASINAN,CALASIAO
4,4,55170025,REGION I,PANGASINAN,CALASIAO,BUENLAG,787,662,15517000,0.0,1st,91109.0,4836.0,PANGASINAN,CALASIAO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90637,90637,72080005,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,DILAND,327,284,97208000,0.0,5th,11975.0,7358.0,ZAMBOANGA DEL NORTE,MUTIA
90638,90638,72080007,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,HEAD TIPAN,252,235,97208000,0.0,5th,11975.0,7358.0,ZAMBOANGA DEL NORTE,MUTIA
90639,90639,72080018,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,SANTO TOMAS,628,529,97208000,0.0,5th,11975.0,7358.0,ZAMBOANGA DEL NORTE,MUTIA
90640,90640,72080017,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,SANTO TOMAS,532,474,97208000,0.0,5th,11975.0,7358.0,ZAMBOANGA DEL NORTE,MUTIA
