Datasets:
*	ref_table_precinct_locations_PSGC.csv – lookup table for precincts
*	results_president.csv – precinct-level election results for the 2016 presidential race
*	results_vice-president.csv – precinct-level election results for the 2016 vice presidential race

Tasks:
1.	Create a denormalized table replacing precinct_code in the results_*.csv files with the columns: region, province, municipality, and barangay. 

2.	Create an interesting data visualization using this dataset.

Tools:
-	Code should be written in a Jupyter Notebook
-	For data transformation you can use Pandas
-	For visualization use matplotlib or seaborn
-	Put your code in a Github repository

Extra:
-	Bonus points if you use PySpark for data transformation
-	Bonus points if you use Tableau for visualization
 
Good luck and enjoy. 😊


In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import udf
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
sc = SparkContext()
spark = SparkSession(sc)

In [53]:
president = spark.read.csv('results_president.csv', header=True)
vicepres = spark.read.csv('results_vice-president.csv', header=True)
mappings = spark.read.csv('ref_table_precinct_locations.csv', header=True)
psgc = spark.read.csv('ref_table_precinct_locations_PSGC.csv', header=True)

In [50]:
mapper = (mappings.join(psgc.select('precinct_code',
                           'is_city',
                           'income_class', 
                           'population',
                           'land_area', 
                           'municipality_CM'),
              on=mappings.precinct_code==psgc.precinct_code,
              how='left')
              .drop(psgc.precinct_code)
)

In [57]:
votes_president = (president.select('precinct_code',
                  'candidate_name', 
                  president.votes.astype('int'))
        .groupby('precinct_code')
        .pivot('candidate_name')
        .max('votes')
        .join(mapper, 
              on=president.precinct_code == mapper.precinct_code, 
              how='left')
        .drop(mapper.precinct_code)
        .toPandas()
)

In [75]:
votes_vice = (vicepres.select('precinct_code',
                  'candidate_name', 
                  vicepres.votes.astype('int'))
        .groupby('precinct_code')
        .pivot('candidate_name')
        .max('votes')
        .join(mapper, 
              on=vicepres.precinct_code == mapper.precinct_code, 
              how='left')
        .drop(mapper.precinct_code)
        .withColumn('vote_showup_rate', votes_vice['ballots_cast'] / votes_vice['registered_voters'])
)

In [78]:
votes_vice.limit(100).toPandas()

Unnamed: 0,precinct_code,"CAYETANO, ALAN PETER (IND)","ESCUDERO, CHIZ (IND)","HONASAN, GRINGO (UNA)","MARCOS, BONGBONG (IND)","ROBREDO, LENI DAANG MATUWID (LP)","TRILLANES, ANTONIO IV (IND)",region,province,municipality,barangay,registered_voters,ballots_cast,is_city,income_class,population,land_area,municipality_CM,vote_showup_rate
0,10050007,67,54,9,129,210,12,REGION IV-A,BATANGAS,BATANGAS CITY,BARANGAY 5 (POB.),602,491,1.0,1st Class-Special,,28296.0,BATANGAS CITY,0.815615
1,10050017,46,59,4,101,194,9,REGION IV-A,BATANGAS,BATANGAS CITY,BARANGAY 12 (POB.),600,426,1.0,1st Class-Special,,28296.0,BATANGAS CITY,0.710000
2,10050070,50,69,13,73,260,8,REGION IV-A,BATANGAS,BATANGAS CITY,BILOGO,553,491,1.0,1st Class-Special,,28296.0,BATANGAS CITY,0.887884
3,10080008,29,73,16,103,192,6,REGION IV-A,BATANGAS,CALATAGAN,BARANGAY 4 (POB.),528,436,0.0,2nd,51997.0,11200.0,CALATAGAN,0.825758
4,10110019,41,103,19,105,271,22,REGION IV-A,BATANGAS,LAUREL,GULOD,660,612,0.0,3rd,35674.0,7129.0,LAUREL,0.927273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,22280032,74,17,7,81,254,12,REGION VII,CEBU,MADRIDEJOS,TALANGNAN,646,559,0.0,4th,34905.0,2395.0,MADRIDEJOS,0.865325
96,22510042,190,56,14,33,216,18,REGION VII,CEBU,TOLEDO CITY,CAMBANG-UG,772,610,1.0,3rd Class,,21628.0,TOLEDO CITY,0.790155
97,22510051,209,52,7,40,245,15,REGION VII,CEBU,TOLEDO CITY,CANTABACO,790,628,1.0,3rd Class,,21628.0,TOLEDO CITY,0.794937
98,22510131,197,29,3,81,161,7,REGION VII,CEBU,TOLEDO CITY,SANGI,692,526,1.0,3rd Class,,21628.0,TOLEDO CITY,0.760116


In [4]:
mappings.createOrReplaceTempView('mapping')
president.createOrReplaceTempView('pres')

In [7]:
query = '''
SELECT *
FROM pres p
JOIN mapping m
WHERE p.precinct_code = m.precinct_code
LIMIT 100'''

In [10]:
spark.sql(query).toPandas()

Unnamed: 0,precinct_code,contest_code,candidate_name,party_code,votes,col5,ballots_cast,col7,col8,timestamp,pct_votes,precinct_code.1,region,province,municipality,barangay,registered_voters,ballots_cast.1
0,69140005,199009,"DUTERTE, RODY (PDPLBN)",114,124,3,639,13,8,05/09/2016 17:05:50,0.194053208138,69140005,REGION III,TARLAC,SAN MANUEL,LANAT,736,639
1,69140005,199009,"DEFENSOR SANTIAGO, MIRIAM (PRP)",135,12,2,639,13,8,05/09/2016 17:05:50,0.018779342723,69140005,REGION III,TARLAC,SAN MANUEL,LANAT,736,639
2,69140005,199009,"BINAY, JOJO (UNA)",163,94,1,639,13,8,05/09/2016 17:05:50,0.14710485133,69140005,REGION III,TARLAC,SAN MANUEL,LANAT,736,639
3,69140005,199009,"ROXAS, MAR DAANG MATUWID (LP)",85,126,5,639,13,8,05/09/2016 17:05:50,0.197183098592,69140005,REGION III,TARLAC,SAN MANUEL,LANAT,736,639
4,69140005,199009,"SEÑERES, ROY (WPPPMM)",165,1,6,639,13,8,05/09/2016 17:05:50,0.00156494522692,69140005,REGION III,TARLAC,SAN MANUEL,LANAT,736,639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,56310001,199009,"POE, GRACE (IND)",58,152,4,531,23,9,05/09/2016 17:05:50,0.286252354049,56310001,REGION IV-A,QUEZON,PANUKULAN,SAN JUAN (POB.),658,531
96,13110022,199009,"DUTERTE, RODY (PDPLBN)",114,172,3,503,28,21,05/09/2016 17:06:08,0.341948310139,13110022,REGION X,BUKIDNON,LIBONA,MAAMBONG,564,503
97,13110022,199009,"DEFENSOR SANTIAGO, MIRIAM (PRP)",135,2,2,503,28,21,05/09/2016 17:06:08,0.00397614314115,13110022,REGION X,BUKIDNON,LIBONA,MAAMBONG,564,503
98,13110022,199009,"BINAY, JOJO (UNA)",163,14,1,503,28,21,05/09/2016 17:06:08,0.0278330019881,13110022,REGION X,BUKIDNON,LIBONA,MAAMBONG,564,503


In [4]:
pd.read_csv('ref_table_precinct_locations.csv', nrows=100)

Unnamed: 0,precinct_code,region,province,municipality,barangay,registered_voters,ballots_cast
0,55170026,REGION I,PANGASINAN,CALASIAO,BUENLAG,592,503
1,55170027,REGION I,PANGASINAN,CALASIAO,BUENLAG,526,458
2,55170023,REGION I,PANGASINAN,CALASIAO,BUENLAG,723,649
3,55170022,REGION I,PANGASINAN,CALASIAO,BUENLAG,768,650
4,55170025,REGION I,PANGASINAN,CALASIAO,BUENLAG,787,662
...,...,...,...,...,...,...,...
95,55280024,REGION I,PANGASINAN,MAPANDAN,NILOMBOT,765,637
96,55280022,REGION I,PANGASINAN,MAPANDAN,NILOMBOT,725,611
97,55280023,REGION I,PANGASINAN,MAPANDAN,NILOMBOT,728,619
98,55280032,REGION I,PANGASINAN,MAPANDAN,TORRES,796,667


In [10]:
mapping = pd.read_csv('ref_table_precinct_locations_PSGC.csv')

In [14]:
sample = pd.read_csv('results_president.csv', nrows=100000)

In [17]:
mapping

Unnamed: 0.1,Unnamed: 0,precinct_code,region,province,municipality,barangay,registered_voters,ballots_cast,PSGC_CM,is_city,income_class,population,land_area,province_CM,municipality_CM
0,0,55170026,REGION I,PANGASINAN,CALASIAO,BUENLAG,592,503,15517000,0.0,1st,91109.0,4836.0,PANGASINAN,CALASIAO
1,1,55170027,REGION I,PANGASINAN,CALASIAO,BUENLAG,526,458,15517000,0.0,1st,91109.0,4836.0,PANGASINAN,CALASIAO
2,2,55170023,REGION I,PANGASINAN,CALASIAO,BUENLAG,723,649,15517000,0.0,1st,91109.0,4836.0,PANGASINAN,CALASIAO
3,3,55170022,REGION I,PANGASINAN,CALASIAO,BUENLAG,768,650,15517000,0.0,1st,91109.0,4836.0,PANGASINAN,CALASIAO
4,4,55170025,REGION I,PANGASINAN,CALASIAO,BUENLAG,787,662,15517000,0.0,1st,91109.0,4836.0,PANGASINAN,CALASIAO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90637,90637,72080005,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,DILAND,327,284,97208000,0.0,5th,11975.0,7358.0,ZAMBOANGA DEL NORTE,MUTIA
90638,90638,72080007,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,HEAD TIPAN,252,235,97208000,0.0,5th,11975.0,7358.0,ZAMBOANGA DEL NORTE,MUTIA
90639,90639,72080018,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,SANTO TOMAS,628,529,97208000,0.0,5th,11975.0,7358.0,ZAMBOANGA DEL NORTE,MUTIA
90640,90640,72080017,REGION IX,ZAMBOANGA DEL NORTE,MUTIA,SANTO TOMAS,532,474,97208000,0.0,5th,11975.0,7358.0,ZAMBOANGA DEL NORTE,MUTIA


In [24]:
sample = sample.merge(mapping[['precinct_code', 'region', 'province', 'municipality', 'barangay']], on='precinct_code', how='left')

In [25]:
sample.pivot_table('')

Unnamed: 0,precinct_code,contest_code,candidate_name,party_code,votes,col5,ballots_cast,col7,col8,timestamp,pct_votes,region,province,municipality,barangay
0,69140005,199009,"DUTERTE, RODY (PDPLBN)",114,124,3,639,13,8,05/09/2016 17:05:50,0.194053,REGION III,TARLAC,SAN MANUEL,LANAT
1,69140005,199009,"DEFENSOR SANTIAGO, MIRIAM (PRP)",135,12,2,639,13,8,05/09/2016 17:05:50,0.018779,REGION III,TARLAC,SAN MANUEL,LANAT
2,69140005,199009,"BINAY, JOJO (UNA)",163,94,1,639,13,8,05/09/2016 17:05:50,0.147105,REGION III,TARLAC,SAN MANUEL,LANAT
3,69140005,199009,"ROXAS, MAR DAANG MATUWID (LP)",85,126,5,639,13,8,05/09/2016 17:05:50,0.197183,REGION III,TARLAC,SAN MANUEL,LANAT
4,69140005,199009,"SEÑERES, ROY (WPPPMM)",165,1,6,639,13,8,05/09/2016 17:05:50,0.001565,REGION III,TARLAC,SAN MANUEL,LANAT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,55460098,199009,"BINAY, JOJO (UNA)",163,38,1,483,9,5,05/09/2016 18:34:35,0.078675,REGION I,PANGASINAN,URDANETA CITY,PINMALUDPOD
99996,55460098,199009,"ROXAS, MAR DAANG MATUWID (LP)",85,23,5,483,9,5,05/09/2016 18:34:35,0.047619,REGION I,PANGASINAN,URDANETA CITY,PINMALUDPOD
99997,55460098,199009,"SEÑERES, ROY (WPPPMM)",165,0,6,483,9,5,05/09/2016 18:34:35,0.000000,REGION I,PANGASINAN,URDANETA CITY,PINMALUDPOD
99998,55460098,199009,"POE, GRACE (IND)",58,347,4,483,9,5,05/09/2016 18:34:35,0.718427,REGION I,PANGASINAN,URDANETA CITY,PINMALUDPOD
