In [1]:
import pandas as pd
import numpy as np
import nltk
import pyspark
import matplotlib.pyplot as plt
import os.path as osp
import os
import shutil

# Select matplotlib's interactive "notebook" backend for figures in this session.
# NOTE(review): no plotting call is visible in this part of the notebook -
# confirm the magic (and the matplotlib import) is still needed.
%matplotlib notebook

In [2]:
# Create (or reuse) the Hive-enabled Spark session used by every query below.
# NOTE: despite the conventional meaning of the name `sc`, this object is a
# SparkSession (not a SparkContext); the name is kept because later cells
# reference it.
#
# `SparkSession.builder` is the documented entry point for the builder
# pattern; the chain is wrapped in parentheses so that no line depends on a
# trailing backslash.
sc = (
    pyspark.sql.SparkSession.builder
    .master('yarn')
    .appName('open-secrets-etl')
    # Executor sizing: 2 executors, 3g of memory and 2 cores each.
    .config('spark.executor.instances', '2')
    .config('spark.executor.memory', '3g')
    .config('spark.executor.cores', '2')
    # Hive support is enabled so the session can query metastore tables.
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# List the tables visible to the current session.
(
    sc
    .sql('SHOW TABLES')
    .show()
)

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|        pac_spending|      false|
| default|pac_spending_by_type|      false|
| default|      pq_crp_cands18|      false|
| default|      pq_crp_cmtes18|      false|
| default|     pq_crp_indivs18|      false|
| default|  pq_crp_pac_other18|      false|
| default|       pq_crp_pacs18|      false|
+--------+--------------------+-----------+



In [5]:
# Preview up to ten rows of the pac_spending_by_type table.
(
    sc
    .sql('SELECT * FROM pac_spending_by_type')
    .limit(10)
    .show()
)

+---------+--------------------+----------+------------+-----------------+------------+---------+
|total_amt|            pac_name|direct_ind|contrib_type|        cand_name|candidate_id|   pac_id|
+---------+--------------------+----------+------------+-----------------+------------+---------+
|6196944.0|Congressional Lea...|         I|         24A|   Jon Ossoff (D)|   N00040675|C00504530|
|6015567.0|National Republic...|         I|         24A|   Jon Ossoff (D)|   N00040675|C00075820|
|4971326.0|Democratic Congre...|         I|         24A| Karen Handel (R)|   N00035477|C00000935|
|4459673.0|Senate Leadership...|         I|         24A|    Roy Moore (R)|   N00041027|C00571703|
|3697210.0|   Great America PAC|         I|         24E| Donald Trump (R)|   N00023864|C00608489|
|3633469.0|Committee to Defe...|         I|         24E| Donald Trump (R)|   N00023864|C00544767|
|3607502.0|  Duty & Country PAC|         I|         24A| Evan Jenkins (R)|   N00035531|C00666388|
|3333570.0|Democrati

In [6]:
# Preview up to ten rows of the individual-contributions table.
(
    sc
    .sql('SELECT * FROM pq_crp_indivs18')
    .limit(10)
    .show()
)

+-----+-------------------+------------+------------------+---------+--------------------+------+--------+----------+------+------+---------------+-----+-----+---------+----+---------+---------+------+------------------+------------+--------------------+------+
|cycle|         fectransid|   contribid|           contrib|  recipid|             orgname|ultorg|realcode|      date|amount|street|           city|state|  zip|recipcode|type|   cmteid|  otherid|gender|         microfilm|  occupation|            employer|source|
+-----+-------------------+------------+------------------+---------+--------------------+------+--------+----------+------+------+---------------+-----+-----+---------+----+---------+---------+------+------------------+------------+--------------------+------+
| 2018|2103020171461527012|j1002215457 |   SCHULTZ, ANDREW|N00027522|  2 Building Lbj Ltd|      |   Y4000|2017-07-14|  2000|      |        ROWLETT|   TX|75030|       RI| 15 |C00494229|         |     M|2017102002003

In [14]:
# For each contribid, count the distinct `contrib` values recorded under it
# (exposed as `naliases`) and show the ten ids with the highest counts.
#
# ORDER BY is required for a correct "top ten": Spark SQL's SORT BY only
# orders rows within each partition, so SORT BY followed by LIMIT can return
# rows that are not the global maximum.
(
    sc
    .sql(
        'SELECT contribid, COUNT(DISTINCT(contrib)) AS naliases '
        'FROM pq_crp_indivs18 '
        'GROUP BY contribid '
        'ORDER BY naliases DESC'
    )
    .limit(10)
    .show()
)

+------------+--------+
|   contribid|naliases|
+------------+--------+
|p0004075947@|       9|
|U00000034401|       8|
|h1001190515 |       6|
|i3003641636 |       6|
|f90002440251|       6|
|n0001438182 |       5|
|h10012043821|       5|
|Y0000000455L|       5|
|h1001478607 |       5|
|m0001582924 |       5|
+------------+--------+



In [15]:
# Show up to ten contribution rows recorded under a single contribid.
(
    sc
    .sql("SELECT * FROM pq_crp_indivs18 WHERE contribid='p0004075947@'")
    .limit(10)
    .show()
)

+-----+-------------------+------------+--------------------+---------+-------+------+--------+----------+------+------+-----------------+-----+-----+---------+----+---------+-------+------+------------------+----------+--------+------+
|cycle|         fectransid|   contribid|             contrib|  recipid|orgname|ultorg|realcode|      date|amount|street|             city|state|  zip|recipcode|type|   cmteid|otherid|gender|         microfilm|occupation|employer|source|
+-----+-------------------+------------+--------------------+---------+-------+------+--------+----------+------+------+-----------------+-----+-----+---------+----+---------+-------+------+------------------+----------+--------+------+
| 2018|4022220181514484332|p0004075947@|    BRINKMAN, AUDREY|N00033842|Retired|      |   J1100|2017-10-23|   105|      |THIEF RIVER FALLS|   MN|56701|       RI| 15 |C00505776|       |     F|201801309090940377|   RETIRED|    NONE| Gen  |
| 2018|4022220181514484334|p0004075947@|    BRINKMAN

In [8]:
# Sum of `amount` for every (zip, cmteid) pair in pq_crp_indivs18, largest
# totals first.
#
# ORDER BY is used instead of SORT BY: Spark SQL's SORT BY only orders rows
# within each partition, so it does not give a globally descending result.
zipCmte = sc.sql(
    'SELECT SUM(amount) AS total_amount, zip, cmteid '
    'FROM pq_crp_indivs18 '
    'GROUP BY zip, cmteid '
    'ORDER BY total_amount DESC'
)

In [9]:
# Expose the zipCmte DataFrame to SQL under the name `zip_cmte_amt`.
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; re-running the cell simply replaces the view.
zipCmte.createOrReplaceTempView('zip_cmte_amt')

In [11]:
# Preview up to ten rows of the zip_cmte_amt temporary table.
(
    sc
    .sql('SELECT * FROM zip_cmte_amt')
    .limit(10)
    .show()
)

+------------+-----+---------+
|total_amount|  zip|   cmteid|
+------------+-----+---------+
|      734900|65085|C00075820|
|      534257|94583|C00035006|
|      450073|90274|C00003418|
|      215112|20005|C00562777|
|      193100|46536|C00075820|
|      169077|07043|C00640003|
|      167424|77056|C00429662|
|      156437|94103|C00401224|
|      123750|53217|C00647164|
|      122000|10023|C00489799|
+------------+-----+---------+



In [12]:
# Preview up to ten rows of the pq_crp_pac_other18 table.
(
    sc
    .sql('SELECT * FROM pq_crp_pac_other18')
    .limit(10)
    .show()
)

+-----+-------------------+---------+--------------------+--------------------+----------+-----+-----+--------------------+--------+----------+--------+---------+-----+---------+---------+-------------+-----+------+---+------------------+----+--------+------+
|cycle|           fecrecno|  filerid|           donorcmte|    contriblendtrans|      city|state|  zip|           fecoccemp|primcode|      date|  amount|  recipid|party|  otherid|recipcode|recipprimcode|amend|report| pg|         microfilm|type|realcode|source|
+-----+-------------------+---------+--------------------+--------------------+----------+-----+-----+--------------------+--------+----------+--------+---------+-----+---------+---------+-------------+-----+------+---+------------------+----+--------+------+
| 2018|1010320180036112556|C00637983|   Nardolillo, Bobby|ROBERT A NARDOLLI...|    GREENE|   RI|02827|NARDOLILLO FUNERA...|   Z1100|2017-03-03|  2000.0|N00040819|    R|S8RI00110|       RC|        Z1100|    A|   Q2 |  P|2