In [1]:
import pandas as pd
import numpy as np
import nltk
import pyspark
import matplotlib.pyplot as plt
import os.path as osp
import os
import shutil

%matplotlib notebook

In [2]:
CWD = '/home/matt/etl'
os.chdir(CWD)
!ls

open-secrets  open-secrets.ipynb


Move all the data into the "open-secrets" folder

In [3]:
# !mkdir -p open-secrets/CampaignFin18
# !mv CampaignFin18.zip open-secrets/CampaignFin18
os.chdir('open-secrets/CampaignFin18')

In [4]:
# !unzip CampaignFin18.zip

In [5]:
!cat CF_2018_Read_Me.txt

Record Count - 7/13/18

Candidates
----------
7237

Committees
----------
18100

PACs
----
191318

Pac_Other
---------
106795

Individual
----------
10677360


# `cands18`: data processing example

In [6]:
import pyspark
from pyspark.sql.types import StringType, IntegerType, StructType, StructField, FloatType, BooleanType, DateType

`cands18` schema string: https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20Candidates%20Data.htm

In [7]:
cands18SchemaStr = [
'''Cycle
Last year (even year) of the federal two year election cycle
Text (4)
FEC''',

'''FECCandID
Assigned by FEC and selected by CRP as the active, should multiples exist.
Text (9)
FEC''',

'''CID
Unique identifier for each candidate. Every candidate should have one and only one CID throughout all cycles. Recipid for candidates is based on CID.
Text (9)
CRP''',

'''FirstLastP
Candidate name in format of firstname lastname and party in parens, like Steve Kagen (D)
Text (50)
CRP''',

'''Party
The party of the candidate. "D" for Democratic, "R" for Republican", "I" for Independent, "L" for Libertarian", "3" for other third party and "U" for Unknown.
Text (1)
CRP''',

'''DistIDRunFor
Four character identifier of the office sought by the candidate. For congressional races, the first two characters are the state and the next two are the district for House candidates and "S1" or "S2" for Senate candidates. "PRES" indicates a presidential candidate.
Text (4)
CRP''',

'''DistIDCurr
Four character identifier of the office currently held (if any) by the candidate. For House members, the first two characters are the state and the next two are the district. For Senators the first two characters are the state and the last two characters are "S1" or "S2". "PRES" indicates a presidential candidate. For non-incumbents, this field is blank. If a member of Congress dies or leaves office, this field should become blank. This field is frozen on election day. For cycles prior to the current cycle, DistidCurr reflects office held on Election Day of the Cycle.
Text (4)
CRP''',

'''CurrCand
This field indicates whether the candidate is currently running for federal office - "Y" means yes, otherwise this field is blank. If a candidate loses a primary or drops out of the race, this field becomes blank. This field is frozen on Election Day, and thus for previous cycles can be used to show the candidate who ran in the general election.
Text (1)
CRP''',

'''CycleCand
This field indicates whether the candidate ever ran for federal office during the cycle in question. Like CurrCand, "Y" means yes and blank means no. This field should be "Y" for any candidate who filed to run for office or otherwise formally declared intention to run. This does NOT change if the candidate drops out or loses a primary. Be aware that we've tightened the definition in recent cycles - for older data, CycleCand is likely to cast a broader net. Also note that incumbents are usually assumed to be running for re-election and get a "Y" in CycleCand unless there is evidence to the contrary.
Text (1)
CRP''',

'''CRPICO
Identifies type of candidate - "I" is incumbent, "C" is challenger, "O" is open seat. This may be blank if the candidate is neither a member of Congress nor running this cycle. Note this is based on the office sought. A House incumbent running for the Senate would have a CRPICO of "C" or "O", not "I."
Text (1)
CRP''',

'''RecipCode
A two-character code defining the type of candidate. The first character is party ("D" for Democratic, "R" for Republican, "3" for Independent or third party, "U" for Unknown.) The second character is "W" for Winner, "L" for Loser, "I" for incumbent, "C" for Challenger, "O" for "Open Seat", and "N" for Non-incumbent. Incumbent, Challenger and Open Seat are based on CRPICO. "N" is reserved for candidates that are neither in office nor running during the cycle in question. This lives in dbo_CandsCRP.
Text (2)
CRP''',

'''NoPacs
Indicates whether candidate has publicly committed to forego contributions from PACs
Text (1)
CRP'''
]

Define helper class `Schema`

In [8]:
import time
import datetime

In [9]:
def now():
    return datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f')

In [27]:
class Schema(object):
    
    @classmethod
    def __init__(self, schema_str):
        self.column_names = None
        self._initialized = False
        self._column_dt_formats = {}
        
        self._schema_str = schema_str
        self.process_schema()
    
    @classmethod
    def _check_init(self):
        if not self._initialized:
            raise Exception('Schema not initialized')
    
    @staticmethod
    def spark_dtype(col_info):
        dtype = col_info
        if hasattr(col_info, '__get__'):
            dtype = col_info['dataType']

        if Schema._is_text_datatype(dtype):
            return StringType()
        elif Schema._is_integer_datatype(dtype):
            return IntegerType()
        elif Schema._is_float_datatype(dtype):
            return FloatType()
        elif Schema._is_bool_datatype(dtype):
            return BooleanType()
        elif Schema._is_datetime_datatype(dtype):
            return DateType()
        else:
            raise Exception(f'Unknown datatype: {dtype}')
    
    @staticmethod
    def pandas_dtype(col_info):
        dtype = col_info
        if hasattr(col_info, '__get__'):
            dtype = col_info['dataType']

        if Schema._is_text_datatype(dtype):
            return 'str'
        elif Schema._is_integer_datatype(dtype):
            return 'int'
        elif Schema._is_float_datatype(dtype):
            return 'float'
        elif Schema._is_bool_datatype(dtype):
            return 'bool'
        elif Schema._is_datetime_datatype(dtype):
            return 'str'  # Pandas doesn't care about datetimes in the schema string, you have to tell it about them separately.
        else:
            raise Exception(f'Unknown datatype: {dtype}')
            
    @staticmethod
    def hive_dtype(col_info):
        dtype = col_info
        if hasattr(col_info, '__get__'):
            dtype = col_info['dataType']

        if Schema._is_text_datatype(dtype):
            return 'STRING'
        elif Schema._is_integer_datatype(dtype):
            return 'INTEGER'
        elif Schema._is_float_datatype(dtype):
            return 'DOUBLE'
        elif Schema._is_bool_datatype(dtype):
            return 'BOOLEAN'
        elif Schema._is_datetime_datatype(dtype):
            return 'DATE'
        else:
            raise Exception(f'Unknown datatype: {dtype}')
    
    @classmethod
    def process_schema(self):
        if self._initialized:
            raise Exception('Schema already initialized')
        
        self.parsed_schema = [{
            'name': self.add_name(name),
            'description': description,
            'dataType': dataType,
            'source': source,
            'sparkDtype': Schema.spark_dtype(dataType),
            'pandasDtype': Schema.pandas_dtype(dataType),
            'hiveDtype': Schema.hive_dtype(dataType)
        } for (name, description, dataType, source) in map(lambda c: c.split('\n'), self._schema_str)]
        
        self._initialized = True
    
    @classmethod
    def add_name(self, col_name):
        if self.column_names is None:
            self.column_names = [col_name]
        else:
            self.column_names.append(col_name)
        return col_name
    
    @classmethod
    def register_column_datetime_formats(self, formats):
        if formats:
            for col, fmt in formats.items():
                self._column_dt_formats[col] = fmt
        
    @classmethod
    def get_pandas_schema(self):
        self._check_init()
        return { col['name']: col['pandasDtype'] for col in self.parsed_schema }
    
    @classmethod
    def get_spark_dtypes(self):
        self._check_init()
        return StructType([ StructField(col['name'], col['sparkDtype'], True) for col in self.parsed_schema ])
    
    @classmethod
    def get_dt_colnames(self):
        self._check_init()
        return list(filter(lambda c: Schema._is_datetime_datatype(c['dataType']), self.parsed_schema))
    
    @classmethod
    def hive_ddl(self, tableName, externalTableLocation, external=True):
        self._check_init()
        columnData = []
        for c in self.parsed_schema:
            columnStr = "`{}` {} COMMENT '{}, orig-datatype {}, source {}'".format(c['name'],
                                                                                 c['hiveDtype'],
                                                                                 c['description'].replace("'", "").replace('"', ''),
                                                                                 c['dataType'].replace("'", "").replace('"', ''),
                                                                                 c['source'].replace("'", "").replace('"', ''))
            columnData.append(columnStr)
        
        ddl = """
DROP TABLE {tableName};
        
CREATE{isExternal} TABLE {tableName}
(
    {columnPart}
)
COMMENT 'Auto-generated schema at {now} from {externalTableLocation}'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   "separatorChar" = ",",
   "quoteChar"     = "|"
)
STORED AS TEXTFILE
LOCATION '{externalTableLocation}';

""".format(isExternal=' EXTERNAL' if external else '',
           tableName=tableName,
           columnPart=',\n    '.join(columnData),
           now=now(),
           externalTableLocation=externalTableLocation)
        return ddl
    
    @classmethod
    def hive_ddl_parquet(self, tableName, originalTableName):
        """
        Create a new table by selecting data out of an existing (external) table and inserting
        into a parquet table managed by Hive.
        """
        self._check_init()
        selectData = []
        alterTableStmtsList = []
        for c in self.parsed_schema:
            dtype = c['hiveDtype']
            if dtype == 'STRING':
                columnStr = "`{name}` AS `{name}`".format(name=c['name'])
            elif dtype == 'INTEGER':
                columnStr = "CAST(`{name}` AS {dtype}) AS `{name}`".format(name=c['name'], dtype=dtype)
            elif dtype == 'DOUBLE':
                columnStr = "CAST(`{name}` AS {dtype}) AS `{name}`".format(name=c['name'], dtype=dtype)
            elif dtype == 'BOOLEAN':
                columnStr = "CAST(`{name}` AS {dtype}) AS `{name}`".format(name=c['name'], dtype=dtype)
            elif dtype == 'DATE':
                columnStr = "TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(`{name}`, '{dtConvStr}'))) AS `{name}`".format(name=c['name'],
                                                                                            dtConvStr=self._column_dt_formats[c['name']])
            
            comment = c['description'].replace("'", "").replace('"', '')
            alterTableStmt = "ALTER TABLE {tableName} CHANGE `{name}` `{name}` {dtype} COMMENT '{comment}';".format(tableName=tableName,
                                                                                                                    name=c['name'],
                                                                                                                    dtype=dtype,
                                                                                                                    comment=comment)
            
            selectData.append(columnStr)
            alterTableStmtsList.append(alterTableStmt)
            
        ddl = """
DROP TABLE {tableName};
        
CREATE TABLE {tableName} 
COMMENT 'Auto-generated schema at {now} from {originalTableName}' 
STORED AS PARQUET 
AS SELECT
    {selectStatement}
FROM {originalTableName};

{alterTableStmts}
""".format(tableName=tableName,
           originalTableName=originalTableName,
           selectStatement=',\n    '.join(selectData),
           now=now(),
           alterTableStmts='\n'.join(alterTableStmtsList)
           )
        return ddl
    
    
    @staticmethod
    def _is_text_datatype(dtype):
        return dtype.lower().startswith('text')
    
    @staticmethod
    def _is_integer_datatype(dtype):
        return dtype.lower().startswith('integer')

    @staticmethod
    def _is_float_datatype(dtype):
        if 'precision' in dtype.lower():
            return True
        elif 'float' in dtype.lower():
            return True
        return False

    @staticmethod
    def _is_bool_datatype(dtype):
        return dtype.lower().startswith('bit')

    @staticmethod
    def _is_datetime_datatype(dtype):
        if 'mm/dd/yyyy' in dtype.lower():
            return True
        elif 'date' in dtype.lower():
            return True
        return False

In [11]:
cands18Schema = Schema(cands18SchemaStr)
cands18 = pd.read_csv('cands18.txt', quotechar='|', delimiter=',', 
                      parse_dates=cands18Schema.get_dt_colnames(), infer_datetime_format=True,
                      header=None, dtype=cands18Schema.get_pandas_schema(), names=cands18Schema.column_names)

cands18.head(3)

Unnamed: 0,Cycle,FECCandID,CID,FirstLastP,Party,DistIDRunFor,DistIDCurr,CurrCand,CycleCand,CRPICO,RecipCode,NoPacs
0,2018,H6FL08221,N00028542,D J Mauro (I),I,FL08,,,,,3N,
1,2018,H6NC09200,N00035491,Mark Harris (R),R,NC09,,,,O,RO,
2,2018,H4NE01163,N00035619,Dennis Crawford (D),D,NE01,,,Y,C,DL,


Put it in Hive

In [64]:
!hdfs dfs -mkdir -p /staging/cands18
!hdfs dfs -put cands18.txt /staging/cands18/cands18.txt

2018-10-07 14:38:38,961 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2018-10-07 14:38:40,886 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [68]:
with open('cands18.sql', 'w') as f:
    f.write(cands18Schema.hive_ddl(tableName='cands18', externalTableLocation='/staging/cands18/'))

In [69]:
!/opt/hive/bin/beeline -u jdbc:hive2://cap1:10000 -n matt -p anything -f cands18.sql

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Connecting to jdbc:hive2://cap1:10000
Connected to: Apache Hive (version 3.1.0)
Driver: Hive JDBC (version 3.1.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
0: jdbc:hive2://cap1:10000> 
0: jdbc:hive2://cap1:10000> DROP TABLE cands18;
INFO  : Compiling command(queryId=hadoop_20181007144738_80b45164-a632-45c3-a780-a2451c8038d1): DROP TABLE cands18
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Returning Hive schema: Schema(fieldSchemas:null, 

> 

Verify that we can read the table

In [11]:
sc = pyspark.sql.SparkSession.Builder()\
        .master('yarn')\
        .appName('open-secrets-etl')\
        .enableHiveSupport()\
        .getOrCreate()

Write back to Hive warehouse as parquet format:

In [71]:
df = sc.sql('SELECT * FROM cands18')

In [73]:
df.write.saveAsTable(name='pqcands18', format='parquet', mode='overwrite')

In [75]:
!hdfs dfs -ls /user/hive/warehouse

2018-10-07 14:51:35,017 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 1 items
drwxr-xr-x   - matt supergroup          0 2018-10-07 14:51 /user/hive/warehouse/pqcands18


How much faster is it than reading CSVs from Hive?

In [76]:
%timeit sc.sql('SELECT COUNT(*) FROM cands18')

16.7 ms ± 2.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [77]:
%timeit sc.sql('SELECT COUNT(*) FROM pqcands18')

10.7 ms ± 705 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


About 50% speedup! Now we just need to clean up

In [80]:
sc.sql('DESCRIBE TABLE pqcands18').collect()

[Row(col_name='cycle', data_type='string', comment='from deserializer'),
 Row(col_name='feccandid', data_type='string', comment='from deserializer'),
 Row(col_name='cid', data_type='string', comment='from deserializer'),
 Row(col_name='firstlastp', data_type='string', comment='from deserializer'),
 Row(col_name='party', data_type='string', comment='from deserializer'),
 Row(col_name='distidrunfor', data_type='string', comment='from deserializer'),
 Row(col_name='distidcurr', data_type='string', comment='from deserializer'),
 Row(col_name='currcand', data_type='string', comment='from deserializer'),
 Row(col_name='cyclecand', data_type='string', comment='from deserializer'),
 Row(col_name='crpico', data_type='string', comment='from deserializer'),
 Row(col_name='recipcode', data_type='string', comment='from deserializer'),
 Row(col_name='nopacs', data_type='string', comment='from deserializer')]

Well, we lost our comments, but they are still with us in our hearts. And in the `.sql` file. Internet doesn't show any easy ways of preserving them so we won't worry about this, although we could add them back with an `ALTER TABLE` statment or two. Now for clean up:

In [81]:
sc.sql('DROP TABLE cands18')
!hdfs dfs -rm -r -f /staging/cands18
#!rm -f cands18.txt

2018-10-07 14:57:15,745 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Deleted /staging/cands18


Verify that the table is deleted and that Hive no longer tracks metadata

In [82]:
!/opt/hive/bin/beeline -u jdbc:hive2://cap1:10000 -n matt -p anything -e 'show tables;'

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Connecting to jdbc:hive2://cap1:10000
Connected to: Apache Hive (version 3.1.0)
Driver: Hive JDBC (version 3.1.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
INFO  : Compiling command(queryId=hadoop_20181007145744_c18ef251-59dd-4ce9-89a9-801e42ce216b): show tables
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:tab_name, type:string, comment:from deserializer)], properties:null)
INF

Now we need to turn this into a repeatable, hands-off process

# Standardized hands-off ETL function

In [12]:
def etl_text_file(text_file_path, schema_str, table_name, data_source_name, column_datetime_formats=None):
    schema = Schema(schema_str)
    schema.register_column_datetime_formats(column_datetime_formats)
    textFileName = osp.split(text_file_path)[1]
    stagingFolder = f'/staging/{table_name}'
    !hdfs dfs -mkdir -p {stagingFolder}
    !hdfs dfs -put {text_file_path} {stagingFolder}/{textFileName}
    with open(f'{table_name}.sql', 'w') as f:
        f.write(schema.hive_ddl(tableName=table_name, externalTableLocation=stagingFolder))
    !/opt/hive/bin/beeline -u jdbc:hive2://cap1:10000 -n matt -p anything -f {table_name}.sql
    
    # Convert to the proper schema and write as parquet
    finalTableName = f'pq_{data_source_name}_{table_name}'
    with open(f'{finalTableName}.sql', 'w') as f:
        f.write(schema.hive_ddl_parquet(tableName=finalTableName, originalTableName=table_name))
    !/opt/hive/bin/beeline -u jdbc:hive2://cap1:10000 -n matt -p anything -f {finalTableName}.sql
    
    # clean up -- drop the intermediate (text file) table and delete staged data from HDFS
    sc.sql(f'DROP TABLE {table_name}')
    !hdfs dfs -rm -r -f {stagingFolder}
    
    print('done')

# `cands18`

In [149]:
etl_text_file('cands18.txt', cands18SchemaStr, 'cands18', 'crp')

2018-10-07 18:29:32,942 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2018-10-07 18:29:34,932 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Connecting to jdbc:hive2://cap1:10000
Connected to: Apache Hive (version 3.1.0)
Driver: Hive JDBC (version 3.1.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
0: jdbc:hive2://cap1:10000> 
0: jdbc:hive2://cap1:10000> DRO

In [13]:
sc.sql('SELECT * FROM pq_crp_cands18').limit(10).show()

+-----+---------+---------+--------------------+-----+------------+----------+--------+---------+------+---------+------+
|cycle|feccandid|      cid|          firstlastp|party|distidrunfor|distidcurr|currcand|cyclecand|crpico|recipcode|nopacs|
+-----+---------+---------+--------------------+-----+------------+----------+--------+---------+------+---------+------+
| 2018|H6FL08221|N00028542|       D J Mauro (I)|    I|        FL08|          |        |         |      |       3N|      |
| 2018|H6NC09200|N00035491|     Mark Harris (R)|    R|        NC09|          |        |         |     O|       RO|      |
| 2018|H4NE01163|N00035619| Dennis Crawford (D)|    D|        NE01|          |        |        Y|     C|       DL|      |
| 2018|H0IN01127|N00041294| Arturas Kerelis (D)|    D|        IN01|          |        |         |      |       DN|      |
| 2018|H8AL02171|N00041295|     Barry Moore (R)|    R|        AL02|          |        |        Y|     C|       RL|      |
| 2018|H8AL04078|N000412

In [14]:
sc.sql('DESCRIBE TABLE pq_crp_cands18').show()

+------------+---------+--------------------+
|    col_name|data_type|             comment|
+------------+---------+--------------------+
|       cycle|   string|Last year (even y...|
|   feccandid|   string|Assigned by FEC a...|
|         cid|   string|Unique identifier...|
|  firstlastp|   string|Candidate name in...|
|       party|   string|The party of the ...|
|distidrunfor|   string|Four character id...|
|  distidcurr|   string|Four character id...|
|    currcand|   string|This field indica...|
|   cyclecand|   string|This field indica...|
|      crpico|   string|Identifies type o...|
|   recipcode|   string|A two-character c...|
|      nopacs|   string|Indicates whether...|
+------------+---------+--------------------+



# `cmtes18`

`cmtes18` schema string: https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20Candidates%20Data.htm

In [20]:
cmtes18SchemaStr = [
'''Cycle
Last year (even year) of the federal 2-year election cycle
Text (4)
FEC''',

'''CmteID
Unique ID given by FEC to each committee.
Text (9)
FEC''',

'''PACShort
Standardized committee name based on PAC's sponsor.
Text (50)
CRP''',

'''Affiliate
Usually blank. For leadpacs, shows the sponsoring member.
Text (50)
CRP''',

'''Ultorg
The standardized parent organization for the organization listed in the PACShort field. If there is no parent identified, this field will be equal to PACShort.
Text (50)
CRP''',

'''RecipID
For candidate committees this will be the candidate's CID. Otherwise, it will be the same as CmteID.
Text (9)
CRP''',

'''RecipCode
A two-character code defining the type of recipient. For candidates, the first character is party ("D" for Democratic, "R" for Republican, "3" for Independent, Libertarian or third party, "U" for Unknown.) The second character is "W" for Winner, "L" for Loser, "I" for incumbent, "C" for Challenger, "O" for "Open Seat", and "N" for Non-incumbent. "N" is reserved for candidates that are neither in office nor running during the cycle in question. For party committees, the first character is party and the second character is "P." For PACs, the first character is "P" and for outside spending groups, "O". For both, the second character is "B" for Business, "L" for Labor", "I" for Ideological, "O" for "Other" and "U" for unknown.
Text (2)
CRP''',

'''FECCandID
Unique ID given to candidates by FEC.
Text (9)
FEC''',

'''Party
(D,R,3,I,L) Will be null or empty if committee is not a party, joint fundraising, leadership or candidate committee.
Text (1)
CRP/FEC''',

'''PrimCode
The standard five character code identifying the committee's industry or ideology.
Text (5)
CRP''',

'''Source
Indicates how the PrimCode was determined.
Text (5)
CRP''',

'''Sensitive
If "Y", the committee has significant business in multiple industries, some of which fall under the jurisdiction of specific congressional committees.
Text (1)
CRP''',

'''Foreign
This is a bit field. Off/False indicate that the company is not owned by a foreign entity. Those that are owned by a foreign entity are on/True, sometimes "-1".
Bit
CRP''',

'''Active
Determines if cmte is active in the cycle - 0 is no and 1 is yes
Integer
CRP'''
]

In [31]:
etl_text_file('cmtes18.txt', cmtes18SchemaStr, 'cmtes18', 'crp')

2018-10-07 18:44:14,608 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2018-10-07 18:44:16,543 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
put: `/staging/cmtes18/cmtes18.txt': File exists
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Connecting to jdbc:hive2://cap1:10000
Connected to: Apache Hive (version 3.1.0)
Driver: Hive JDBC (version 3.1.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
0: jdbc:hiv

In [15]:
sc.sql('SELECT * FROM pq_crp_cmtes18').limit(10).show()

+-----+---------+--------------------+--------------------+--------------------+---------+---------+---------+-----+--------+------+---------+-------+------+
|cycle|   cmteid|            pacshort|           affiliate|              ultorg|  recipid|recipcode|feccandid|party|primcode|source|sensitive|foreign|active|
+-----+---------+--------------------+--------------------+--------------------+---------+---------+---------+-----+--------+------+---------+-------+------+
| 2018|C00000018|                    |                    |                    |C00000018|         |H8TX22313|     |        |      |         |  false|     0|
| 2018|C00000059|      Hallmark Cards|                    |      Hallmark Cards|C00000059|       PB|         |     |   C1400| WebAM|        n|  false|     1|
| 2018|C00000422|American Medical ...|American Medical ...|American Medical ...|C00000422|       PB|         |     |   H1100| AFP88|        n|  false|     1|
| 2018|C00000489| Teamsters Local 886|     Teamsters

In [16]:
sc.sql('DESCRIBE TABLE pq_crp_cmtes18').show()

+---------+---------+--------------------+
| col_name|data_type|             comment|
+---------+---------+--------------------+
|    cycle|   string|Last year (even y...|
|   cmteid|   string|Unique ID given b...|
| pacshort|   string|Standardized comm...|
|affiliate|   string|Usually blank. Fo...|
|   ultorg|   string|The standardized ...|
|  recipid|   string|For candidate com...|
|recipcode|   string|A two-character c...|
|feccandid|   string|Unique ID given t...|
|    party|   string|(D,R,3,I,L) Will ...|
| primcode|   string|The standard five...|
|   source|   string|Indicates how the...|
|sensitive|   string|If Y, the committ...|
|  foreign|  boolean|This is a bit fie...|
|   active|      int|Determines if cmt...|
+---------+---------+--------------------+



# `indivs18`

`indivs18` schema string: https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20for%20Individual%20Contribution%20Data.htm

In [17]:
indivs18SchemaStr = [
'''Cycle
Last year (even year) of the federal 2-year election cycle
Text (4)
FEC''',

'''FECTransID
A unique record identifier within a given cycle.
Text (19)
FEC''',

'''ContribID
A unique identifier for individual donors.  Family groups match on first 11 chars
Text (12)
CRP''',

'''Contrib
The name of the contributor, usually in the format Last name, First Name.
Text (50)
FEC''',

'''RecipID
The recipient's id number. If the contribution is to a candidate this will be the candidate's unique candidate id number. Otherwise, it will be the FEC committee id number.
Text (9)
CRP''',

'''Orgname
The standardized organization name for the contributor. This is usually based on the donor's employer. The donor may not have an income producing occupation (e.g. homemaker)
Text (50)
CRP''',

'''UltOrg
The standardized parent organization for the organization listed in the Orgname field. If there is no parent identified, this field will be blank or null.
Text (50)
CRP''',

'''RealCode
The standard five character code identifying the donor's industry or ideology. Usually based on Orgname (e.g., the orgname "Microsoft Corp" would normally get realcode C5120 for computer software.)
Text (5)
CRP''',

'''Date
The reported date of the contribution.
MM/DD/YYYY
FEC''',

'''Amount
The amount contributed. This will be negative for refunds.
Integer
FEC''',

'''Street
2000+ cycle only, and only for committees that file electronically
Text (40)
FEC''',

'''City
The donor's city. This could be based on a home address or an employer's address. 
Text (30)
FEC''',

'''State
The donor's state. This could be based on a home address or an employer's address.
Text (2)
FEC''',

'''Zip
The donor's zip code. This could be based on a home address or an employer's address.
Text (5)
FEC''',

'''RecipCode
A two-character code defining the type of recipient. For candidates, the first character is party ("D" for Democratic, "R" for Republican, "3" for Independent, Libertarian or third party, "U" for Unknown.) The second character is "W" for Winner, "L" for Loser, "I" for incumbent, "C" for Challenger, "O" for "Open Seat", and "N" for Non-incumbent. "N" is reserved for candidates that are neither in office nor running during the cycle in question. For party committees, the first character is party and the second character is "P." For PACs, the first character is "P" and the second character is "B" for Business, "L" for Labor", "I" for Ideological, "O" for "Other" and "U" for unknown.
Text (2)
CRP''',

'''Type
The transaction type code for the contribution. 15 is a contribution, 15e is an earmarked contribution (made through a group such as Club for Growth or Emily's List), 15j is a contribution through a joint fund raising committee and 22y is a refund. "10" indicates "soft" or nonfederal money for the 2002 cycle and earlier. For the 2004 cycle and later type "10" indicates Levin funds.or outside spending.
Text (3)
FEC''',

'''CmteID
The committee id number for the recipient. Note that a candidate can have more than one committee — this field indicates the exact committee receiving the contribution.
Text (9)
FEC''',

'''OtherID
The committee id number for the intermediary party to earmarked contributions.
Text (9)
FEC''',

'''Gender
The donor's gender. Can also be "U" if unknown or "N" if the name is ambiguous.
Text (1)
CRP''',

'''Microfilm
Refers to specific page of FEC report images on which this transaction appears.
Text (11)
FEC''',

'''Occupation
The donor's disclosed occupation.
Text (38)
FEC''',

'''Employer
The donor's s disclosed employer
Text (38)
FEC''',

'''Source
Indicates how the RealCode was determined — see the How to Use Source in the OpenSecrets OpenData Guide
Text (5)
<NOT SPECIFIED>'''
]

In [28]:
etl_text_file('indivs18.txt', indivs18SchemaStr, 'indivs18', 'crp', column_datetime_formats={'Date': 'MM/dd/yyyy'})

2018-10-07 19:47:23,035 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2018-10-07 19:47:24,971 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Connecting to jdbc:hive2://cap1:10000
Connected to: Apache Hive (version 3.1.0)
Driver: Hive JDBC (version 3.1.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
0: jdbc:hive2://cap1:10000> 
0: jdbc:hive2://cap1:10000> DRO

In [29]:
sc.sql('SELECT * FROM pq_crp_indivs18').limit(10).show()

+-----+-------------------+------------+-------------------+---------+-------+------+--------+----------+------+------+---------+-----+-----+---------+----+---------+---------+------+------------------+------------------+------------+------+
|cycle|         fectransid|   contribid|            contrib|  recipid|orgname|ultorg|realcode|      date|amount|street|     city|state|  zip|recipcode|type|   cmteid|  otherid|gender|         microfilm|        occupation|    employer|source|
+-----+-------------------+------------+-------------------+---------+-------+------+--------+----------+------+------+---------+-----+-----+---------+----+---------+---------+------+------------------+------------------+------------+------+
| 2018|4021520181504779816|p0003541768 |WUTHRICH, STEPHANIE|C00603084|       |      |   J1200|2017-11-08|     5|      |  ELKHART|   IN|46514|       PI| 15E|C00603084|C00401224|     F|201801319091098155|      NOT EMPLOYED|        NONE| P/PAC|
| 2018|4021520181504779818|p0003

In [30]:
sc.sql('DESCRIBE TABLE pq_crp_indivs18').show()

+----------+---------+--------------------+
|  col_name|data_type|             comment|
+----------+---------+--------------------+
|     cycle|   string|Last year (even y...|
|fectransid|   string|A unique record i...|
| contribid|   string|A unique identifi...|
|   contrib|   string|The name of the c...|
|   recipid|   string|The recipients id...|
|   orgname|   string|The standardized ...|
|    ultorg|   string|The standardized ...|
|  realcode|   string|The standard five...|
|      date|     date|The reported date...|
|    amount|      int|The amount contri...|
|    street|   string|2000+ cycle only,...|
|      city|   string|The donors city. ...|
|     state|   string|The donors state....|
|       zip|   string|The donors zip co...|
| recipcode|   string|A two-character c...|
|      type|   string|The transaction t...|
|    cmteid|   string|The committee id ...|
|   otherid|   string|The committee id ...|
|    gender|   string|The donors gender...|
| microfilm|   string|Refers to 

# `pac_other18`

`pac_other18` schema string: https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20PAC%20to%20PAC%20Data.htm

In [31]:
pac_other18SchemaStr = ['''Cycle
Last year (even year) of the federal 2-year election cycle
Text (4)
FEC''',

'''FECRecNo
A unique record identifier within a given cycle.
Text (19)
FEC''',

'''Filerid
The committee id number for the PAC making the filing. Refers to donor if Type 2* or recipient if Type=1*.
Text (9)
FEC''',

'''DonorCmte
The standardized name for the donor based on the name of the PAC's sponsor.
Text (50)
CRP''',

'''ContribLendTrans
Reported name of the donor if Type=1* or the recipient if Type=2*.
Text (50)
FEC''',

'''City
The donor's city. This could be based on a home address or an employer's address.
Text (30)
FEC''',

'''State
The donor's state. This could be based on a home address or an employer's address.
Text (2)
FEC''',

'''Zip
The donor's zip code. This could be based on a home address or an employer's address.
Text (5)
FEC''',

'''FECOccEmp
The donor's disclosed employer and/or occupation.
Text (38)
FEC''',

'''Primcode
The primary industry/ideological code for the donor PAC's sponsor.
Text (5)
CRP''',

'''Date
The reported date of the contribution.
Date
FEC''',

'''Amount
The amount contributed. This will be negative for refunds.
Float
FEC''',

'''RecipID
The recipient's id number. If the contribution is to a candidate this will be the candidate's unique candidate id number. Otherwise, it will be the FEC committee id number.
Text (9)
CRP''',

'''Party
The party (if any) of the recipient. "D" for Democratic, "R" for Republican", "I" for Independent, "L" for Libertarian", "3" for other third party and "U" for Unknown. This field will be blank or null for PACs other than leadership PACs.
Text (1)
CRP''',

'''Otherid
Committee id for donor if Type=1* or recipient if Type=2*.
Text (9)
FEC''',

'''RecipCode
A two character code defining the type of recipient. For candidates, the first character is party ("D" for Democratic, "R" for Republican, "3" for Independent, Libertarian or third party, "U" for Unknown.) The second character is "W" for Winner, "L" for Loser, "I" for incumbent, "C" for Challenger, "O" for "Open Seat", and "N" for Non-incumbent. "N" is reserved for candidates that are neither in office nor running during the cycle in question. For party committees, the first character is party and the second character is "P." For PACs, the first character is "P" and for outside spending groups, the first character is "O". For both, the second character is "B" for Business, "L" for Labor", "I" for Ideological, "O" for "Other" and "U" for unknown.
Text (2)
CRP''',

'''RecipPrimcode
The industry/ideological code for the recipient - codes beginning with Z1 are candidate committees, codes beginning with Z5 are party committees and codes beginning with J2 are leadership PACs.
Text (5)
CRP''',

'''Amend
Whether the record comes from an amended report
Text (1)
FEC''',

'''Report
The type of report - 1st quarter, year end, etc.
Text (3)
FEC''',

'''PG
Whether the contribution is for a Primary ("P") or General ("G") election.
Text (1)
FEC''',

'''Microfilm
The FEC microfilm record for the contribution
Text (11)
FEC''',

'''Type
The transaction type code for the contribution. 11 is a tribal contribution, 22Z is a contribution refund to a candidate or committee, 24G is a Transfer to an affiliated committee, 24K is a direct contribution, 24R is a election recount disbursement and 24Z is an in kind contribution
Text (3)
FEC''',

'''RealCode
The standard five character code identifying the donor's industry or ideology.Usually based on Primcode. Sometimes a PAC sponsor will have secondary interests which may replace the main realcode depending on recipient. For example, Boeing is primarily Air Transport but has Air Defense interests. Thus Boeing contributions to members of the Armed Services committee would have a realcode of Air Defense.
Text (5)
CRP''',

'''Source
Indicates how the Realcode was determined.
Text (5)
CRP'''
]

In [32]:
!head -n 10 pac_other18.txt

|2018|,|1010320180036112556|,|C00637983|,|Nardolillo, Bobby|,|ROBERT A NARDOLLILO III|,|GREENE|,|RI|,|02827|,|NARDOLILLO FUNERAL HOME|,|Z1100|,03/03/2017,2000.0,|N00040819|,|R|,|S8RI00110|,|RC|,|Z1100|,|A|,|Q2 |,|P|,|201707200200233820|,|16C|,|Z1100|,|PAC  |
|2018|,|1010320180036112568|,|C00637983|,|Bobby for Senate|,|LEADERSHIP CONNECTICUT PAC|,|TRUMBULL|,|CT|,|06611|,,|Z1100|,06/26/2017,280.0,|C00499863|,| |,|C00499863|,|PI|,|J1100|,|A|,|Q2 |,|P|,|201707200200233825|,|24G|,|Z1100|,|PAC  |
|2018|,|1010320180036112716|,|C00443218|,|Wicker Majority Fund|,|WICKER MAJORITY FUND|,|JACKSON|,|MS|,|39205|,,|Z4100|,09/30/2017,3323.0,|N00003280|,|R|,|C00646380|,|RI|,|Z1100|,|A|,|Q3 |,|G|,|201710200200357062|,|18G|,|Z4100|,|PAC  |
|2018|,|1010320180036112854|,|C00443218|,|2017 Senators Classic Cmte|,|2017 SENATORS CLASSIC COMMITTEE|,|ALEXANDRIA|,|VA|,|22314|,,|Z4100|,09/20/2017,9530.0,|N00003280|,|R|,|C00637181|,|RI|,|Z1100|,|A|,|Q3 |,|P|,|201710200200357063|,|18G|,|Z4100|,|PAC  |
|2018|,|101032

In [33]:
etl_text_file('pac_other18.txt', pac_other18SchemaStr, 'pac_other18', 'crp', column_datetime_formats={'Date': 'MM/dd/yyyy'})

2018-10-07 19:52:11,489 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2018-10-07 19:52:13,413 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Connecting to jdbc:hive2://cap1:10000
Connected to: Apache Hive (version 3.1.0)
Driver: Hive JDBC (version 3.1.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
0: jdbc:hive2://cap1:10000> 
0: jdbc:hive2://cap1:10000> DRO

In [34]:
sc.sql('SELECT * FROM pq_crp_pac_other18').limit(10).show()

+-----+-------------------+---------+--------------------+--------------------+----------+-----+-----+--------------------+--------+----------+--------+---------+-----+---------+---------+-------------+-----+------+---+------------------+----+--------+------+
|cycle|           fecrecno|  filerid|           donorcmte|    contriblendtrans|      city|state|  zip|           fecoccemp|primcode|      date|  amount|  recipid|party|  otherid|recipcode|recipprimcode|amend|report| pg|         microfilm|type|realcode|source|
+-----+-------------------+---------+--------------------+--------------------+----------+-----+-----+--------------------+--------+----------+--------+---------+-----+---------+---------+-------------+-----+------+---+------------------+----+--------+------+
| 2018|1010320180036112556|C00637983|   Nardolillo, Bobby|ROBERT A NARDOLLI...|    GREENE|   RI|02827|NARDOLILLO FUNERA...|   Z1100|2017-03-03|  2000.0|N00040819|    R|S8RI00110|       RC|        Z1100|    A|   Q2 |  P|2

In [35]:
sc.sql('DESCRIBE TABLE pq_crp_pac_other18').show()

+----------------+---------+--------------------+
|        col_name|data_type|             comment|
+----------------+---------+--------------------+
|           cycle|   string|Last year (even y...|
|        fecrecno|   string|A unique record i...|
|         filerid|   string|The committee id ...|
|       donorcmte|   string|The standardized ...|
|contriblendtrans|   string|Reported name of ...|
|            city|   string|The donors city. ...|
|           state|   string|The donors state....|
|             zip|   string|The donors zip co...|
|       fecoccemp|   string|The donors disclo...|
|        primcode|   string|The primary indus...|
|            date|     date|The reported date...|
|          amount|   double|The amount contri...|
|         recipid|   string|The recipients id...|
|           party|   string|The party (if any...|
|         otherid|   string|Committee id for ...|
|       recipcode|   string|A two character c...|
|   recipprimcode|   string|The industry/ideo...|


# `pacs18`

`pacs18` schema string: https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20for%20PAC%20to%20Cands%20Data.htm

In [36]:
pacs18SchemaStr = [
'''Cycle
Last year (even year) of the federal 2-year election cycle
Text (4)
FEC''',

'''FECRecNo
A unique record identifier within a given cycle.
Text (19)
FEC''',

'''PACID
The committee id number for the PAC making the contribution.
Text (9)
FEC''',

'''CID
A unique identifier for candidates that is constant throughout cycles.
Text (9)
CRP''',

'''Amount
The amount contributed. This will be negative for refunds.
float
FEC''',

'''Date
The reported date of the contribution.
Date
FEC''',

'''RealCode
The standard five character code identifying the donor's industry or ideology. Usually based on Primcode. Sometimes a PAC sponsor will have secondary interests which may replace the main realcode depending on recipient. For example, Boeing is primarily Air Transport but has Air Defense interests. Thus Boeing contributions to members of the Armed Services committee would have a realcode of Air Defense.
Text (5)
CRP''',

'''Type
The transaction type code for the contribution. 24A is an Independent Expenditure against the candidate, 24C is a coordinated expenditure, 24E is an independent expenditure for the candidate, 24F is a communication cost for the candidate, 24K is a direct contribution, 24N is a communication cost against the candidate and 24Z is an in kind contribution
Text (3)
FEC''',

'''DI
Whether the contribution is direct ("D") or indirect ("I."). Indirect contributions include independent expenditures and communications costs, are not subject to contribution limits and must be made completely independently of the candidate. Indirect contributions can also be against the candidate.
Text (1)
CRP''',

'''FECCandID
FECCandID of recipient candidate
Text (9)
FEC'''
]

In [37]:
!head -n 10 pacs18.txt

|2018|,|4031520181518958016|,|C00150797|,|N00035527|,1000,02/06/2018,|K1200|,|24K|,|D|,|H4AR04048|
|2018|,|4021520181504727622|,|C00343459|,|N00028152|,5000,12/27/2017,|H1130|,|24K|,|D|,|H6CA22125|
|2018|,|4062120181570992000|,|C00005249|,|N00035509|,3000,05/18/2018,|F3300|,|24K|,|D|,|H4IA03115|
|2018|,|4121120171484669562|,|C00449165|,|N00003689|,1000,11/08/2017,|M3100|,|24K|,|D|,|H8OH01043|
|2018|,|4101220171458221652|,|C00171843|,|N00002408|,5000,09/21/2017,|F3400|,|24K|,|D|,|H2SC02042|
|2018|,|4051820181565917536|,|C00035675|,|N00031933|,1000,04/23/2018,|H4300|,|24K|,|D|,|H0MA10082|
|2018|,|4072420171429498467|,|C00035451|,|N00034584|,2000,06/27/2017,|LT100|,|24K|,|D|,|S2ME00158|
|2018|,|4112020171465723593|,|C00033779|,|N00035346|,1000,10/24/2017,|F4100|,|24K|,|D|,|H4GA01039|
|2018|,|4042720181530493912|,|C00502906|,|N00033518|,1500,03/22/2018,|C6200|,|24K|,|D|,|H2GA09150|
|2018|,|4022220181514504696|,|C00542431|,|N00003105|,5000,01/08/2018,|J1100|,|24K|,|D|,|S8TN00337|


In [39]:
etl_text_file('pacs18.txt', pacs18SchemaStr, 'pacs18', 'crp', column_datetime_formats={'Date': 'MM/dd/yyyy'})

2018-10-07 20:09:30,942 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2018-10-07 20:09:32,857 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/hive/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Connecting to jdbc:hive2://cap1:10000
Connected to: Apache Hive (version 3.1.0)
Driver: Hive JDBC (version 3.1.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
0: jdbc:hive2://cap1:10000> 
0: jdbc:hive2://cap1:10000> DRO

In [41]:
sc.sql('SELECT * FROM pq_crp_pacs18').limit(10).show()

+-----+-------------------+---------+---------+------+----------+--------+----+---+---------+
|cycle|           fecrecno|    pacid|      cid|amount|      date|realcode|type| di|feccandid|
+-----+-------------------+---------+---------+------+----------+--------+----+---+---------+
| 2018|4031520181518958016|C00150797|N00035527|1000.0|2018-02-06|   K1200| 24K|  D|H4AR04048|
| 2018|4021520181504727622|C00343459|N00028152|5000.0|2017-12-27|   H1130| 24K|  D|H6CA22125|
| 2018|4062120181570992000|C00005249|N00035509|3000.0|2018-05-18|   F3300| 24K|  D|H4IA03115|
| 2018|4121120171484669562|C00449165|N00003689|1000.0|2017-11-08|   M3100| 24K|  D|H8OH01043|
| 2018|4101220171458221652|C00171843|N00002408|5000.0|2017-09-21|   F3400| 24K|  D|H2SC02042|
| 2018|4051820181565917536|C00035675|N00031933|1000.0|2018-04-23|   H4300| 24K|  D|H0MA10082|
| 2018|4072420171429498467|C00035451|N00034584|2000.0|2017-06-27|   LT100| 24K|  D|S2ME00158|
| 2018|4112020171465723593|C00033779|N00035346|1000.0|2017-1

In [43]:
sc.sql('DESCRIBE TABLE pq_crp_pacs18').show()

+---------+---------+--------------------+
| col_name|data_type|             comment|
+---------+---------+--------------------+
|    cycle|   string|Last year (even y...|
| fecrecno|   string|A unique record i...|
|    pacid|   string|The committee id ...|
|      cid|   string|A unique identifi...|
|   amount|   double|The amount contri...|
|     date|     date|The reported date...|
| realcode|   string|The standard five...|
|     type|   string|The transaction t...|
|       di|   string|Whether the contr...|
|feccandid|   string|FECCandID of reci...|
+---------+---------+--------------------+

