In [1]:
import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, Column, Integer, String, Float
import datetime as dt
import pandas as pd
from config import db_pw

# First Glance: CSV-to-DataFrame

In [2]:
# Load the raw donor CSV, then discard every column that contains no data at all.
donor_csv_path = 'data/Donor_Data_wip.csv'
donor_df = pd.read_csv(donor_csv_path).dropna(axis='columns', how='all')

In [3]:
# Preview the first five rows to check the column names and value formats.
donor_df.head(n=5)

Unnamed: 0,cand_nm,contbr_st,contb_receipt_amt,contb_receipt_dt
0,"Rubio, Marco",20,175.0,15-Mar-16
1,"Rubio, Marco",30,25.0,16-Mar-16
2,"Rubio, Marco",AE,100.0,20-Feb-16
3,"Rubio, Marco",AE,200.0,10-Mar-16
4,"Rubio, Marco",AE,100.0,8-Mar-16


In [4]:
# Distinct raw values per column, computed once and reused for the counts below.
# Series.unique() does not drop NaN, so a missing value counts as one distinct
# entry (it appears as `nan` in the candidate list).
cands = list(donor_df['cand_nm'].unique())
state_values = donor_df['contbr_st'].unique()

records_num = len(donor_df.index)
cand_num = len(cands)
state_num = len(state_values)

print(f"""
    Number of Records: {records_num}
    Number of Candidates: {cand_num}
    Number of States: {state_num}
    Candidates: {cands}
    """)


    Number of Records: 1048563
    Number of Candidates: 8
    Number of States: 85
    Candidates: ['Rubio, Marco', 'Santorum, Richard J.', 'Perry, James R. (Rick)', 'Carson, Benjamin S.', "Cruz, Rafael Edward 'Ted'", nan, 'Paul, Rand', 'Clinton, Hillary Rodham']
    


### First Glance Summary
It appears that while we have over a million records, which is nice, there is some question as to the validity of the data and how it was sourced. Firstly, the count of 8 candidates includes a missing-value (`nan`) entry, so only 7 candidates are actually named, and only one of them ran on the Democratic ticket. Additionally, the 6 remaining Republican candidates do not reflect the entire field as it ran. It may be enough for estimation purposes, but the results of any analysis done with incomplete data such as this may be suspect.

Further, it appears that donors may have had to voluntarily self-identify their State without any reconciliation done on the part of the campaigns. These records indicate a total of 85 different state entries were collected, which is more than the combined list of all US States and Territories.

---


# CSV-to-DB

Running the code below requires a local Postgres installation and an active database (in this case, one named 'election16'). It will create a 'donations' table and write the DataFrame to the database. Be prepared to wait, as writing the DataFrame to SQL takes time.

In [5]:
# Base class from which the ORM models in this notebook inherit.
Base = declarative_base()

# Connection URL for the local Postgres database 'election16'. The password is
# imported from config.py as db_pw; it can also be typed in here directly.
# NOTE: db_pw is inserted into the URL verbatim, so a password containing
# URL-reserved characters (e.g. '@' or '/') must already be percent-encoded.
db_url = f'postgresql://postgres:{db_pw}@localhost:5432/election16'
engine = create_engine(db_url)

  """)


In [6]:
class Donations(Base):
    """ORM mapping for one row of the 'donations' table."""

    __tablename__ = 'donations'

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Candidate name and contributor state, both stored as text.
    cand_nm = Column(String(length=255))
    contbr_st = Column(String(length=255))
    # Presumably the contribution amount (CSV column 'contb_receipt_amt') — confirm.
    contbr_amt = Column(Float)
    # Presumably the contribution date (CSV column 'contb_receipt_dt'); kept as
    # text here rather than as a date type.
    contbr_dt = Column(String(length=255))

In [7]:
# Emit CREATE TABLE for the tables declared on Base (here: 'donations'), then
# open an ORM session bound to the same engine.
Base.metadata.create_all(bind=engine)
session = Session(bind=engine)

In [8]:
# Roll back whatever transaction is pending on the session.
# NOTE(review): no statement has been issued on `session` in the cells above, so
# this looks like a leftover from interactive debugging — confirm before removing.
session.rollback()

In [9]:
# Write the donor records into the 'donations' table declared by the Donations
# model.
#
# The CSV uses different column names than the model, so the frame is renamed
# and reduced to the model's columns before writing. `index_label` cannot do
# this: it only names the DataFrame index column and has no effect when
# index=False.
csv_to_model_columns = {
    'contb_receipt_amt': 'contbr_amt',
    'contb_receipt_dt': 'contbr_dt',
}
model_columns = ['cand_nm', 'contbr_st', 'contbr_amt', 'contbr_dt']
donations_df = donor_df.rename(columns=csv_to_model_columns)[model_columns]

# Recreate the table from the model so that re-running this cell reloads the
# data instead of appending duplicates. if_exists='replace' is avoided because
# it would rebuild the table from the DataFrame's columns and drop the `id`
# primary key defined on the model; `id` is left for the database to generate.
Donations.__table__.drop(engine, checkfirst=True)
Donations.__table__.create(engine)

# chunksize limits how many rows are written per batch for this large frame.
donations_df.to_sql('donations', engine, if_exists='append', index=False, chunksize=10000)