# Data Exploration and QA for the first 15 columns of transaction_detail_first_batch.csv

In [4]:
import pandas as pd
import numpy as np

## Notes about columns:

* Address
    * It would be interesting to create a feature for in-state vs. out-of-state donors (OR = Oregon).
    *    
    
* Address Book Type
    * 8 distinct types:
        * Business Entity
        * Candidate's Immediate Family
        * Individual
        * Labor Organization
        * Other
        * Political Committee
        * Political Party Committee
        * Unregistered Committee
    
* Agent
    * 6 distinct values among the 32 non-NaN entries
    * Seems to be the broker in charge of the transaction.
    
    
* Aggregate
    * Possibly a running total to date? (TODO: confirm)
    
* Amount
* Associations
* Check
* CoSigner Obligations
* Description
* Due Date
* Employer Name
* Exam Letter Date
* Filed Date
* In-Kind/Independent Expenditures
    * Mostly `, ` values
    
* Interest Rate

In [5]:
# Load the first batch of transaction detail records.
# low_memory=False parses the file in one pass rather than in chunks,
# so column dtypes are inferred from the whole column.
transaction_detail_df = pd.read_csv(
    "../transaction_detail_first_batch.csv",
    low_memory=False,
)

In [9]:
# Show one sample record (index label 1) as "column : value" pairs.
sample_row = transaction_detail_df.loc[1]
for name, value in zip(transaction_detail_df.columns, sample_row):
    print(name, " : ", value)

Address  :  PO Box 523 Salem OR 97308
Address Book Type  :  Business Entity
Agent  :  nan
Aggregate  :  $648.00
Amount  :  $53.00
Associations  :  nan
Check  :  nan
CoSigner Obligations  :  nan
Description  :  nan
Due Date  :  12/30/2010 11:59:00 PM
Employer Name  :  nan
Exam Letter Date  :  12/06/2010
Filed Date  :  12/06/2010 01:16:12 PM
In-Kind/Independent Expenditures  :  , 
Interest Rate  :  nan
Name  :  Oregon Speech Language Hearing Association
Occupation  :  nan
Occupation Letter Date  :  nan
Payer of Personal Expenditure  :  nan
Payment Method  :  nan
Process Status  :  Complete
Purpose  :  nan
Repayment Schedule  :  nan
Transaction Date  :  11/30/2010
Transaction ID  :  968825
Transaction Sub Type  :  Cash Contribution
Transaction Type  :  Contribution


In [5]:
# Restrict the QA pass to the first 15 columns.
# The subset is an explicit copy, independent of the full frame.
columns = transaction_detail_df.columns[:15]
subset_df = transaction_detail_df[columns].copy()

In [18]:
# Summarise missing values per column: the absolute count ("num_NAN")
# and the fraction of rows that are missing ("perc_NAN", rounded to 3 places).

columns_info_dict = {}
for col in columns:
    series = subset_df[col]
    missing_mask = series.isna()
    num_na = len(series[missing_mask])
    fraction_na = np.round(num_na / len(series), 3)
    columns_info_dict[col] = {"perc_NAN": fraction_na, "num_NAN": num_na}

In [19]:
# Display the per-column missing-value summary (count and fraction of NaN).
columns_info_dict

{'Address': {'num_NAN': 19185, 'perc_NAN': 0.217},
 'Address Book Type': {'num_NAN': 18961, 'perc_NAN': 0.215},
 'Agent': {'num_NAN': 88278, 'perc_NAN': 1.0},
 'Aggregate': {'num_NAN': 20481, 'perc_NAN': 0.232},
 'Amount': {'num_NAN': 0, 'perc_NAN': 0.0},
 'Associations': {'num_NAN': 87218, 'perc_NAN': 0.988},
 'Check': {'num_NAN': 70920, 'perc_NAN': 0.803},
 'CoSigner Obligations': {'num_NAN': 88310, 'perc_NAN': 1.0},
 'Description': {'num_NAN': 70227, 'perc_NAN': 0.795},
 'Due Date': {'num_NAN': 8, 'perc_NAN': 0.0},
 'Employer Name': {'num_NAN': 58346, 'perc_NAN': 0.661},
 'Exam Letter Date': {'num_NAN': 17754, 'perc_NAN': 0.201},
 'Filed Date': {'num_NAN': 0, 'perc_NAN': 0.0},
 'In-Kind/Independent Expenditures': {'num_NAN': 0, 'perc_NAN': 0.0},
 'Interest Rate': {'num_NAN': 88070, 'perc_NAN': 0.997}}

In [16]:
# Print dtypes and non-null counts for the 15-column subset.
subset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 15 columns):
Address                             69125 non-null object
Address Book Type                   69349 non-null object
Agent                               32 non-null object
Aggregate                           67829 non-null object
Amount                              88310 non-null object
Associations                        1092 non-null object
Check                               17390 non-null float64
CoSigner Obligations                0 non-null float64
Description                         18083 non-null object
Due Date                            88302 non-null object
Employer Name                       29964 non-null object
Exam Letter Date                    70556 non-null object
Filed Date                          88310 non-null object
In-Kind/Independent Expenditures    88310 non-null object
Interest Rate                       240 non-null object
dtypes: float64(2), object(13

In [53]:
# Emit the checked column names as a markdown-style bullet list.
for col in columns:
    bullet = "* " + col
    print(bullet)

* Address
* Address Book Type
* Agent
* Aggregate
* Amount
* Associations
* Check
* CoSigner Obligations
* Description
* Due Date
* Employer Name
* Exam Letter Date
* Filed Date
* In-Kind/Independent Expenditures
* Interest Rate


### Print unique values for a specific column:

In [17]:
col_check = 'In-Kind/Independent Expenditures'
# Drop missing entries first, then collect the sorted distinct values.
non_null_values = subset_df[col_check].dropna()
uniques = np.unique(non_null_values)
# One distinct value per line, followed by how many there are.
for value in uniques:
    print(value)
print(len(uniques))

, 
In-Kind Expenditure - 10267 - $2.76, , 
In-Kind Expenditure - Annual Sessions Campaign - $1,200.00, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $1,163.80, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $1,168.48, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $1,181.79, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $1,217.64, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $1,260.45, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $1,295.89, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $1,323.50, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $1,414.36, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $1,422.96, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $193.79, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $234.82, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $3,846.43, , 
In-Kind Expenditure - Ben Unger for Oregon (15238) - $352.71, , 
In-Kind Expenditure - Ben Ung

# Other columns:

* Transaction Detail:
    * Some have commas in the string.

In [20]:
def colum_name_gen(df=None):
    """
    Yield the column names of a DataFrame one at a time.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame whose columns are iterated. When omitted, the notebook-level
        ``subset_df`` is used, which matches the original zero-argument call.

    Yields
    ------
    Each label in ``df.columns``, in order.
    """
    # The global default is resolved lazily, on the first ``next()`` call,
    # so the generator can be created before ``subset_df`` is (re)built.
    frame = subset_df if df is None else df
    for x in frame.columns:
        yield x

In [64]:
# Create a fresh column-name generator; each later next(g) advances to the next column.
g = colum_name_gen()

In [78]:
# Advance to the next column and show its non-null values.
# Re-running this cell moves on to the following column each time.
col = next(g)
subset_df[col].dropna()

1          $648.00
2          $250.00
3          $250.00
4          $200.00
5          $200.00
6          $250.00
7          $400.00
8          $200.00
9          $200.00
10         $200.00
11         $200.00
12         $200.00
13         $200.00
14         $200.00
15         $200.00
16         $200.00
17         $200.00
18         $200.00
19         $595.00
20         $415.00
21         $250.00
22         $210.00
27       $1,000.00
28         $500.00
29         $500.00
30       $1,000.00
31         $500.00
32         $200.00
34         $150.00
35         $150.00
           ...    
88227    $5,208.28
88228    $4,199.00
88232    $3,100.00
88234    $1,771.00
88235    $1,000.00
88239      $300.19
88240      $200.00
88244      $250.00
88245      $250.00
88250    $1,064.00
88251      $350.00
88254      $200.00
88260      $500.00
88261      $429.00
88262      $112.00
88267      $250.00
88273      $200.00
88276    $1,000.00
88283      $901.00
88285      $139.99
88286      $300.19
88291      $

In [74]:
# List the distinct non-null values of the current column as bullets.
# str.format is used instead of '* ' + x so that non-string columns
# (e.g. the float64 "Check" column) do not raise TypeError on concatenation.
for x in np.unique(subset_df[col].dropna()):
    print('* {}'.format(x))

* Adams & Co.
* Astley Consulting Group
* Media Connections West, Inc.
* Media Plus
* Thomas Public Affairs
* Ugf


In [88]:
# Rows where Amount and Aggregate differ (a NaN Aggregate also counts as different).
amount_differs = subset_df["Amount"] != subset_df["Aggregate"]
subset_df.loc[amount_differs, ["Amount", "Aggregate"]]

Unnamed: 0,Amount,Aggregate
0,$200.00,
1,$53.00,$648.00
19,$180.00,$595.00
20,$205.00,$415.00
23,$30.00,
24,$90.00,
25,$20.00,
26,$275.00,
30,$500.00,"$1,000.00"
32,$100.00,$200.00


In [91]:
# Rows with Amount == "$6,000.00" whose Aggregate differs from Amount.
# Both masks are built on the full frame and combined with & in one .loc call.
# Chaining a full-length boolean mask onto an already-filtered frame makes
# pandas reindex the mask and emit a UserWarning.
is_six_thousand = transaction_detail_df["Amount"] == "$6,000.00"
differs_from_aggregate = transaction_detail_df["Amount"] != transaction_detail_df["Aggregate"]
transaction_detail_df.loc[
    differs_from_aggregate & is_six_thousand,
    ["Amount", "Aggregate", "Transaction ID"],
]

  """Entry point for launching an IPython kernel.


Unnamed: 0,Amount,Aggregate,Transaction ID
57,"$6,000.00","$24,991.61",2254782
208,"$6,000.00",,2569680
751,"$6,000.00","$66,585.64",2468384
974,"$6,000.00","$16,866.21",1581158
1054,"$6,000.00","$10,125.00",1560566
7447,"$6,000.00","$9,000.00",489006
12572,"$6,000.00","$17,135.10",1836084
12600,"$6,000.00","$11,135.10",1833836
13285,"$6,000.00","$8,500.00",1251485
14711,"$6,000.00","$35,282.26",2403735


In [82]:
# Print every column name of the full frame, one per line.
for column_name in transaction_detail_df.columns:
    print(column_name)

Address
Address Book Type
Agent
Aggregate
Amount
Associations
Check
CoSigner Obligations
Description
Due Date
Employer Name
Exam Letter Date
Filed Date
In-Kind/Independent Expenditures
Interest Rate
Name
Occupation
Occupation Letter Date
Payer of Personal Expenditure
Payment Method
Process Status
Purpose
Repayment Schedule
Transaction Date
Transaction ID
Transaction Sub Type
Transaction Type


In [84]:
# Inspect the Transaction ID column. The output shows some entries with a
# trailing ", " (e.g. "50189, "), so the values are not all clean integers.
transaction_detail_df["Transaction ID"]

0        1454151
1         968825
2         934239
3         934242
4         934247
5         934251
6         934253
7         934256
8         934258
9         934260
10        934265
11        934268
12        934272
13        934276
14        934277
15        934279
16        934284
17        934288
18        808143
19        808144
20        796741
21        744560
22        719263
23        719239
24        719238
25        629364
26         59276
27       2115268
28       2078859
29       1948856
          ...   
88280      38907
88281      38906
88282      38905
88283      28731
88284      38144
88285    50189, 
88286    50182, 
88287      36615
88288      36614
88289      36612
88290      36613
88291      57606
88292      57607
88293      36029
88294      25006
88295      35650
88296      24456
88297      35206
88298      23843
88299      34250
88300      23041
88301      32891
88302      16125
88303      16118
88304      16137
88305      26967
88306    29612, 
88307      161