### Data aggregation analysis of campaign finance data in NYC

In this notebook, we will analyze and aggregate raw campaign finance data using the [`.groupby()` method](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html) in pandas.

We will:
- import dependencies
- open and explore the data
- aggregate the data in different ways

In [1]:
import pandas as pd

In [2]:
# Identifier/code columns are read as strings so pandas does not have to guess
# their type (presumably the cause of the warning raised on load — confirm
# against the full warning text).
dtypes = {
    "RECIPID": "str",
    "INTZIP": "str",
    "INT_C_CODE": "str",
}

# The dtype mapping was previously defined but never handed to read_csv.
# DATE is parsed via `parse_dates` because read_csv does not accept
# "datetime64" as a `dtype` value.
campaign_data = pd.read_csv(
    "../data/CFB_20210814143105853.csv",
    dtype=dtypes,
    parse_dates=["DATE"],
)

# Quick sanity check: row count and the available column names.
print(
    len(campaign_data),
    campaign_data.columns
)

  exec(code_obj, self.user_global_ns, self.user_ns)


419849 Index(['ELECTION', 'OFFICECD', 'RECIPID', 'CANCLASS', 'RECIPNAME', 'COMMITTEE',
       'FILING', 'SCHEDULE', 'PAGENO', 'SEQUENCENO', 'REFNO', 'DATE',
       'REFUNDDATE', 'NAME', 'C_CODE', 'STRNO', 'STRNAME', 'APARTMENT',
       'BOROUGHCD', 'CITY', 'STATE', 'ZIP', 'OCCUPATION', 'EMPNAME',
       'EMPSTRNO', 'EMPSTRNAME', 'EMPCITY', 'EMPSTATE', 'AMNT', 'MATCHAMNT',
       'PREVAMNT', 'PAY_METHOD', 'INTERMNO', 'INTERMNAME', 'INTSTRNO',
       'INTSTRNM', 'INTAPTNO', 'INTCITY', 'INTST', 'INTZIP', 'INTEMPNAME',
       'INTEMPSTNO', 'INTEMPSTNM', 'INTEMPCITY', 'INTEMPST', 'INTOCCUPA',
       'PURPOSECD', 'EXEMPTCD', 'ADJTYPECD', 'RR_IND', 'SEG_IND',
       'INT_C_CODE'],
      dtype='object')


In [3]:
# Preview the first five rows of the raw data.
campaign_data.head(n=5)

Unnamed: 0,ELECTION,OFFICECD,RECIPID,CANCLASS,RECIPNAME,COMMITTEE,FILING,SCHEDULE,PAGENO,SEQUENCENO,...,INTEMPSTNM,INTEMPCITY,INTEMPST,INTOCCUPA,PURPOSECD,EXEMPTCD,ADJTYPECD,RR_IND,SEG_IND,INT_C_CODE
0,2021,1,2563,P,"Tirschwell, Sara A",H,7,ABC,,,...,,,,,,,,N,N,
1,2021,55,2345,P,"Camarena, Rodrigo",H,6,ABC,,,...,,,,,,,,N,N,
2,2021,55,2414,P,"Low, Jenny L",H,6,ABC,,,...,,,,,,,,N,N,
3,2021,55,283,P,"Gennaro, James F",M,7,ABC,,,...,,,,,,,,N,N,
4,2021,55,2454,P,"Boghosian Murphy, Leslie",H,6,ABC,,,...,,,,,,,,N,N,


In [4]:
# Show only the columns used in the aggregations below, for every row.
campaign_data.loc[
    :,
    ["RECIPNAME", "OCCUPATION", "ELECTION", "AMNT", "INTST"],
]

Unnamed: 0,RECIPNAME,OCCUPATION,ELECTION,AMNT,INTST
0,"Tirschwell, Sara A",Turnaround Manager,2021,2000.0,
1,"Camarena, Rodrigo",Chief Program Officer,2021,38.0,
2,"Low, Jenny L",Human Resources,2021,25.0,
3,"Gennaro, James F",Speech Therapist,2021,175.0,
4,"Boghosian Murphy, Leslie",Retired,2021,25.0,
...,...,...,...,...,...
419844,"Wiley, Maya D",Consultant,2021,25.0,
419845,"Johnson, Corey D",Accounting,2021,250.0,NY
419846,"Sliwa, Curtis",,2021,25.0,
419847,"Adams, Eric L",Owner,2021,5000.0,


### Aggregate data in different ways 

- recipient names with the largest number of donations
- recipient names with the largest number of donations and the highest total donation amounts
- occupations that occur most often in the data, by city

In [5]:
# Recipient names ranked by number of donations.
# count() tallies the non-null RECIPID values in each RECIPNAME group, so the
# resulting RECIPID column holds a row count rather than an identifier.
(
    campaign_data
    .groupby("RECIPNAME", as_index=False)["RECIPID"]
    .count()
    .sort_values("RECIPID", ascending=False)
)

Unnamed: 0,RECIPNAME,RECIPID
514,"Yang, Andrew",38679
499,"Wiley, Maya D",36389
314,"Morales, Dianne",19305
155,"Garcia, Kathryn A",15240
456,"Stringer, Scott M",14296
...,...,...
331,"O'Hagan, Elizabeth R",1
407,"Sahi, Mandeep S",1
290,"Maynard, Joshua Y",1
277,"Marin, Danny",1


In [6]:
# Per recipient name: number of donations (RECIPID) and total amount (AMNT),
# ordered from the most donations to the fewest.
(
    campaign_data
    .groupby("RECIPNAME")
    .agg(
        RECIPID=("RECIPID", "count"),
        AMNT=("AMNT", "sum"),
    )
    .sort_values("RECIPID", ascending=False)
)

Unnamed: 0_level_0,RECIPID,AMNT
RECIPNAME,Unnamed: 1_level_1,Unnamed: 2_level_1
"Yang, Andrew",38679,4069130.66
"Wiley, Maya D",36389,2188843.09
"Morales, Dianne",19305,880476.91
"Garcia, Kathryn A",15240,2157021.12
"Stringer, Scott M",14296,2368647.36
...,...,...
"O'Hagan, Elizabeth R",1,237.15
"Sahi, Mandeep S",1,500.00
"Maynard, Joshua Y",1,100.00
"Marin, Danny",1,40.00


In [7]:
# How often each (occupation, city) pair appears, most frequent first.
# The RECIPID column of the result is the number of rows in each pair.
(
    campaign_data
    .groupby(["OCCUPATION", "CITY"])[["RECIPID"]]
    .count()
    .sort_values("RECIPID", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,RECIPID
OCCUPATION,CITY,Unnamed: 2_level_1
Not Employed,New York,14269
Not Employed,Brooklyn,8778
Unemployed,New York,7925
Unemployed,Brooklyn,6436
Retired,Brooklyn,5713
...,...,...
Not Employed,Petersburg,1
Not Employed,Peyton,1
Not Employed,Pfafftown,1
Contractor,Glen Head,1
