# Aggregated dataset

We group the incidents that happened in the same congressional district of each state, i.e. by the label [state, congressional district] (and, optionally, also by year and month).

We aggregate the numerical features of the dataset df_incidents within each group:
- the mean of 'avg_age_participants'
- the sum of each count feature in the column range 'n_participants_child' to 'n_participants'

We also add the 'povertyPercentage' feature of df_poverty for each state, year.

Finally, given the year of the incident, we add the 'party', 'candidatevotes', 'totalvotes' features from the dataset df_elections for each [state, congressional district].

In [None]:
# Build the aggregated dataset: one row per (state, congressional district,
# year_month) group, holding the number of incidents recorded in that group.
df_incidents_agg = (
    df_incidents
    .groupby(['state', 'congressional_district', 'year_month'])
    .size()
    .rename('n_incidents')
    .reset_index()
)
df_incidents_agg.head()

Unnamed: 0,state,congressional_district,year_month,n_incidents
0,alabama,1.0,14-01,7
1,alabama,1.0,14-02,10
2,alabama,1.0,14-03,7
3,alabama,1.0,14-04,8
4,alabama,1.0,14-05,13


In [None]:
# aggregated dataset creation
# n. of incidents per district and month, plus the per-group sum / mean of the
# numerical features
group_keys = ['state', 'congressional_district', 'year_month']
sum_columns = [
    'n_participants',
    'n_killed',
    'n_injured',
    'n_unharmed',
    'n_participants_child',
    'n_participants_teen',
    'n_participants_adult',
]
mean_columns = ['avg_age_participants']

# Map every requested column to its aggregation, then keep only the columns
# that df_incidents actually has: selecting a missing column on the groupby
# aborts the whole cell with "KeyError: 'Column not found: ...'".
requested = {
    **{column: 'sum' for column in sum_columns},
    **{column: 'mean' for column in mean_columns},
}
aggregations = {
    column: func
    for column, func in requested.items()
    if column in df_incidents.columns
}
missing_columns = [column for column in requested if column not in aggregations]
if missing_columns:
    print(f'Skipping columns missing from df_incidents: {missing_columns}')

# Group once and reuse the same groupby for the group size and for every
# aggregation, instead of rebuilding it for each output column.
grouped = df_incidents.groupby(group_keys)
df_incidents_agg = grouped.size().to_frame('n_incidents')
if aggregations:
    # Both frames are indexed by the group keys, so the join aligns on them.
    df_incidents_agg = df_incidents_agg.join(grouped.agg(aggregations))
df_incidents_agg = df_incidents_agg.reset_index()
df_incidents_agg.head()


KeyError: 'Column not found: n_killed'

In [None]:

# year and month
# 'year_month' is sliced as a 'YY-MM' string: characters 0-1 are the two-digit
# year (offset by 2000 to get the full year), characters 3-4 are the month.
df_incidents_agg['year'] = df_incidents_agg['year_month'].str[0:2].astype('int64') + 2000
df_incidents_agg['month'] = df_incidents_agg['year_month'].str[3:5].astype('int64')

# District 0 is remapped to district 1.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column selection: the chained in-place form is deprecated in pandas and
# does not update the DataFrame when copy-on-write is enabled.
df_incidents_agg['congressional_district'] = df_incidents_agg['congressional_district'].replace(0, 1)
df_incidents_agg.head()


In [None]:

# Join with the elections data on (state, congressional district, year).
# The election keys are first normalised (integer year, lower-case state);
# the left join keeps every aggregated row, matched or not.
df_elections['year'] = df_elections['year'].astype('int64')
df_elections['state'] = df_elections['state'].str.lower()
df_incidents_agg = df_incidents_agg.merge(
    df_elections,
    how='left',
    on=['state', 'congressional_district', 'year'],
)


In [None]:

# Join with the poverty data on (state, year).
# The poverty keys are first normalised (lower-case state, integer year);
# the left join keeps every aggregated row, matched or not.
df_poverty['state'] = df_poverty['state'].str.lower()
df_poverty['year'] = df_poverty['year'].astype('int64')
df_incidents_agg = df_incidents_agg.merge(
    df_poverty,
    how='left',
    on=['state', 'year'],
)
df_incidents_agg


In [None]:

# fill the missing values of 'party', 'candidatevotes', 'totalvotes' with data from the previous year, same state and same congressional district
def fill_with_previous_year(row):
    """Return `row` with missing election data taken from the previous year.

    The lookup reads the module-level ``df_incidents_agg`` frame.

    Args:
        row: one row of ``df_incidents_agg`` (a Series holding at least
            'state', 'congressional_district', 'year', 'party',
            'candidatevotes' and 'totalvotes').

    Returns:
        ``row`` itself when 'party' is not null. Otherwise a copy in which
        'party', 'candidatevotes' and 'totalvotes' come from the first row of
        ``df_incidents_agg`` with the same state and district and
        ``year - 1``. If no such row exists and the district is 0, only
        'candidatevotes' and 'totalvotes' are filled, with the sums over all
        rows of the same state and previous year. If nothing matches, the
        copy is returned unfilled.
    """
    if not pd.isnull(row['party']):
        return row

    # Work on a copy: mutating the object handed in by DataFrame.apply is not
    # supported by pandas and can silently modify the caller's data.
    row = row.copy()
    previous_year = row['year'] - 1
    same_state_previous_year = (
        (df_incidents_agg['state'] == row['state'])
        & (df_incidents_agg['year'] == previous_year)
    )
    previous_row = df_incidents_agg[
        same_state_previous_year
        & (df_incidents_agg['congressional_district'] == row['congressional_district'])
    ]
    if not previous_row.empty:
        # fill the missing values with the values of the first matching row
        row['party'] = previous_row['party'].iloc[0]
        row['candidatevotes'] = previous_row['candidatevotes'].iloc[0]
        row['totalvotes'] = previous_row['totalvotes'].iloc[0]
    elif row['congressional_district'] == 0:
        # NOTE(review): district 0 is remapped to 1 earlier in the notebook,
        # so this branch looks unreachable on that data - confirm whether the
        # state-level fallback is still wanted.
        # fill with the sum of the values of the same state, previous year
        previous_rows = df_incidents_agg[same_state_previous_year]
        if not previous_rows.empty:
            row['candidatevotes'] = previous_rows['candidatevotes'].sum()
            row['totalvotes'] = previous_rows['totalvotes'].sum()
    return row
# Run the fill on every row of the aggregated frame (one call per row)
df_incidents_agg = df_incidents_agg.apply(
    fill_with_previous_year,
    axis='columns',
)
