# Computing the median

This works for any per-person count, such as the number of words or the number of interventions. For example: what is the median number of interventions in 2016?

In [67]:
import pandas as pd
pd.options.display.max_columns = 100
import numpy as np

# Every table is keyed by person, so PersonIdCode is used as the index on load.

# Values per person with one column per year (the columns are used as years below).
int_year = pd.read_csv('new_data/int_year.csv', index_col='PersonIdCode')

# Loaded from n_int.csv; not referenced by the cells visible in this notebook section.
n_int = pd.read_csv('new_data/n_int.csv', index_col='PersonIdCode')

# Per-person attributes (FirstSession, FirstSessionDate, CouncilAbbreviation are read below).
ppl = pd.read_csv('data/people_jonas.csv', index_col='PersonIdCode')

When computing the median, we need to ignore people who aren't active yet.

For the DataFrames with **sessions** as columns:

In [2]:
def get_median(df, ppl, what):
    """Compute the median of every column of ``df`` over people already active.

    NOTE: a later cell defines another ``get_median`` (taking an extra
    ``council`` argument) under the same name; once that cell has run, this
    session-based version is shadowed.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per session; the index holds person id codes.
    ppl : pandas.DataFrame
        People table, looked up by person id code inside ``strip_sesh``.
    what : str
        Label stored under the ``'what'`` key of the returned dict.

    Returns
    -------
    dict
        ``{'what': what, <column>: <median>, ...}`` with one entry per column
        of ``df``.
    """
    median_dict = {'what': what}
    for col in df.columns.tolist():
        # strip_sesh may drop entries from the Series it is given. Passing a
        # copy keeps the caller's DataFrame intact so it can be reused.
        sesh = strip_sesh(col, df[col].copy(), ppl)
        median_dict[col] = sesh.median()
    return median_dict
        
def strip_sesh(col, sesh, ppl):
    """Return the entries of ``sesh`` for people already active in session ``col``.

    A person is kept when ``int(FirstSession) <= int(col)``. The input Series
    is left untouched; a new, filtered Series is returned.

    Parameters
    ----------
    col : str or int
        Session identifier of the column being processed (must be int-convertible).
    sesh : pandas.Series
        Values for one session, indexed by person id code.
    ppl : pandas.DataFrame
        People table indexed by person id code, with a ``FirstSession`` column.

    Returns
    -------
    pandas.Series
        The subset of ``sesh`` whose people had started by session ``col``,
        in the original order.

    Raises
    ------
    KeyError
        If an id code in ``sesh`` is missing from ``ppl``.
    """
    # Look up every person's first session in one go instead of popping
    # entries one at a time (which modified the caller's column in place).
    first_sessions = ppl.loc[sesh.index, 'FirstSession'].astype(int)
    active = first_sessions.to_numpy() <= int(col)
    return sesh[active]

For the DataFrames with **years** as columns:

In [68]:
def get_median(df, ppl, what, council):
    """Compute the median of every year column of ``df`` for one council.

    For each column, only people who belong to ``council`` and had already
    started by that year (as decided by ``strip_year``) contribute to the median.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per year; the index holds person id codes.
    ppl : pandas.DataFrame
        People table, looked up by person id code inside ``strip_year``.
    what : str
        Label stored under the ``'what'`` key of the returned dict.
    council : str
        Council abbreviation to keep, compared against ``CouncilAbbreviation``.

    Returns
    -------
    dict
        ``{'what': what, <column>: <median>, ...}`` with one entry per column
        of ``df``.
    """
    median_dict = {'what': what}
    for col in df.columns.tolist():
        # strip_year may drop entries from the Series it is given. Passing a
        # copy keeps the caller's DataFrame intact, so the same frame can be
        # reused for the next council without re-reading it from disk.
        sesh = strip_year(col, df[col].copy(), ppl, council)
        median_dict[col] = sesh.median()
    return median_dict
        
def strip_year(col, sesh, ppl, council):
    """Return the entries of ``sesh`` for members of ``council`` active by year ``col``.

    A person is kept when the first four characters of ``FirstSessionDate``,
    read as an integer year, are ``<= int(col)`` and ``CouncilAbbreviation``
    equals ``council``. The input Series is left untouched; a new, filtered
    Series is returned.

    Parameters
    ----------
    col : str or int
        Year of the column being processed (must be int-convertible).
    sesh : pandas.Series
        Values for one year, indexed by person id code.
    ppl : pandas.DataFrame
        People table indexed by person id code, with ``FirstSessionDate``
        (a string starting with the 4-digit year) and ``CouncilAbbreviation``.
    council : str
        Council abbreviation to keep.

    Returns
    -------
    pandas.Series
        The matching subset of ``sesh``, in the original order.

    Raises
    ------
    KeyError
        If an id code in ``sesh`` is missing from ``ppl``.
    """
    # Fetch the relevant people once instead of popping entries one at a
    # time (which modified the caller's column in place).
    people = ppl.loc[sesh.index]
    first_years = people['FirstSessionDate'].str[:4].astype(int)
    started = first_years.to_numpy() <= int(col)
    in_council = (people['CouncilAbbreviation'] == council).to_numpy()
    return sesh[started & in_council]

Weirdly, `int_year` had to be re-imported before computing each dict (most likely because stripping inactive people removed entries from its columns in place).

In [69]:
# All three dicts are needed further down to build the medians table, so
# compute every one of them here rather than relying on leftover kernel state.
# Each call works on its own copy of int_year: the stripping helper may remove
# entries from the columns it is given, and a shared frame would then carry
# one council's filtering into the next call.
median_dict_CE = get_median(int_year.copy(), ppl, 'CE_median_interventions', 'CE')
median_dict_CN = get_median(int_year.copy(), ppl, 'CN_median_interventions', 'CN')
median_dict_CF = get_median(int_year.copy(), ppl, 'CF_median_interventions', 'CF')

In [71]:
# Display the CF medians: one entry per column of int_year, plus the 'what' label.
median_dict_CF

{'2000': 22.0,
 '2001': 12.0,
 '2002': 31.0,
 '2003': 19.5,
 '2004': 24.0,
 '2005': 27.0,
 '2006': 33.0,
 '2007': 20.0,
 '2008': 26.0,
 '2009': 94.0,
 '2010': 96.0,
 '2011': 230.0,
 '2012': 210.0,
 '2013': 172.0,
 '2014': 208.0,
 '2015': 190.0,
 '2016': 243.0,
 'what': 'CF_median_interventions'}

Creating the medians DataFrame

In [72]:
# Column layout of the table: every column of int_year followed by the label column.
cols = int_year.columns.tolist()

cols.extend(['what'])

# Build the frame in one step from the three result dicts (one row each).
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is discouraged in any case.
medians = pd.DataFrame(
    [median_dict_CE, median_dict_CN, median_dict_CF],
    columns=cols,
)

# Use the label ('CE_median_interventions', ...) as the row index.
medians = medians.set_index('what')

In [73]:
# Show the assembled table: one row per 'what' label, one column per column of int_year.
medians

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
what,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
CE_median_interventions,15.5,19.0,13.0,15.0,21.0,16.0,16.0,14.0,16.5,27.0,37.0,25.5,38.0,36.0,43.0,41.0,38.0
CN_median_interventions,8.0,10.0,16.0,3.5,12.0,17.0,17.0,13.0,17.0,17.0,16.5,14.0,16.0,17.0,19.0,19.0,21.0
CF_median_interventions,22.0,12.0,31.0,19.5,24.0,27.0,33.0,20.0,26.0,94.0,96.0,230.0,210.0,172.0,208.0,190.0,243.0


Saving

In [74]:
# Write the medians table to CSV; 'what' is the index, so it is written as the first column.
medians.to_csv('new_data/year_medians.csv')