# Computing the median

This works on per-person count metrics such as the number of words or the number of interventions. For example: what is the median number of interventions in 2016?

In [1]:
import pandas as pd
pd.options.display.max_columns = 100
import numpy as np

# Load the three input tables; each one is indexed by its PersonIdCode column.
# The order of the paths matches the order of the names being assigned.
int_year, n_int, ppl = (
    pd.read_csv(csv_path).set_index('PersonIdCode')
    for csv_path in (
        'data/int_year.csv',
        'data/n_int.csv',
        'data/people_jonas.csv',
    )
)

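When computing the median for a given session (or year), we need to ignore people who aren't active yet, i.e. people whose first session (or first-session year) comes after the column being processed.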

For the DataFrames with **sessions** as columns:

In [2]:
def get_median(df, ppl, what):
    """Compute the median of every session column, ignoring inactive people.

    Note: a later cell of this notebook defines another ``get_median`` (for
    year columns); once that cell has run, it replaces this definition.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per session; each column label must be convertible with
        ``int()`` because ``strip_sesh`` compares it to ``FirstSession``.
    ppl : pandas.DataFrame
        Lookup table passed through to ``strip_sesh``.
    what : object
        Label stored under the ``'what'`` key of the result.

    Returns
    -------
    dict
        ``{'what': what, <column label>: <median of that column>, ...}``.
    """
    median_dict = {'what': what}
    for col in df.columns.tolist():
        # Hand the helper a copy: it may remove entries in place, and the
        # Series returned by ``df[col]`` can be the DataFrame's own cached
        # column on some pandas versions.
        active = strip_sesh(col, df[col].copy(), ppl)
        median_dict[col] = active.median()
    return median_dict
        
def strip_sesh(col, sesh, ppl):
    """Drop people who were not yet active in session ``col``.

    A person is kept when their ``FirstSession`` value in ``ppl`` is less
    than or equal to ``col`` (both compared after ``int()`` conversion).

    Parameters
    ----------
    col : str or int
        Label of the session being processed; must be convertible with
        ``int()``.
    sesh : pandas.Series
        Values for that session. Every index label is looked up in ``ppl``.
    ppl : pandas.DataFrame
        Lookup table with a ``FirstSession`` column.

    Returns
    -------
    pandas.Series
        A new Series containing only the entries that are kept. ``sesh``
        itself is not modified.

    Raises
    ------
    KeyError
        If an index label of ``sesh`` is missing from ``ppl``.
    """
    session = int(col)
    # Build a keep-mask rather than popping entries while iterating: that
    # approach needed ``Series.iteritems`` (removed in pandas 2.0) and
    # mutated the caller's Series in place.
    keep = [int(ppl.loc[idcode, 'FirstSession']) <= session
            for idcode in sesh.index]
    return sesh[np.array(keep, dtype=bool)]

For the DataFrames with **years** as columns (note: this cell redefines `get_median`, so from here on it replaces the sessions version above):

In [3]:
def get_median(df, ppl, what):
    """Compute the median of every year column, ignoring inactive people.

    Note: this redefines the ``get_median`` from the earlier (sessions) cell;
    after this cell runs, calls to ``get_median`` use ``strip_year``.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per year; each column label must be convertible with
        ``int()`` because ``strip_year`` compares it to a first-session year.
    ppl : pandas.DataFrame
        Lookup table passed through to ``strip_year``.
    what : object
        Label stored under the ``'what'`` key of the result.

    Returns
    -------
    dict
        ``{'what': what, <column label>: <median of that column>, ...}``.
    """
    median_dict = {'what': what}
    for col in df.columns.tolist():
        # Hand the helper a copy: it may remove entries in place, and the
        # Series returned by ``df[col]`` can be the DataFrame's own cached
        # column on some pandas versions.
        active = strip_year(col, df[col].copy(), ppl)
        median_dict[col] = active.median()
    return median_dict
        
def strip_year(col, sesh, ppl):
    """Drop people whose first session is later than year ``col``.

    The year of a person's first session is taken from the first four
    characters of their ``FirstSessionDate`` value in ``ppl`` (presumably a
    date string that starts with the year -- TODO confirm the format). A
    person is kept when that year is less than or equal to ``col``.

    Parameters
    ----------
    col : str or int
        Label of the year being processed; must be convertible with
        ``int()``.
    sesh : pandas.Series
        Values for that year. Every index label is looked up in ``ppl``.
    ppl : pandas.DataFrame
        Lookup table with a sliceable ``FirstSessionDate`` column.

    Returns
    -------
    pandas.Series
        A new Series containing only the entries that are kept. ``sesh``
        itself is not modified.

    Raises
    ------
    KeyError
        If an index label of ``sesh`` is missing from ``ppl``.
    """
    year = int(col)
    # Build a keep-mask rather than popping entries while iterating: that
    # approach needed ``Series.iteritems`` (removed in pandas 2.0) and
    # mutated the caller's Series in place.
    keep = [int(ppl.loc[idcode, 'FirstSessionDate'][:4]) <= year
            for idcode in sesh.index]
    return sesh[np.array(keep, dtype=bool)]

In [4]:
# Median of every int_year column, labelled 'median_interventions'.
median_dict = get_median(df=int_year, ppl=ppl, what='median_interventions')

Creating the medians DataFrame

In [5]:
# Column order for the result: every column of int_year, then the label.
cols = int_year.columns.tolist() + ['what']

# Build the one-row frame directly from the dict. ``DataFrame.append`` was
# removed in pandas 2.0, and there is no need to start from an empty frame
# and grow it by one row.
medians = pd.DataFrame([median_dict], columns=cols).set_index('what')

In [6]:
# Display the medians frame (indexed by 'what', one column per int_year column).
medians

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
what,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
median_interventions,9.0,13.0,17.0,9.0,15.0,16.5,17.0,14.0,17.0,20.5,20.0,16.0,21.0,23.0,23.0,22.0,19.0


Saving

In [None]:
# Write the medians frame to CSV for later use.
medians.to_csv('data/year_medians.csv')