In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt


In [2]:
#Import data
df = pd.read_csv('med-abx1-3.7-medi-cal-age-and-gender-q221.csv')

In [3]:
df

Unnamed: 0,Year,Reporting Period,Age Group,Gender,Number of Eligible Individuals,Unnamed: 5
0,2016,2016 Q1,0 to 17,Male,70652,
1,2016,2016 Q1,18 to 25,Male,21602,
2,2016,2016 Q1,26 to 34,Male,30515,
3,2016,2016 Q1,35 to 44,Male,19054,
4,2016,2016 Q1,45 to 54,Male,20787,
...,...,...,...,...,...,...
303,2021,2021 Q2,26 to 34,Female,18763,
304,2021,2021 Q2,35 to 44,Female,12114,
305,2021,2021 Q2,45 to 54,Female,11214,
306,2021,2021 Q2,55 to 64,Female,9964,


In [4]:
#to identify what type the columns are
df.dtypes

Year                                 int64
Reporting Period                    object
Age Group                           object
Gender                              object
Number of Eligible Individuals       int64
Unnamed: 5                         float64
dtype: object

In [5]:
#to count the amount of times there’s a null value in a column
df.isnull().sum()

Year                                 0
Reporting Period                     0
Age Group                            0
Gender                               0
Number of Eligible Individuals       0
Unnamed: 5                         308
dtype: int64

In [6]:
#this code will shows all the unique values in a column
for column in df:
    unique_vals = np.unique(df[column])
    nr_values = len(unique_vals)
    if nr_values < 36:
        print('The number of values for feature {} :{} -- {}'.format(column, nr_values,unique_vals))
    else:
        print('The number of values for feature {} :{}'.format(column, nr_values))

The number of values for feature Year :6 -- [2016 2017 2018 2019 2020 2021]
The number of values for feature Reporting Period :22 -- ['2016 Q1' '2016 Q2' '2016 Q3' '2016 Q4' '2017 Q1' '2017 Q2' '2017 Q3'
 '2017 Q4' '2018 Q1' '2018 Q2' '2018 Q3' '2018 Q4' '2019 Q1' '2019 Q2'
 '2019 Q3' '2019 Q4' '2020 Q1' '2020 Q2' '2020 Q3' '2020 Q4' '2021 Q1'
 '2021 Q2']
The number of values for feature Age Group :7 -- ['0 to 17' '18 to 25' '26 to 34' '35 to 44' '45 to 54' '55 to 64' '65+']
The number of values for feature Gender :2 -- ['Female' 'Male']
The number of values for feature Number of Eligible Individuals  :306
The number of values for feature Unnamed: 5 :308


In [7]:
df.columns

Index(['Year', 'Reporting Period', 'Age Group', 'Gender',
       'Number of Eligible Individuals ', 'Unnamed: 5'],
      dtype='object')

In [8]:
df = df.drop(columns=['Unnamed: 5'])

In [9]:
df.columns

Index(['Year', 'Reporting Period', 'Age Group', 'Gender',
       'Number of Eligible Individuals '],
      dtype='object')

In [10]:
df.groupby(by='Year').sum()

Unnamed: 0_level_0,Number of Eligible Individuals
Year,Unnamed: 1_level_1
2016,1311344
2017,1253887
2018,1042758
2019,1004325
2020,984853
2021,513852


In [11]:
df= df.rename(columns={'Number of Eligible Individuals ': 'NOI'})

In [12]:
pivot1 =  pd.pivot_table(df, values='NOI',
                        index='Year', columns=['Gender'],
                        aggfunc='sum')
pivot1

Gender,Female,Male
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2016,646384,664960
2017,609278,644609
2018,520294,522464
2019,507891,496434
2020,488530,496323
2021,248736,265116


In [13]:
pivot2 =  pd.pivot_table(df, values='NOI',
                        index='Age Group', columns=['Gender'],
                        aggfunc='sum')
pivot2

Gender,Female,Male
Age Group,Unnamed: 1_level_1,Unnamed: 2_level_1
0 to 17,1266502,1309447
18 to 25,356913,254587
26 to 34,473321,473763
35 to 44,287073,368157
45 to 54,253077,305585
55 to 64,238471,259073
65+,145756,119294


In [14]:
df.columns

Index(['Year', 'Reporting Period', 'Age Group', 'Gender', 'NOI'], dtype='object')

## Ho: there is no difference between the number of men and woman
## H1: there is a difference between the number of men and woman

In [15]:
from statsmodels.stats.proportion import proportions_ztest
 
count = 248736 #number of females 
nobs = 513852 #number of rows | or trials 
value = 0.5 # This is the value of the null hypothesis. That means porpotion of men = porpotion of women = 0.5
 
#we are using alternative='two-sided' because we are chcking Pm≠Pw.
#for Pw>Pm we have to set it to "larger" and for Pw<Pm to "smaller"
 
stat, pval = proportions_ztest(count, nobs, value, alternative='two-sided')
 
print("p_value: ",pval)

p_value:  1.1083653307266504e-115


In [25]:
from statsmodels.stats.proportion import proportions_ztest
 
count = 3021113 #number of females 
nobs = 6111019 #number of rows | or trials 
value = 0.5 # This is the value of the null hypothesis. That means porpotion of men = porpotion of women = 0.5
 
#we are using alternative='two-sided' because we are chcking Pm≠Pw.
#for Pw>Pm we have to set it to "larger" and for Pw<Pm to "smaller"
 
stat, pval = proportions_ztest(count, nobs, value, alternative='two-sided')
 
print("p_value: ",pval)

p_value:  1.875182914967605e-170


In [16]:
print("p_value: ",round(pval,3))

p_value:  0.0


## Chi Squared Test
## Ho: Gender and Age Groups are Independent
## H1: Gender and Age Groups are Dependent

In [17]:
pivot2 =  pd.pivot_table(df, values='NOI',
                        index='Gender', columns=['Age Group'],
                        aggfunc='sum')
pivot2

Age Group,0 to 17,18 to 25,26 to 34,35 to 44,45 to 54,55 to 64,65+
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,1266502,356913,473321,287073,253077,238471,145756
Male,1309447,254587,473763,368157,305585,259073,119294


In [18]:
from scipy.stats import chi2_contingency
 
#The easiest way to apply a chi-squared test is to compute the contigency table.
 
contigency= pd.crosstab(df['Gender'], df['Age Group'],values=df['NOI'], aggfunc=sum)
contigency

Age Group,0 to 17,18 to 25,26 to 34,35 to 44,45 to 54,55 to 64,65+
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,1266502,356913,473321,287073,253077,238471,145756
Male,1309447,254587,473763,368157,305585,259073,119294


In [19]:
c, p, dof, expected = chi2_contingency(pivot2)
 
print("p_value: ",p)

p_value:  0.0


In [20]:
c, p, dof, expected = chi2_contingency(contigency)
 
print("p_value: ",p)

p_value:  0.0


## p value is 0. we reject the null hypothesis

In [21]:
print('-' * 570)

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


## Doing a Anova (Analysis Of Variance) test.
## Ho: Groups means of eligibles are equal to one another
## H1: At least, one group mean of eligibles is different from other groups

In [22]:
df.columns

Index(['Year', 'Reporting Period', 'Age Group', 'Gender', 'NOI'], dtype='object')

In [23]:
import scipy.stats as stats
 
# stats f_oneway functions takes the groups as input and returns ANOVA F and p value
fvalue, pvalue = stats.f_oneway(df['NOI'][df['Age Group'] == '0 to 17'],
                                df['NOI'][df['Age Group'] == '18 to 25'],
                                df['NOI'][df['Age Group'] == '26 to 34'],
                                df['NOI'][df['Age Group'] == '35 to 44'],
                                df['NOI'][df['Age Group'] == '45 to 54'],
                                df['NOI'][df['Age Group'] == '55 to 64'],
                                df['NOI'][df['Age Group'] == '65+'])

 
print("p_value: ",round(pvalue,3))

p_value:  0.0


In [24]:
print("p_value: ",pvalue)

p_value:  1.006206412303938e-164


## we reject the null hypothesis. therefore at least one group man of number of eligibles is diff from others