### Sex Ratio of Children of Indian Politicians

In [1]:
import os
import json
import requests
import pandas as pd
import scipy.stats as stats
import numpy as np

In [2]:
df_list = []

for filename in os.listdir("data/"):
    print(filename)
    with open(os.path.join("data/", filename), "r") as f:
            data = json.load(f)
    df = pd.DataFrame(data['membersDtoList'])
    df['ls'] = filename[3:-5]
    df_list.append(df)

df = pd.concat(df_list)

ls_17.json
ls_16.json
ls_13.json
ls_14.json
ls_15.json
ls_12.json


In [3]:
df.shape

(3196, 33)

In [4]:
## Filter out NaNs for Sons and Daughters. Assume missing at random.

print("missing data on sons:", df['numberOfSons'].isna().sum())
print("missing data on daughters:", df['numberOfDaughters'].isna().sum())

# Same rows
# ----------------

df_small = df.loc[~ (df['numberOfSons'].isna() & df['numberOfDaughters'].isna())]
df_small.shape

missing data on sons: 213
missing data on daughters: 213


(2983, 33)

### Unique MPs

In [5]:
len(df_small['mpsno'].unique())

1785

In [6]:
len((df_small['firstName'] + df_small['lastName']).unique())

1781

In [7]:
unique_df = df_small.drop_duplicates(subset='mpsno', keep='last')
unique_df.shape

(1785, 33)

### Agg. Sex Ratio

In [8]:
sum(df_small['numberOfSons'])/sum(df_small['numberOfDaughters'])

1.0831937465103294

In [9]:
# Among unique
sum(unique_df['numberOfSons'])/sum(unique_df['numberOfDaughters'])

1.0854059127170343

### Standard Error of the Sex Ratio

In [10]:
def calculate_ratio(data):
    numerator = np.sum(data['numberOfSons'])
    denominator = np.sum(data['numberOfDaughters'])
    return numerator / denominator

In [11]:
n_iterations = 1000  # Number of bootstrap iterations
sample_size = len(unique_df)  # Size of each bootstrap sample
ratios = []

np.random.seed(314)

for _ in range(n_iterations):
    sample = unique_df.sample(n=sample_size, replace=True)
    ratio = calculate_ratio(sample)
    ratios.append(ratio)

np.std(ratios)

0.030860658898348124

### Proportion of Daughters

In [12]:
# Filter on ls members with kids
unique_df['total_kids'] = unique_df['numberOfDaughters'] + unique_df['numberOfSons']

df_kids = unique_df[unique_df['total_kids'] > 0]
print(df_kids.shape)

df_kids['prop_daughter'] = df_kids['numberOfDaughters']/df_kids['total_kids']

print(df_kids['prop_daughter'].mean())

(1615, 34)
0.4618754104512619


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df['total_kids'] = unique_df['numberOfDaughters'] + unique_df['numberOfSons']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_kids['prop_daughter'] = df_kids['numberOfDaughters']/df_kids['total_kids']


In [26]:
unique_df['total_kids'].sum()

4444.0

In [13]:
# Perform the t-test
t, p = stats.ttest_1samp(df_kids['prop_daughter'], popmean = 0.4878)

# Print the results
print('t =', t)
print('p =', p)

t = -3.206693900354379
p = 0.0013690005086026743


In [14]:
df_kids.groupby('ls')['prop_daughter'].mean()

ls
12    0.461634
13    0.492887
14    0.450286
15    0.460336
16    0.477449
17    0.437431
Name: prop_daughter, dtype: float64

In [15]:
df_kids_bjp = df_kids[df_kids['partySname'] == 'BJP']
# Perform the t-test
t, p = stats.ttest_1samp(df_kids_bjp['prop_daughter'], popmean = 0.4878)

print(df_kids_bjp['prop_daughter'].mean())
# Print the results
print('t =', t)
print('p =', p)

0.44812636461851124
t = -3.0021821529373365
p = 0.0027975426123936996


In [16]:
df_kids_inc = df_kids[df_kids['partySname'] == 'INC']
# Perform the t-test
t, p = stats.ttest_1samp(df_kids_inc['prop_daughter'], popmean = 0.4878)

print(df_kids_inc['prop_daughter'].mean())
# Print the results
print('t =', t)
print('p =', p)

0.47868814192343606
t = -0.5261361884605241
p = 0.5991376145078726


In [17]:
# Create two DataFrames for the two groups
bjp    = df_kids[df_kids['partySname'] == 'BJP']
others = df_kids[~ (df_kids['partySname'] == 'BJP')]

# Perform the t-test
t_statistic, p_value = stats.ttest_ind(bjp['prop_daughter'], others['prop_daughter'])

# Print the results
print(bjp['prop_daughter'].mean())
print(others['prop_daughter'].mean())
print('t-statistic:', t_statistic)
print('p-value:', p_value)

0.44812636461851124
0.46943606617311034
t-statistic: -1.2613660340458093
p-value: 0.20735938907130408


### Correlation Between Number of Kids and Proportion Daughters 

For more info., see here: https://github.com/soodoku/prop_male/

In [18]:
df_kids[['ls', 'total_kids', 'prop_daughter']].groupby('ls').corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_kids,prop_daughter
ls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12,total_kids,1.0,0.142198
12,prop_daughter,0.142198,1.0
13,total_kids,1.0,0.178687
13,prop_daughter,0.178687,1.0
14,total_kids,1.0,0.054113
14,prop_daughter,0.054113,1.0
15,total_kids,1.0,0.139038
15,prop_daughter,0.139038,1.0
16,total_kids,1.0,0.002264
16,prop_daughter,0.002264,1.0


### Mean Prop Daughter by Number of Kids

In [19]:
df_kids[['total_kids', 'prop_daughter']]\
                            .groupby(['total_kids'])\
                            .agg({'prop_daughter': ['mean', 'count']}).reset_index()

Unnamed: 0_level_0,total_kids,prop_daughter,prop_daughter
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
0,1.0,0.458763,194
1,2.0,0.422619,672
2,3.0,0.465789,380
3,4.0,0.515,200
4,5.0,0.546154,104
5,6.0,0.570707,33
6,7.0,0.529412,17
7,8.0,0.545455,11
8,9.0,0.666667,2
9,11.0,0.545455,1


### Average number of kids

In [20]:
unique_df['total_kids'].mean()

2.4896358543417367

In [21]:
unique_df.groupby('ls')['total_kids'].mean()

ls
12    2.818462
13    2.821839
14    2.656934
15    2.490783
16    2.201954
17    2.036900
Name: total_kids, dtype: float64

### Sex Ratios

In [22]:
# Not using unique because I am grouping by LS
adf = df.groupby('ls')[['numberOfSons', 'numberOfDaughters']].sum().reset_index()
adf['sex_ratio'] = adf['numberOfSons']/adf['numberOfDaughters']
adf

Unnamed: 0,ls,numberOfSons,numberOfDaughters,sex_ratio
0,12,472.0,444.0,1.063063
1,13,775.0,742.0,1.044474
2,14,755.0,676.0,1.116864
3,15,655.0,630.0,1.039683
4,16,609.0,552.0,1.103261
5,17,614.0,538.0,1.141264


#### By Party

In [27]:
adf = unique_df.groupby('partySname')[['numberOfSons', 'numberOfDaughters']].sum().reset_index()
adf['sex_ratio'] = adf['numberOfSons']/adf['numberOfDaughters']
pdf = adf[(adf['numberOfDaughters'] + adf['numberOfSons']) > 100].sort_values('sex_ratio', ascending = False)[0:10]
pdf

Unnamed: 0,partySname,numberOfSons,numberOfDaughters,sex_ratio
17,BSP,103.0,71.0,1.450704
31,JD(U),66.0,47.0,1.404255
15,BJP,858.0,751.0,1.142477
20,CPI(M),74.0,67.0,1.104478
59,SP,80.0,75.0,1.066667
63,TDP,64.0,61.0,1.04918
24,INC,470.0,493.0,0.953347
22,DMK,49.0,52.0,0.942308
4,AIADMK,57.0,61.0,0.934426


In [24]:
# Perform the chi-square test
chi2, p_value, dof, expected = stats.chi2_contingency(pd.crosstab(pdf['numberOfSons'], ['numberOfDaughters']))

# Print the results
print('chi2:', chi2)
print('p-value:', p_value)

chi2: 0.0
p-value: 1.0


In [25]:
adf = df.groupby(['ls', 'partySname'])[['numberOfSons', 'numberOfDaughters']].sum().reset_index()
adf['sex_ratio'] = adf['numberOfSons']/adf['numberOfDaughters']
adf.sort_values('numberOfSons', ascending = False)[0:20]

Unnamed: 0,ls,partySname,numberOfSons,numberOfDaughters,sex_ratio
221,17,BJP,364.0,315.0,1.155556
185,16,BJP,351.0,312.0,1.125
58,13,BJP,286.0,268.0,1.067164
153,15,INC,231.0,225.0,1.026667
102,14,BJP,218.0,190.0,1.147368
9,12,BJP,202.0,188.0,1.074468
109,14,INC,200.0,194.0,1.030928
145,15,BJP,170.0,155.0,1.096774
65,13,INC,157.0,190.0,0.826316
17,12,INC,93.0,104.0,0.894231
