In [1]:
import pandas as pd
import os
from scipy import stats
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so library warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

In [2]:
# Load the census workbook, reading only the columns the later cells use.

census_2011 = "Census2011data.xlsx"
df_census = pd.read_excel(
    census_2011,
    usecols=["State", "Level", "Name", "TRU", "TOT_M", "TOT_F"],
)

In [3]:
# Preprocessing: keep rows whose TRU is "Total" and whose Level is not
# "DISTRICT", retain only the state code and the male/female totals, cast the
# state code to int and renumber the rows from zero.

df_census = (
    df_census
    .loc[
        lambda frame: (frame["TRU"] == "Total") & (frame["Level"] != "DISTRICT"),
        ["State", "TOT_M", "TOT_F"],
    ]
    .astype({"State": int})
    .reset_index(drop=True)
)

In [4]:
# Load the C-18 workbook. Sheet rows 0, 4 and 5 are skipped while reading
# (presumably title / sub-header rows -- verify against the workbook).
c18 = "DDW-C18-0000.xlsx"
df_c18 = pd.read_excel(c18, skiprows=[0, 4, 5])

# Preprocessing: keep only rows where both "Age-group" and "Total/" equal
# "Total", give the positional "Unnamed: N" columns descriptive names
# (presumably the male/female sub-columns of the second- and third-language
# headers -- TODO confirm), keep the state code plus those four counts, cast
# the state code to int and renumber the rows from zero.
df_c18 = (
    df_c18
    .loc[lambda frame: (frame["Age-group"] == "Total") & (frame["Total/"] == "Total")]
    .rename(
        columns={
            'Area Name': 'Name',
            'Unnamed: 6': 'Males two languages',
            'Unnamed: 7': 'Females two languages',
            'Unnamed: 9': 'Males three languages',
            'Unnamed: 10': 'Females three languages',
        }
    )
    .loc[
        :,
        [
            "State",
            "Males two languages",
            "Females two languages",
            "Males three languages",
            "Females three languages",
        ],
    ]
    .astype({"State": int})
    .reset_index(drop=True)
)

In [5]:
# Inner-join the census totals with the C-18 language counts on the shared
# state code, so only states present in both frames are kept.

df_merge = df_census.merge(df_c18, how='inner', on='State')

In [6]:
# Per-state percentage of the male and female population in each bucket.
# All percentages are taken relative to the census totals TOT_M / TOT_F.

# df_1: total population minus the "two languages" count.
df_1 = pd.DataFrame({
    "state/ut": df_merge["State"],
    "male-percentage": (
        df_merge["TOT_M"].sub(df_merge["Males two languages"])
        .div(df_merge["TOT_M"])
        .mul(100)
    ),
    "female-percentage": (
        df_merge["TOT_F"].sub(df_merge["Females two languages"])
        .div(df_merge["TOT_F"])
        .mul(100)
    ),
})

# df_2: the "two languages" count minus the "three languages" count.
df_2 = pd.DataFrame({
    "state/ut": df_merge["State"],
    "male-percentage": (
        df_merge["Males two languages"].sub(df_merge["Males three languages"])
        .div(df_merge["TOT_M"])
        .mul(100)
    ),
    "female-percentage": (
        df_merge["Females two languages"].sub(df_merge["Females three languages"])
        .div(df_merge["TOT_F"])
        .mul(100)
    ),
})

# df_3: the "three languages" count as-is.
df_3 = pd.DataFrame({
    "state/ut": df_merge["State"],
    "male-percentage": df_merge["Males three languages"].div(df_merge["TOT_M"]).mul(100),
    "female-percentage": df_merge["Females three languages"].div(df_merge["TOT_F"]).mul(100),
})

In [7]:
# Copy the per-bucket percentages back onto df_merge as
# "<gender>-percentage-<n>", where n = 1, 2, 3 corresponds to df_1, df_2, df_3.
# All male columns are added first, then all female columns.
for gender in ("male", "female"):
    for rank, frame in enumerate((df_1, df_2, df_3), start=1):
        df_merge[f"{gender}-percentage-{rank}"] = frame[f"{gender}-percentage"]

In [8]:
# Display df_1 (state code with its male/female percentages) for inspection.
df_1

Unnamed: 0,state/ut,male-percentage,female-percentage
0,0,71.650118,76.464264
1,1,45.264207,56.811956
2,2,80.356377,83.482596
3,3,50.242833,56.111899
4,4,44.391497,45.86583
5,5,80.871829,84.33887
6,6,75.85988,80.013744
7,7,57.802154,61.224134
8,8,87.651767,90.656045
9,9,87.260869,89.967989


# P-Value calculation 

In [9]:
# Male-to-female ratio for each bucket:
# (male percentage * male total) / (female percentage * female total).
# The percentage scale factor appears in both numerator and denominator.
for prefix, rank in (("mono", 1), ("bi", 2), ("tri", 3)):
    df_merge[f"{prefix}ratio"] = (
        df_merge[f"male-percentage-{rank}"] * df_merge["TOT_M"]
    ) / (
        df_merge[f"female-percentage-{rank}"] * df_merge["TOT_F"]
    )

# Overall male-to-female population ratio, used as the hypothesised mean.
df_merge["tot_m_f"] = df_merge["TOT_M"] / df_merge["TOT_F"]


def _ratio_p_value(row):
    """Return the one-sample t-test p-value for a single df_merge row.

    The sample is the row's three ratios (mono, bi, tri) and the hypothesised
    population mean is the row's overall male-to-female ratio.
    """
    observed = [row["monoratio"], row["biratio"], row["triratio"]]
    return stats.ttest_1samp(observed, popmean=row["tot_m_f"]).pvalue


df_merge['p-value'] = df_merge.apply(_ratio_p_value, axis=1)

# Attach the same per-state p-value to each of the three output tables.
for table in (df_1, df_2, df_3):
    table["p-value"] = df_merge["p-value"]

In [10]:
# Write the three result tables to CSV under ./Output_Files/Q2.

outname_1 = 'gender-india-a.csv' #mono
outname_2 = 'gender-india-b.csv' #bi
outname_3 = 'gender-india-c.csv' #tri
outfiles = './Output_Files'
outdir = outfiles + '/Q2'

# Create the whole directory chain in one call. exist_ok=True makes re-running
# the cell safe and avoids the check-then-create race of exists() + mkdir().
os.makedirs(outdir, exist_ok=True)

filepath_1 = os.path.join(outdir, outname_1)
filepath_2 = os.path.join(outdir, outname_2)
filepath_3 = os.path.join(outdir, outname_3)

# index=False: the row index carries no information, so it is not written.
for table, destination in (
    (df_1, filepath_1),
    (df_2, filepath_2),
    (df_3, filepath_3),
):
    table.to_csv(destination, index=False)