In [1]:
import pandas as pd
import os
from scipy import stats
import warnings
# Suppress every warning category from here on; this also hides any warnings
# raised by pandas or SciPy in the cells below.
warnings.filterwarnings('ignore')

In [2]:
# Load the 2011 census workbook, reading only the listed columns.
census_2011 = "Census2011data.xlsx"
census_columns = ["State", "Level", "Name", "TRU", "TOT_P"]
df_census = pd.read_excel(io=census_2011, usecols=census_columns)

In [3]:
# Census preprocessing: drop the rows whose TRU is "Total" and the DISTRICT-level
# rows, keep only the state code, the TRU label and the population count, cast
# the state code to int and renumber the rows from 0.
df_census = (
    df_census
    .loc[
        lambda d: (d["TRU"] != "Total") & (d["Level"] != "DISTRICT"),
        ["State", "TRU", "TOT_P"],
    ]
    .astype({"State": int})
    .reset_index(drop=True)
)
df_census

Unnamed: 0,State,TRU,TOT_P
0,0,Rural,833748852
1,0,Urban,377106125
2,1,Rural,9108060
3,1,Urban,3433242
4,2,Rural,6176050
...,...,...,...
67,33,Urban,34917440
68,34,Rural,395200
69,34,Urban,852753
70,35,Rural,237093


In [4]:
# Load the C-18 workbook.
c18 = "DDW-C18-0000.xlsx"
# Zero-based sheet rows left out while parsing.
# NOTE(review): presumably title / sub-header rows -- confirm against the workbook.
c18_skipped_rows = [0, 4, 5]
df_c18 = pd.read_excel(io=c18, skiprows=c18_skipped_rows)

In [5]:
# C-18 preprocessing: keep the all-ages ("Total" age-group) rows that are not the
# combined "Total" area rows, rename the area columns, keep only the state code,
# the TRU label and the two speaker counts, cast the state code to int and
# renumber the rows from 0.
df_c18 = (
    df_c18
    .loc[lambda d: (d["Age-group"] == "Total") & (d["Total/"] != "Total")]
    .rename(columns={'Area Name': 'Name', 'Total/': 'TRU'})
    .loc[
        :,
        [
            "State",
            "TRU",
            "Number speaking second language",
            "Number speaking third language",
        ],
    ]
    .astype({"State": int})
    .reset_index(drop=True)
)
df_c18

Unnamed: 0,State,TRU,Number speaking second language,Number speaking third language
0,0,Rural,162641485,35383989
1,0,Urban,152347285,50625591
2,1,Rural,4167238,1258724
3,1,Urban,2015952,837496
4,2,Rural,981518,280817
...,...,...,...,...
67,33,Urban,12325853,1878494
68,34,Rural,80981,5029
69,34,Urban,311311,66636
70,35,Rural,152600,38136


In [6]:
# Split both tables into separate rural and urban frames.

def _rows_for(frame, area_type):
    """Return the rows of `frame` whose TRU equals `area_type`, renumbered from 0."""
    return frame.loc[frame["TRU"] == area_type].reset_index(drop=True)


df_census_rural = _rows_for(df_census, "Rural")
df_census_urban = _rows_for(df_census, "Urban")

df_c18_rural = _rows_for(df_c18, "Rural")
df_c18_urban = _rows_for(df_c18, "Urban")

In [7]:
# The series below are combined by row label, so every input frame must list
# the same state codes in the same order. Fail loudly instead of silently
# mixing figures that belong to different states.
expected_states = df_census_rural["State"].tolist()
for frame_label, frame in (
    ("df_census_urban", df_census_urban),
    ("df_c18_rural", df_c18_rural),
    ("df_c18_urban", df_c18_urban),
):
    if frame["State"].tolist() != expected_states:
        raise ValueError(
            "{} does not list the same states in the same order as "
            "df_census_rural".format(frame_label)
        )

second_col = "Number speaking second language"
third_col = "Number speaking third language"

urban_total, rural_total = df_census_urban["TOT_P"], df_census_rural["TOT_P"]
urban_second, rural_second = df_c18_urban[second_col], df_c18_rural[second_col]
urban_third, rural_third = df_c18_urban[third_col], df_c18_rural[third_col]

#calculating one lang, two lang, three lang percentages for urban and rural speakers.

# one language: total population minus the second-language speakers
df_1 = pd.DataFrame({
    "state/ut": df_census_rural["State"],
    "urban-percentage": ((urban_total - urban_second) / urban_total) * 100,
    "rural-percentage": ((rural_total - rural_second) / rural_total) * 100,
})

# two languages: second-language speakers minus third-language speakers
df_2 = pd.DataFrame({
    "state/ut": df_census_rural["State"],
    "urban-percentage": ((urban_second - urban_third) / urban_total) * 100,
    "rural-percentage": ((rural_second - rural_third) / rural_total) * 100,
})

# three languages: the third-language speakers
df_3 = pd.DataFrame({
    "state/ut": df_census_rural["State"],
    "urban-percentage": (urban_third / urban_total) * 100,
    "rural-percentage": (rural_third / rural_total) * 100,
})

P-value calculation 

In [8]:
def _ratio_p_value(row):
    """Return the p-value of a one-sample t-test of the row's three urban/rural
    ratios against the row's overall urban/rural population ratio."""
    sample = [row.monoratio, row.biratio, row.triratio]
    _statistic, p_value = stats.ttest_1samp(sample, popmean=row.tot_u_r)
    return p_value


df_p = pd.DataFrame()

# Urban-to-rural ratio per language group: each percentage is multiplied by the
# matching population total before dividing urban by rural.
for ratio_name, pct_table in (("monoratio", df_1), ("biratio", df_2), ("triratio", df_3)):
    urban_part = pct_table["urban-percentage"] * df_census_urban["TOT_P"]
    rural_part = pct_table["rural-percentage"] * df_census_rural["TOT_P"]
    df_p[ratio_name] = urban_part / rural_part

# using stats.ttest_1samp to find the p-value
df_p["tot_u_r"] = df_census_urban["TOT_P"] / df_census_rural["TOT_P"]
df_p['p-value'] = df_p.apply(_ratio_p_value, axis=1)

# The same per-row p-value is attached to all three output tables.
for pct_table in (df_1, df_2, df_3):
    pct_table["p-value"] = df_p["p-value"]

In [9]:
# Display the one-language table, which now also carries the p-value column.
df_1

Unnamed: 0,state/ut,urban-percentage,rural-percentage,p-value
0,0,59.600952,80.492749,0.332407
1,1,41.281389,54.2467,0.568997
2,2,62.067208,84.107674,0.279581
3,3,42.764108,59.161084,0.523689
4,4,44.64494,59.563313,0.592084
5,5,69.746712,88.130788,0.263868
6,6,67.543705,83.297098,0.270791
7,7,58.955694,76.443173,0.39907
8,8,81.076115,91.753404,0.236601
9,9,73.13025,92.970584,0.217139


In [10]:
#write output in csv

outname_1 = 'geography-india-a.csv' #mono
outname_2 = 'geography-india-b.csv' #bi
outname_3 = 'geography-india-c.csv' #tri
outfiles = './Output_Files'
outdir=outfiles+'/Q3'

# Create ./Output_Files and ./Output_Files/Q3 in one call. exist_ok=True makes
# re-running the cell a no-op and avoids the gap between checking for a
# directory and creating it.
os.makedirs(outdir, exist_ok=True)

filepath_1 = os.path.join(outdir, outname_1)
filepath_2 = os.path.join(outdir, outname_2)
filepath_3 = os.path.join(outdir, outname_3)

# Write each table without its positional row index.
for table, filepath in ((df_1, filepath_1), (df_2, filepath_2), (df_3, filepath_3)):
    table.to_csv(filepath, index=False)