In [1]:
#Importing libraries
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
#Reading the language workbook; the second sheet row (header = [1]) supplies the column labels
lan_raw = pd.read_excel("../Dataset/Input_Dataset/DDW-C18-0000.xlsx", header = [1])

#Keeping the rows where the columns at positions 3 and 4 both read "Total",
#then picking the six columns of interest by position
lan_is_total = (lan_raw.iloc[:, 3] == "Total") & (lan_raw.iloc[:, 4] == "Total")
df_lan = lan_raw[lan_is_total].iloc[:, [0, 2, 6, 7, 9, 10]]

#Giving the selected columns readable names
df_lan.columns = [
    "State Code",
    "Name",
    "Male two lang",
    "Female two lang",
    "Male three lang",
    "Female three lang",
]
df_lan.head()

Unnamed: 0,State Code,Name,Male two lang,Female two lang,Male three lang,Female three lang
4,0,INDIA,176696383,138292387,50536832,35472748
34,1,JAMMU & KASHMIR,3634819,2548371,1258756,837464
64,2,HIMACHAL PRADESH,683966,558739,200478,146808
94,3,PUNJAB,7284183,5751041,4383841,3446072
124,4,CHANDIGARH,322898,257022,178557,143422


In [3]:
#Loading the census workbook
cen_raw = pd.read_excel("../Dataset/Input_Dataset/DDW_PCA0000_2011_Indiastatedist.xlsx")

#Keeping only the rows with TRU == "Total" at "India" or "STATE" level, and the columns at
#positions 7, 11 and 12 (displayed below as Name, TOT_M, TOT_F).
#.copy() makes df_cen an independent frame, so the column assignment below does not write
#into a slice of the raw data (this is what triggers pandas' SettingWithCopyWarning)
cen_is_total = cen_raw["TRU"] == "Total"
cen_is_india_or_state = cen_raw["Level"].isin(["India", "STATE"])
df_cen = cen_raw[cen_is_total & cen_is_india_or_state].iloc[:, [7, 11, 12]].copy()

#Converting the Name column to upper case with the vectorised string accessor;
#a missing name stays NaN instead of raising AttributeError as x.upper() would
df_cen["Name"] = df_cen["Name"].str.upper()
df_cen.head()

Unnamed: 0,Name,TOT_M,TOT_F
0,INDIA,623270258,587584719
3,JAMMU & KASHMIR,6640662,5900640
72,HIMACHAL PRADESH,3481873,3382729
111,PUNJAB,14639465,13103873
174,CHANDIGARH,580663,474787


In [4]:
#Joining the language counts with the census totals on the shared "Name" column
merged = pd.merge(df_lan, df_cen, on = "Name")

#Deriving the per-gender counts in one chain:
#  exact one lang = total population - speakers of two (or more) languages
#  exact two lang = speakers of two languages - speakers of three languages
#then relabelling the three-language columns and discarding the two-language inputs
df_final = (
    merged
    .assign(**{
        "Male Exact one lang": merged["TOT_M"] - merged["Male two lang"],
        "Female Exact one lang": merged["TOT_F"] - merged["Female two lang"],
        "Male Exact two lang": merged["Male two lang"] - merged["Male three lang"],
        "Female Exact two lang": merged["Female two lang"] - merged["Female three lang"],
    })
    .rename(columns = {
        "Male three lang": "Male Exact three lang",
        "Female three lang": "Female Exact three lang",
    })
    .drop(columns = ["Male two lang", "Female two lang"])
)
df_final.head()

Unnamed: 0,State Code,Name,Male Exact three lang,Female Exact three lang,TOT_M,TOT_F,Male Exact one lang,Female Exact one lang,Male Exact two lang,Female Exact two lang
0,0,INDIA,50536832,35472748,623270258,587584719,446573875,449292332,126159551,102819639
1,1,JAMMU & KASHMIR,1258756,837464,6640662,5900640,3005843,3352269,2376063,1710907
2,2,HIMACHAL PRADESH,200478,146808,3481873,3382729,2797907,2823990,483488,411931
3,3,PUNJAB,4383841,3446072,14639465,13103873,7355282,7352832,2900342,2304969
4,4,CHANDIGARH,178557,143422,580663,474787,257765,217765,144341,113600


In [5]:
#Male-to-female ratios used for the p-value calculation:
#one per language bucket plus the overall population ratio
ratio_sources = {
    "Ratio_1": ("Male Exact one lang", "Female Exact one lang"),
    "Ratio_2": ("Male Exact two lang", "Female Exact two lang"),
    "Ratio_3": ("Male Exact three lang", "Female Exact three lang"),
    "Ratio": ("TOT_M", "TOT_F"),
}
for ratio_col, (male_col, female_col) in ratio_sources.items():
    df_final[ratio_col] = df_final[male_col] / df_final[female_col]

In [6]:
#Previewing the first five rows, now including the ratio columns
df_final.head(n = 5)

Unnamed: 0,State Code,Name,Male Exact three lang,Female Exact three lang,TOT_M,TOT_F,Male Exact one lang,Female Exact one lang,Male Exact two lang,Female Exact two lang,Ratio_1,Ratio_2,Ratio_3,Ratio
0,0,INDIA,50536832,35472748,623270258,587584719,446573875,449292332,126159551,102819639,0.993949,1.226999,1.424666,1.060733
1,1,JAMMU & KASHMIR,1258756,837464,6640662,5900640,3005843,3352269,2376063,1710907,0.896659,1.388774,1.503057,1.125414
2,2,HIMACHAL PRADESH,200478,146808,3481873,3382729,2797907,2823990,483488,411931,0.990764,1.173711,1.36558,1.029309
3,3,PUNJAB,4383841,3446072,14639465,13103873,7355282,7352832,2900342,2304969,1.000333,1.2583,1.272127,1.117186
4,4,CHANDIGARH,178557,143422,580663,474787,257765,217765,144341,113600,1.183684,1.270607,1.244976,1.222997


In [7]:
#One-sample t-test per row: the three bucket ratios form the sample and the
#overall male/female ratio of that row is the hypothesised mean; only the p-value is kept
bucket_ratio_cols = ["Ratio_1", "Ratio_2", "Ratio_3"]
df_final["p-value"] = df_final.apply(
    lambda row : stats.ttest_1samp(
        [row[col] for col in bucket_ratio_cols],
        popmean = row["Ratio"],
    ).pvalue,
    axis = 1,
)

In [8]:
#Previewing the first five rows, now including the p-value column
df_final.head(n = 5)

Unnamed: 0,State Code,Name,Male Exact three lang,Female Exact three lang,TOT_M,TOT_F,Male Exact one lang,Female Exact one lang,Male Exact two lang,Female Exact two lang,Ratio_1,Ratio_2,Ratio_3,Ratio,p-value
0,0,INDIA,50536832,35472748,623270258,587584719,446573875,449292332,126159551,102819639,0.993949,1.226999,1.424666,1.060733,0.340433
1,1,JAMMU & KASHMIR,1258756,837464,6640662,5900640,3005843,3352269,2376063,1710907,0.896659,1.388774,1.503057,1.125414,0.537034
2,2,HIMACHAL PRADESH,200478,146808,3481873,3382729,2797907,2823990,483488,411931,0.990764,1.173711,1.36558,1.029309,0.30633
3,3,PUNJAB,4383841,3446072,14639465,13103873,7355282,7352832,2900342,2304969,1.000333,1.2583,1.272127,1.117186,0.568812
4,4,CHANDIGARH,178557,143422,580663,474787,257765,217765,144341,113600,1.183684,1.270607,1.244976,1.222997,0.733277


In [9]:
#Building the "exactly one language" table directly in its final column order:
#state code, share of males / females (in percent of the gender's total population), p-value
gender_india_1 = pd.DataFrame({
    "state-code": df_final["State Code"],
    "male-percentage": df_final["Male Exact one lang"] * 100 / df_final["TOT_M"],
    "female-percentage": df_final["Female Exact one lang"] * 100 / df_final["TOT_F"],
    "p-value": df_final["p-value"],
})
gender_india_1.head()

Unnamed: 0,state-code,male-percentage,female-percentage,p-value
0,0,71.650118,76.464264,0.340433
1,1,45.264207,56.811956,0.537034
2,2,80.356377,83.482596,0.30633
3,3,50.242833,56.111899,0.568812
4,4,44.391497,45.86583,0.733277


In [10]:
#Building the "exactly two languages" table directly in its final column order:
#state code, share of males / females (in percent of the gender's total population), p-value
gender_india_2 = pd.DataFrame({
    "state-code": df_final["State Code"],
    "male-percentage": df_final["Male Exact two lang"] * 100 / df_final["TOT_M"],
    "female-percentage": df_final["Female Exact two lang"] * 100 / df_final["TOT_F"],
    "p-value": df_final["p-value"],
})
gender_india_2.head()

Unnamed: 0,state-code,male-percentage,female-percentage,p-value
0,0,20.241548,17.498692,0.340433
1,1,35.780514,28.995278,0.537034
2,2,13.88586,12.177476,0.30633
3,3,19.811803,17.589983,0.568812
4,4,24.857964,23.926519,0.733277


In [11]:
#Building the "three languages" table directly in its final column order:
#state code, share of males / females (in percent of the gender's total population), p-value
gender_india_3 = pd.DataFrame({
    "state-code": df_final["State Code"],
    "male-percentage": df_final["Male Exact three lang"] * 100 / df_final["TOT_M"],
    "female-percentage": df_final["Female Exact three lang"] * 100 / df_final["TOT_F"],
    "p-value": df_final["p-value"],
})
gender_india_3.head()

Unnamed: 0,state-code,male-percentage,female-percentage,p-value
0,0,8.108334,6.037044,0.340433
1,1,18.955279,14.192766,0.537034
2,2,5.757763,4.339928,0.30633
3,3,29.945363,26.298118,0.568812
4,4,30.750539,30.207651,0.733277


In [12]:
#Storing the final results: each table goes to its own CSV file, without the index column
output_tables = {
    "../Dataset/Output_Dataset/gender-india-a.csv": gender_india_1,
    "../Dataset/Output_Dataset/gender-india-b.csv": gender_india_2,
    "../Dataset/Output_Dataset/gender-india-c.csv": gender_india_3,
}
for csv_path, table in output_tables.items():
    table.to_csv(csv_path, index = False)