### Compare Scraped SECC Data to Aggregated Output Posted on the SECC site

We compare the original scraped data to state total obtained from https://secc.gov.in/welcome

The state total CSV is posted [here](../data/secc_state.csv).

In [1]:
import pandas as pd
from io import StringIO

In [2]:
our = """state  hh
arunachal pradesh     1189638
assam                 7114232
bihar                54197114
chhattisgarh          8536248
gujarat              20474063
haryana               7928621
kerala               17805905
madhya pradesh       33788308
maharashtra       41188662
mizoram             120572
nagaland            854973
odisha            31567430
punjab             8848364
rajasthan         28359145
sikkim              266484
tamilnadu         30675056
uttar pradesh     74437606
uttarakhand        4425566
west bengal       48835606
"""

In [3]:
df1 = pd.read_csv(StringIO(our), delimiter='\s{2,}', engine="python")
df1

Unnamed: 0,state,hh
0,arunachal pradesh,1189638
1,assam,7114232
2,bihar,54197114
3,chhattisgarh,8536248
4,gujarat,20474063
5,haryana,7928621
6,kerala,17805905
7,madhya pradesh,33788308
8,maharashtra,41188662
9,mizoram,120572


In [4]:
secc = """state	hh
01 - JAMMU & KASHMIR	2094081
02 - HIMACHAL PRADESH	1427365
03 - PUNJAB	5032199
04 - CHANDIGARH	214233
05 - UTTARAKHAND	1968773
06 - HARYANA	4630959
07 - NCT OF DELHI	3391313
08 - RAJASTHAN	13136591
09 - UTTAR PRADESH	32475784
10 - BIHAR	20074242
11 - SIKKIM	120014
12 - ARUNACHAL PRADESH	260217
13 - NAGALAND	379164
14 - MANIPUR	578939
15 - MIZORAM	226147
16 - TRIPURA	875621
17 - MEGHALAYA	554131
18 - ASSAM	6427614
19 - WEST BENGAL	20367144
20 - JHARKHAND	6041931
21 - ODISHA	9942101
22 - CHHATTISGARH	5714798
23 - MADHYA PRADESH	14723864
24 - GUJARAT	11629409
25 - DAMAN AND DIU	44968
26 - DADRA & NAGAR HAVELI	66571
27 - MAHARASHTRA	22962600
28 - ANDHRA PRADESH	12270164
28 - TELANGANA	8244441
29 - KARNATAKA	13139063
30 - GOA	302950
31 - LAKSHADWEEP	10929
32 - KERALA	7698556
33 - TAMILNADU	17521956
34 - PUDUCHERRY	279857
35 - ANDAMAN & NICOBAR ISLANDS	92717"""

In [5]:
df2 = pd.read_csv(StringIO(secc.lower()), delimiter='\t')
df2

Unnamed: 0,state,hh
0,01 - jammu & kashmir,2094081
1,02 - himachal pradesh,1427365
2,03 - punjab,5032199
3,04 - chandigarh,214233
4,05 - uttarakhand,1968773
5,06 - haryana,4630959
6,07 - nct of delhi,3391313
7,08 - rajasthan,13136591
8,09 - uttar pradesh,32475784
9,10 - bihar,20074242


In [6]:
import fuzzywuzzy.process as fwp

choices = list(df2.state)

def fmatch(row): 
    minscore=85 #or whatever score works for you
    choice,score = fwp.extractOne(row.state,choices)
    return choice if score > minscore else None

df1['df2_state'] = df1.apply(fmatch,axis=1)
mdf = pd.merge(df1, 
                  df2,
                  left_on='df2_state',
                  right_on='state',
                  suffixes=['_df1','_df2'],
                  how = 'outer') # assuming you want to keep unmatched records
mdf



Unnamed: 0,state_df1,hh_df1,df2_state,state_df2,hh_df2
0,arunachal pradesh,1189638.0,12 - arunachal pradesh,12 - arunachal pradesh,260217
1,assam,7114232.0,18 - assam,18 - assam,6427614
2,bihar,54197114.0,10 - bihar,10 - bihar,20074242
3,chhattisgarh,8536248.0,22 - chhattisgarh,22 - chhattisgarh,5714798
4,gujarat,20474063.0,24 - gujarat,24 - gujarat,11629409
5,haryana,7928621.0,06 - haryana,06 - haryana,4630959
6,kerala,17805905.0,32 - kerala,32 - kerala,7698556
7,madhya pradesh,33788308.0,23 - madhya pradesh,23 - madhya pradesh,14723864
8,maharashtra,41188662.0,27 - maharashtra,27 - maharashtra,22962600
9,mizoram,120572.0,15 - mizoram,15 - mizoram,226147


In [7]:
mdf['diff(Million)'] = (mdf.hh_df1 - mdf.hh_df2)/1e6
mdf

Unnamed: 0,state_df1,hh_df1,df2_state,state_df2,hh_df2,diff(Million)
0,arunachal pradesh,1189638.0,12 - arunachal pradesh,12 - arunachal pradesh,260217,0.929421
1,assam,7114232.0,18 - assam,18 - assam,6427614,0.686618
2,bihar,54197114.0,10 - bihar,10 - bihar,20074242,34.122872
3,chhattisgarh,8536248.0,22 - chhattisgarh,22 - chhattisgarh,5714798,2.82145
4,gujarat,20474063.0,24 - gujarat,24 - gujarat,11629409,8.844654
5,haryana,7928621.0,06 - haryana,06 - haryana,4630959,3.297662
6,kerala,17805905.0,32 - kerala,32 - kerala,7698556,10.107349
7,madhya pradesh,33788308.0,23 - madhya pradesh,23 - madhya pradesh,14723864,19.064444
8,maharashtra,41188662.0,27 - maharashtra,27 - maharashtra,22962600,18.226062
9,mizoram,120572.0,15 - mizoram,15 - mizoram,226147,-0.105575


In [8]:
mdf[['hh_df1', 'hh_df2']].sum()

hh_df1    420613593.0
hh_df2    244921406.0
dtype: float64

In [10]:
mdf['times'] = mdf.hh_df1 / mdf.hh_df2
mdf

Unnamed: 0,state_df1,hh_df1,df2_state,state_df2,hh_df2,diff(Million),times
0,arunachal pradesh,1189638.0,12 - arunachal pradesh,12 - arunachal pradesh,260217,0.929421,4.571715
1,assam,7114232.0,18 - assam,18 - assam,6427614,0.686618,1.106823
2,bihar,54197114.0,10 - bihar,10 - bihar,20074242,34.122872,2.699834
3,chhattisgarh,8536248.0,22 - chhattisgarh,22 - chhattisgarh,5714798,2.82145,1.493709
4,gujarat,20474063.0,24 - gujarat,24 - gujarat,11629409,8.844654,1.760542
5,haryana,7928621.0,06 - haryana,06 - haryana,4630959,3.297662,1.712091
6,kerala,17805905.0,32 - kerala,32 - kerala,7698556,10.107349,2.312889
7,madhya pradesh,33788308.0,23 - madhya pradesh,23 - madhya pradesh,14723864,19.064444,2.294799
8,maharashtra,41188662.0,27 - maharashtra,27 - maharashtra,22962600,18.226062,1.793728
9,mizoram,120572.0,15 - mizoram,15 - mizoram,226147,-0.105575,0.533158
