In [140]:
# import necessary libraries
import pandas as pd
import numpy as np
% matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup  
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True)
import plotly.graph_objs as go
from plotly import tools

## DATA GATHERING

In [141]:
# Source page for every table scraped in this notebook.
url = 'https://en.wikipedia.org/wiki/Demographics_of_India'
# A timeout stops the notebook from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page as if it were the article.
response.raise_for_status()
response

<Response [200]>

In [142]:
# Parse the downloaded page bytes with the lxml parser.
soup = BeautifulSoup(response.content, features='lxml')

In [143]:
# Collect every <table> matched by class 'wikitable sortable'; all_tables is indexed from 0.

all_tables = soup.find_all('table', class_ = 'wikitable sortable')
print(f"Number of tables : {len(all_tables)}")

Number of tables : 18


In [144]:
# Inspect the raw HTML of the first matched table.
all_tables[0]

<table class="wikitable sortable" style="text-align:right">
<tbody><tr>
<th>Years</th>
<th>1880</th>
<th>1881</th>
<th>1882</th>
<th>1883</th>
<th>1884</th>
<th>1885</th>
<th>1886</th>
<th>1887</th>
<th>1888</th>
<th>1889</th>
<th>1890</th>
<th>1902<sup class="reference" id="cite_ref-ourworldindata.org_33-1"><a href="#cite_note-ourworldindata.org-33">[33]</a></sup>
</th></tr>
<tr>
<td align="left">Total Fertility Rate in India</td>
<td style="text-align:right; color:blue;">5.95</td>
<td style="text-align:right; color:blue;">5.92</td>
<td style="text-align:right; color:blue;">5.89</td>
<td style="text-align:right; color:blue;">5.86</td>
<td style="text-align:right; color:blue;">5.82</td>
<td style="text-align:right; color:blue;">5.79</td>
<td style="text-align:right; color:blue;">4.38</td>
<td style="text-align:right; color:blue;">5.76</td>
<td style="text-align:right; color:blue;">5.76</td>
<td style="text-align:right; color:blue;">5.75</td>
<td style="text-align:right; color:blue;">5.

#### FUNCTION -
*get_data* returns a list containing only the stripped text of each element it is given.


In [145]:
def get_data(all_data):
    """Return the whitespace-stripped ``.text`` of every element in ``all_data``, in order."""
    return [element.text.strip() for element in all_data]

In [146]:
# Scratch check of stride slicing: every third character starting at index 0.
# get_dataframe uses the same [i::num] pattern to split a flat cell list into columns.
'121121341'[0::3]

'113'

#### FUNCTION - 
*get_dataframe* function collects soup for **individual table.**
It gathers data under `td` then creates a list for each column in respective table.

To create a dataframe we need a *dictionary* structure, so the function builds a dictionary *df_dict* in which key `i` holds the list of values for column `i`.

`sample` : one table taken from `all_tables`.

`num`    : the number of columns the table has, found by inspecting the page.

In [147]:
def get_dataframe(sample,num):
    """Split one table's ``td`` cells into ``num`` columns and collect its header texts.

    The ``td`` elements of ``sample`` are read as one flat list; column ``i`` is
    every ``num``-th cell starting at position ``i``.

    Parameters
    ----------
    sample : object exposing ``find_all`` (a table taken from ``all_tables``)
    num : int
        Number of columns to split the cells into.

    Returns
    -------
    (dict, list)
        ``{column_index: [cell text, ...]}`` and the list of header texts
        returned by ``collect_col_names(sample)``.
    """
    cells = sample.find_all('td')
    df_dict = {position: get_data(cells[position::num]) for position in range(num)}
    return df_dict, collect_col_names(sample)

In [148]:
# collect_col_names collects the column names of a table
def collect_col_names(sample):
    """Return the stripped text of every ``th`` element found in ``sample``."""
    return [header.text.strip() for header in sample.find_all('th')]

In [149]:
# Extract each table of interest. Every call passes one entry of all_tables and the
# number of columns its <td> cells should be split into, and returns
# (column-index -> cell texts, header texts).
# NOTE(review): the table indices and column counts are hard-coded positions, so they
# depend on the page layout at scrape time — re-check them if the page changes.
pop_under_british_dict,pop_under_british_cols = get_dataframe(all_tables[2],3)

pop_per_decade_dict,pop_per_decade_cols = get_dataframe(all_tables[3],3)

pop_dist_by_states_dict,pop_dist_by_states_cols = get_dataframe(all_tables[4],12)

pop_bet_age_0_6_dict,pop_bet_age_0_6_cols = get_dataframe(all_tables[7],6)

pop_abv_7_dict,pop_abv_7_cols = get_dataframe(all_tables[8],5)

literacy_rate_dict,literacy_rate_cols = get_dataframe(all_tables[9],5)

native_speakers_dict,native_speakers_cols = get_dataframe(all_tables[10],4)

un_stat_dict,un_stat_cols = get_dataframe(all_tables[11],9)

census_dict,census_cols = get_dataframe(all_tables[12],9)

pop_struc_dict,pop_struc_cols = get_dataframe(all_tables[13],5)

pop_struc_2016_dict,pop_struc_2016_cols = get_dataframe(all_tables[14],4)

fertility_rate_dict,fertility_rate_cols = get_dataframe(all_tables[15],7)

crude_birth_rate_dict,crude_birth_rate_cols = get_dataframe(all_tables[16],7)

regional_stats_dict,regional_stats_cols = get_dataframe(all_tables[17],13)

In [150]:
# Column-index -> list of cell texts, as returned by get_dataframe for this table.
pop_under_british_dict

{0: ['1871[35]',
  '1881[36]',
  '1891[35]',
  '1901[35]',
  '1911[37]',
  '1921[37]',
  '1931[37]',
  '1941[37]'],
 1: ['238,830,958',
  '253,896,330',
  '287,223,431',
  '293,550,310',
  '315,156,396',
  '318,942,480',
  '352,837,778',
  '388,997,955'],
 2: ['–', '6.3', '13.1', '2.2', '7.4', '1.2', '10.6', '10.2']}

In [151]:
# Header texts collected from the same table's <th> cells.
pop_under_british_cols

['Census year', 'Population', 'Growth (%)']

In [152]:
# <td> cells of the first 'navbox' table, skipping the very first cell.
lc = soup.find_all('table', class_ = 'navbox')[0].find_all('td')[1:]
largest_cities = [cell.text.strip() for cell in lc]

# Remove the single cell whose text is 'BangaloreHyderabad' before slicing
# (presumably a merged cell that would break the stride of 4 — verify on the page).
index = largest_cities.index('BangaloreHyderabad')
del largest_cities[index]

# Positions 1, 2 and 3 within every run of four cells become the three columns.
df_dict = {offset: largest_cities[offset::4] for offset in range(1, 4)}

largest_cities_df = pd.DataFrame(df_dict)
largest_cities_df.columns = ['city','state','population']
largest_cities_df.head()

Unnamed: 0,city,state,population
0,Mumbai,Maharashtra,12478447
1,Kanpur,Uttar Pradesh,2920067
2,Delhi,Delhi,11007835
3,Lucknow,Uttar Pradesh,2901474
4,Bangalore,Karnataka,8425970


In [153]:
# The table at index 18 among all 'wikitable' tables supplies the life-expectancy cells.
life_expec = soup.find_all('table', class_='wikitable')[18]

# Alternate <td> cells: even positions form column 0, odd positions form column 1.
life_expectaion = [life_expec.find_all('td')[start::2] for start in range(2)]

# Keep only the stripped text of each cell, keyed by column position.
life_dict = {position: get_data(cells) for position, cells in enumerate(life_expectaion)}

life_expe_df = pd.DataFrame(life_dict)
life_expe_df.columns = ['period','life_expec_in_years']
life_expe_df.head()

Unnamed: 0,period,life_expec_in_years
0,1950–1955,36.6
1,1985–1990,56.7
2,1955–1960,39.7
3,1990–1995,59.1
4,1960–1965,42.7


## DATA CLEANING

The *create_dataframe* function returns a dataframe built from a dictionary and a list of column names.

In [154]:
def create_dataframe(df,cols):
    """Build a DataFrame from ``df`` and label its columns with ``cols``.

    ``df`` is anything ``pd.DataFrame`` accepts (here: a dict of equal-length lists);
    ``cols`` must contain one name per resulting column.
    """
    frame = pd.DataFrame(df)
    frame.columns = cols
    return frame

### Dataframe has population and growth % under british raj.

In [155]:
# Combine the scraped columns with their header names; .shape is (rows, columns).
pop_under_british_df = create_dataframe(pop_under_british_dict,pop_under_british_cols)
print(f'Number of observations:{pop_under_british_df.shape}')
pop_under_british_df.head()

Number of obseration:(8, 3)


Unnamed: 0,Census year,Population,Growth (%)
0,1871[35],238830958,–
1,1881[36],253896330,6.3
2,1891[35],287223431,13.1
3,1901[35],293550310,2.2
4,1911[37],315156396,7.4


### Dataframe has information about population growth per decade.

In [156]:
# Combine the scraped columns with their header names; .shape is (rows, columns).
pop_per_decade_df = create_dataframe(pop_per_decade_dict, pop_per_decade_cols)
print(f'Number of observations:{pop_per_decade_df.shape}')
pop_per_decade_df.head()

Number of observations:(7, 3)


Unnamed: 0,Census year,Population,Change (%)
0,1951,361088000,–
1,1961,439235000,21.6
2,1971,548160000,24.8
3,1981,683329000,24.7
4,1991,846387888,23.9


### Dataframe has information about population distribution by states.

In [157]:
# Show the header texts that remain after dropping the last 12 scraped <th> entries.
# NOTE(review): the dropped entries are presumably <th> cells that are not column titles — verify on the page.
print(pop_dist_by_states_cols[:-12])

['Rank', 'State/UT', 'Population[51]', 'Percent (%)', 'Male', 'Female', 'Difference between male and female', 'Sex Ratio', 'Rural[52]', 'Urban[52]', 'Area[53] (km2)', 'Density (per km2)']


In [158]:
# Only the header texts left after dropping the last 12 scraped <th> entries are used as column names.
pop_dist_by_states_df = create_dataframe(pop_dist_by_states_dict, pop_dist_by_states_cols[:-12])
print(f'Number of observations:{pop_dist_by_states_df.shape}')
pop_dist_by_states_df.head()

Number of observations:(36, 12)


Unnamed: 0,Rank,State/UT,Population[51],Percent (%),Male,Female,Difference between male and female,Sex Ratio,Rural[52],Urban[52],Area[53] (km2),Density (per km2)
0,1,Uttar Pradesh,199812341,16.5,104480510,95331831,9148679,930,155111022,44470455,240928,828
1,2,Maharashtra,112374333,9.28,58243056,54131277,4111779,929,61545441,50827531,307713,365
2,3,Bihar,104099452,8.6,54278157,49821295,4456862,918,92075028,11729609,94163,1102
3,4,West Bengal,91276115,7.54,46809027,44467088,2341939,950,62213676,29134060,88752,1030
4,5,Madhya Pradesh,72626809,6.0,37612306,35014503,2597803,931,52537899,20059666,308245,236


### Dataframe has information about population distribution between age 0 and 6 by states.

In [159]:
# Header texts that remain after dropping the last 6 scraped <th> entries.
pop_bet_age_0_6_cols[:-6]

['State or UT code', 'State or UT', 'Total', 'Male', 'Female', 'Difference']

In [160]:
# Only the header texts left after dropping the last 6 scraped <th> entries are used as column names.
pop_bet_age_0_6_df = create_dataframe(pop_bet_age_0_6_dict, pop_bet_age_0_6_cols[:-6])
print(f'Number of observations:{pop_bet_age_0_6_df.shape}')
pop_bet_age_0_6_df.head()

Number of observations:(35, 6)


Unnamed: 0,State or UT code,State or UT,Total,Male,Female,Difference
0,1,Jammu and Kashmir,2008670,1080662,927982,152680
1,2,Himachal Pradesh,763864,400681,363183,37498
2,3,Punjab,2941570,1593262,1348308,244954
3,4,Chandigarh,117953,63187,54766,8421
4,5,Uttarakhand,1328844,704769,624075,80694


### Dataframe has information about population distribution above age 7 by states.

In [161]:
# Header texts that remain after dropping the last 5 scraped <th> entries.
pop_abv_7_cols[:-5]


['State or UT code', 'State or UT', 'Total', 'Male', 'Female']

In [162]:
# Only the header texts left after dropping the last 5 scraped <th> entries are used as column names.
pop_abv_7_df = create_dataframe(pop_abv_7_dict,pop_abv_7_cols[:-5])
print(f"Number of observations:{pop_abv_7_df.shape}")
pop_abv_7_df.head()

Number of observations:(35, 5)


Unnamed: 0,State or UT code,State or UT,Total,Male,Female
0,1,Jammu and Kashmir,–,–,–
1,2,Himachal Pradesh,–,–,–
2,3,Punjab,–,–,–
3,4,Chandigarh,–,–,–
4,5,Uttarakhand,–,–,–


### Dataframe has information about literacy rate by states.

In [163]:
# Header texts that remain after dropping the last 5 scraped <th> entries.
literacy_rate_cols[:-5]

['State or UT code', 'State or UT', 'Overall (%)', 'Male (%)', 'Female (%)']

In [164]:
# Only the header texts left after dropping the last 5 scraped <th> entries are used as column names.
literacy_rate_df = create_dataframe(literacy_rate_dict,literacy_rate_cols[:-5])
print(f"Number of observations:{literacy_rate_df.shape}")
literacy_rate_df.head()

Number of observations:(35, 5)


Unnamed: 0,State or UT code,State or UT,Overall (%),Male (%),Female (%)
0,1,Jammu and Kashmir,86.61,87.26,86.23
1,2,Himachal Pradesh,83.78,90.83,76.6
2,3,Punjab,76.6,81.48,71.34
3,4,Chandigarh,86.43,90.54,81.38
4,5,Uttarakhand,79.63,88.33,70.7


### Dataframe has information about languages of india by number of native speakers at the 2001 census.

In [165]:
# Combine the scraped columns with their header names; .shape is (rows, columns).
native_speakers_df  = create_dataframe(native_speakers_dict,native_speakers_cols)
print(f"Number of observations:{native_speakers_df.shape}")
native_speakers_df.head()

Number of observations:(29, 4)


Unnamed: 0,Rank,Language,Speakers,Percentage (%)
0,1,Hindi[74],422048642,41.03
1,2,Bengali,83369769,8.11
2,3,Telugu,74002856,7.19
3,4,Marathi,71936894,6.99
4,5,Tamil,60793814,5.91


### Dataframe has information about United Nations, World Population Prospects: The 2015 revision – India.

In [166]:
# Column 0 comes out longer than the other columns after the stride-based split
# (presumably a stray trailing <td> — verify on the page). Trim it to the length of the
# shortest column. Unlike `del ...[-1]`, this is a no-op when the cell is re-run, so a
# second execution cannot delete a real row.
n_rows = min(len(column) for column in un_stat_dict.values())
del un_stat_dict[0][n_rows:]
un_stat_df = create_dataframe(un_stat_dict,un_stat_cols)
print(f"Number of observations:{un_stat_df.shape}")
un_stat_df.head()

Number of observations:(13, 9)


Unnamed: 0,Period,Births per year,Deaths per year,Natural change per year,CBR1,CDR1,NC1,TFR1,IMR1
0,1950–1955,16832000,9928000,6904000,43.3,25.5,17.7,5.9,165.0
1,1955–1960,17981000,9686000,8295000,42.1,22.7,19.4,5.9,153.1
2,1960–1965,19086000,9358000,9728000,40.4,19.8,20.6,5.82,140.1
3,1965–1970,20611000,9057000,11554000,39.2,17.2,22.0,5.69,128.5
4,1970–1975,22022000,8821000,13201000,37.5,15.0,22.5,5.26,118.0


### Dataframe has information about Census of India.

In [167]:
# Column 0 comes out longer than the other columns after the stride-based split
# (presumably a stray trailing <td> — verify on the page). Trim it to the length of the
# shortest column. Unlike `del ...[-1]`, this is a no-op when the cell is re-run, so a
# second execution cannot delete a real row.
n_rows = min(len(column) for column in census_dict.values())
del census_dict[0][n_rows:]
census_df = create_dataframe(census_dict,census_cols)
print(f"Number of observations:{census_df.shape}")
census_df.head()

Number of observations:(36, 9)


Unnamed: 0,Year,Average population(x 1000),Live births1,Deaths1,Natural change,Crude birth rate(per 1000),Crude death rate(per 1000),Natural change(per 1000),Total fertility rate
0,1981,716493,24289000,8956000,15333000,33.9,12.5,21.4,–
1,1982,733152,24781000,8725000,16056000,33.8,11.9,21.9,–
2,1983,750034,25276000,8925000,16351000,33.7,11.9,21.8,–
3,1984,767147,26006000,9666000,16340000,33.9,12.6,21.3,–
4,1985,784491,25810000,9257000,16553000,32.9,11.8,21.1,–


### Dataframe has structure of the population (09.02.2011) (Census) (Includes data for the Indian-administered part of Jammu and Kashmir).

In [168]:
# Only the header texts left after dropping the last 5 scraped <th> entries are used as column names.
pop_struc_df = create_dataframe(pop_struc_dict,pop_struc_cols[:-5])
print(f"Number of observations:{pop_struc_df.shape}")
pop_struc_df.head()

Number of observations:(22, 5)


Unnamed: 0,Age group,Male,Female,Total,Percentage (%)
0,0–4,58632074,54174704,112806778,9.32
1,5–9,66300466,60627660,126928126,10.48
2,10–14,69418835,63290377,132709212,10.96
3,15–19,63982396,56544053,120526449,9.95
4,20–24,57584693,53839529,111424222,9.2


### Dataframe has population pyramid 2016 (estimates).

In [169]:
# Combine the scraped columns with their header names; .shape is (rows, columns).
pop_struc_2016_df = create_dataframe(pop_struc_2016_dict,pop_struc_2016_cols )
print(f"Number of observations:{pop_struc_2016_df.shape}")
pop_struc_2016_df.head()

Number of observations:(21, 4)


Unnamed: 0,Age group,Male,Female,Total
0,0–4,8.7,8.2,8.5
1,5–9,9.1,8.8,8.9
2,10–14,9.8,9.4,9.6
3,15–19,10.4,9.9,10.1
4,20–24,10.2,10.7,10.4


In [170]:
# Columns 0 and 1 come out longer than the other columns after the stride-based split
# (presumably stray trailing <td> cells — verify on the page). Trim both to the length of
# the shortest column. Unlike `del ...[-1]`, this is a no-op when the cell is re-run, so
# a second execution cannot delete a real row.
n_rows = min(len(column) for column in crude_birth_rate_dict.values())
del crude_birth_rate_dict[0][n_rows:]
del crude_birth_rate_dict[1][n_rows:]
crude_birth_rate_df = create_dataframe(crude_birth_rate_dict,crude_birth_rate_cols)
print(f"Number of observations:{crude_birth_rate_df.shape}")
crude_birth_rate_df.head()

Number of observations:(29, 7)


Unnamed: 0,State (Population 2011),CBR – Total,TFR – Total1,CBR – Urban,TFR – Urban1,CBR – Rural,TFR – Rural1
0,Uttar Pradesh (199 812 341),22.6,2.74 (2.06),18.6,2.08 (1.62),24.0,2.99 (2.22)
1,Maharashtra (112 374 333),16.6,1.87 (1.57),15.5,1.68 (1.41),17.5,2.06 (1.73)
2,Bihar (104 099 452),27.1,3.41 (2.48),20.4,2.42 (1.83),28.0,3.56 (2.58)
3,West Bengal (91 276 115),16.6,1.77 (1.53),14.0,1.57 (1.38),18.0,1.85 (1.58)
4,Madhya Pradesh (72 626 809),20.2,2.32 (1.82),17.7,1.95 (1.61),21.3,2.48 (1.91)


In [171]:
# Column 0 comes out longer than the other columns after the stride-based split
# (presumably a stray trailing <td> — verify on the page). Trim it to the length of the
# shortest column. Unlike `del ...[-1]`, this is a no-op when the cell is re-run, so a
# second execution cannot delete a real row.
n_rows = min(len(column) for column in fertility_rate_dict.values())
del fertility_rate_dict[0][n_rows:]
fertility_rate_df = create_dataframe(fertility_rate_dict,fertility_rate_cols)
print(f"Number of observations:{fertility_rate_df.shape}")
fertility_rate_df.head()

Number of observations:(4, 7)


Unnamed: 0,Year,CBR – Total,TFR – Total1,CBR – Urban,TFR – Urban1,CBR – Rural,TFR – Rural1
0,1992–1993,28.7,3.39 (2.64),24.1,2.70 (2.09),30.4,3.67 (2.86)
1,1998–1999,24.8,2.85 (2.13),20.9,2.27 (1.73),26.2,3.07 (2.28)
2,2005–2006,23.1,2.68 (1.90),18.8,2.06 (1.60),25.0,2.98 (2.10)
3,2015–2016,19.0,2.18 (1.8),15.8,1.75 (1.5),20.7,2.41 (1.9)


### Birth rate, death rate, natural growth rate, and infant mortality rate, by state or UT(2010).

In [172]:
# Column names are assigned by hand here; the scraped regional_stats_cols list is not used.
# NOTE(review): 'moratatily' is a misspelling of 'mortality'. Renaming the columns would
# require updating every later cell that refers to them, so they are left as-is.
regional_stats_df = pd.DataFrame(regional_stats_dict)
regional_stats_df.columns = ['state','birth_rate_total','birth_rate_rural','birth_rate_urban','death_rate_total','death_rate_rural','death_rate_urban','natural_growth_rate_total','natural_growth_rate_rural','natural_growth_rate_urban','infant_moratatily_rate_total','infant_moratatily_rate_rural','infant_moratatily_rate_urban']
print(f"Number of observations:{regional_stats_df.shape}")
regional_stats_df.head()

Number of observations:(35, 13)


Unnamed: 0,state,birth_rate_total,birth_rate_rural,birth_rate_urban,death_rate_total,death_rate_rural,death_rate_urban,natural_growth_rate_total,natural_growth_rate_rural,natural_growth_rate_urban,infant_moratatily_rate_total,infant_moratatily_rate_rural,infant_moratatily_rate_urban
0,Andaman and Nicobar Islands,15.6,15.5,15.8,4.3,4.8,3.3,11.3,10.7,12.6,25,29,18
1,Andhra Pradesh,17.9,18.3,16.7,7.6,8.6,5.4,10.2,9.7,11.3,46,51,33
2,Arunachal Pradesh,20.5,22.1,14.6,5.9,6.9,2.3,14.6,15.2,12.3,31,34,12
3,Assam,23.2,24.4,15.8,8.2,8.6,5.8,14.9,15.8,10.1,58,60,36
4,Bihar,28.1,28.8,22.0,6.8,7.0,5.6,21.3,21.8,16.4,48,49,38


- Numbers in some columns contain commas as thousands separators, e.g. the Population column in pop_under_british_df
- Missing data is represented by '–' (an en dash)
- Multiple pieces of information are stored in a single column
- The total fertility rate column has '~' before the number
- The dtype of certain columns should be int or float instead of string (object)

Copy each dataframe into a new one, so that the cleaning steps do not modify the originals.

In [173]:
# Work on copies (suffix _clean) so the frames built from the scrape stay untouched
# and can be compared with the cleaned versions later.
pop_under_british_df_clean = pop_under_british_df.copy()
regional_stats_df_clean = regional_stats_df.copy()
fertility_rate_df_clean = fertility_rate_df.copy()
crude_birth_rate_df_clean = crude_birth_rate_df.copy()
pop_struc_2016_df_clean = pop_struc_2016_df.copy()
pop_struc_df_clean =  pop_struc_df.copy()
census_df_clean =  census_df.copy()
un_stat_df_clean =  un_stat_df.copy()
native_speakers_df_clean =  native_speakers_df.copy()
literacy_rate_df_clean =  literacy_rate_df.copy()
pop_abv_7_df_clean =  pop_abv_7_df.copy()
pop_bet_age_0_6_df_clean =  pop_bet_age_0_6_df.copy()
pop_dist_by_states_df_clean =  pop_dist_by_states_df.copy()
pop_per_decade_df_clean = pop_per_decade_df.copy()
life_expe_df_clean = life_expe_df.copy()
largest_cities_df_clean = largest_cities_df.copy()

- Get the rows containing '~' and remove the '~' character from them.

In [176]:
# Remove the '~' marker from the 'Total fertility rate' entries that carry one.
# regex=False makes both calls plain substring operations, so the result does not depend
# on the pandas version's default for treating the pattern as a regular expression.
mask = census_df_clean['Total fertility rate'].str.contains('~', regex=False)
census_df_clean.loc[mask, 'Total fertility rate'] = (
    census_df_clean.loc[mask, 'Total fertility rate'].str.replace('~', '', regex=False)
)

In [177]:
# Check the last rows of the column after the '~' removal.
census_df_clean['Total fertility rate'].tail()

31     2.4
32     2.3
33     2.3
34     2.2
35     2.2
Name: Total fertility rate, dtype: object

- Get the rows whose value is '–' (an en dash) and replace it with NaN

In [243]:
# Frames in which a cell equal to the en dash '–' is turned into NaN.
df_missing_list1 = [
    pop_abv_7_df_clean,
    census_df_clean,
    pop_under_british_df_clean,
    pop_per_decade_df_clean,
    un_stat_df_clean,
]

# Column name -> error message for any column whose replacement raised.
errors_missing1 = {}
for df in df_missing_list1:
    for col in df.columns:
        try:
            # Boolean row mask of cells that are exactly the en dash.
            masks = df.loc[:, col].eq('–')
            df.loc[masks, col] = np.nan
        except Exception as e:
            errors_missing1[col] = str(e)

In [244]:
# An empty dict means no column raised during the replacement above.
errors_missing1

{}

In [245]:
# Spot-check one of the processed frames after the '–' -> NaN replacement.
pop_under_british_df_clean.head()

Unnamed: 0,Census year,Population,Growth (%)
0,1871[35],238830958,
1,1881[36],253896330,6.3
2,1891[35],287223431,13.1
3,1901[35],293550310,2.2
4,1911[37],315156396,7.4


#### Data Type

##### A few columns hold integer values but are stored as object (string) dtype, and their numbers contain commas.

- Create a dictionary that maps an assigned key to each dataframe.
- Loop over the dictionary; for each dataframe, loop over its column names and check whether the respective column contains a comma. If it does, remove the commas and change the dtype.
- If an error occurs, collect it in another dictionary and handle it separately.

In [249]:
# Every cleaned frame keyed by its position (0-15) in this list; the integer key is
# what the conversion loop uses to report which frame a failing column belongs to.
df_dict = dict(enumerate([
    pop_under_british_df_clean,
    regional_stats_df_clean,
    fertility_rate_df_clean,
    crude_birth_rate_df_clean,
    pop_struc_2016_df_clean,
    pop_struc_df_clean,
    census_df_clean,
    un_stat_df_clean,
    native_speakers_df_clean,
    literacy_rate_df_clean,
    pop_abv_7_df_clean,
    pop_bet_age_0_6_df_clean,
    pop_dist_by_states_df_clean,
    pop_per_decade_df_clean,
    life_expe_df_clean,
    largest_cities_df_clean,
]))

In [250]:
# For every column that contains a comma, strip the commas and cast the column to int.
# errors maps a df_dict key to a column whose cast failed.
# NOTE(review): each key stores a single column name, so when several columns of the
# same frame fail only the last one is recorded.
errors = {}
for key, df in df_dict.items():
    for col in df.columns:
        # Columns that are already numeric (e.g. converted by an earlier run of this cell)
        # have no .str accessor; skipping them keeps the cell safe to re-run.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        if df[col].str.contains(',').any():
            try:
                df[col] = df[col].str.replace(',', '').astype(int)
            except Exception:
                # The cast fails when a value is still not a plain integer after the
                # commas are removed; record the column so it can be handled separately.
                errors[key] = col

In [254]:
# df_dict key -> column that could not be cast to int by the loop above.
print(errors)

{6: 'Natural change', 7: 'Natural change per year', 10: 'Female', 15: 'population'}


#### TEST

In [257]:
# Compare column dtypes of the untouched frame with those of its cleaned copy.
print(f"Before Cleaning\n{pop_under_british_df.dtypes}")
print(f"After Cleaning\n{pop_under_british_df_clean.dtypes}")

Before Cleaning
Census year    object
Population     object
Growth (%)     object
dtype: object
After Cleaning
Census year    object
Population      int32
Growth (%)     object
dtype: object


Population columns have now changed to integer type.

## Exploratory Analysis


In [258]:
def att_line_plot(X,Y,T,XA,YA,seed=None):
    """Build a plotly figure dict that draws ``Y`` against ``X`` as a line with sized markers.

    Parameters
    ----------
    X : sequence
        Values for the x axis.
    Y : pandas.Series or numpy.ndarray of numbers
        Values for the y axis. Must support ``Y / Y.mean()``: each marker is sized
        relative to the mean, so a point equal to the mean gets size 50.
    T : str
        Figure title.
    XA, YA : str
        Titles of the x and y axes.
    seed : int, optional
        Seed for the random marker colours. ``None`` (the default) draws from NumPy's
        global generator, so colours differ between runs; pass an integer to get the
        same colours every time.

    Returns
    -------
    dict
        ``{'data': [trace], 'layout': layout}``, suitable for ``iplot``.
    """
    # A dedicated RandomState leaves NumPy's global generator untouched when a seed is given.
    if seed is None:
        colors = np.random.randn(len(Y))
    else:
        colors = np.random.RandomState(seed).randn(len(Y))

    trace = go.Scatter(
        x = X,
        y = Y,
        mode = 'lines+markers',
        marker = dict(size = (Y / Y.mean()) * 50,
                      color = colors,
                      opacity = 0.6,
                      line = dict(color = 'rgb(0,0,0)'))
    )

    layout = dict(title = T,
                  xaxis = dict(title = XA),
                  yaxis = dict(title = YA))

    return {'data': [trace], 'layout': layout}

In [259]:
# Census year values look like '1871[35]'; keep only the part before the footnote marker.
pop_under_british_df_clean['Census year'] = pop_under_british_df_clean['Census year'].str.split('[').str[0]

# The y axis is labelled in millions, so scale the raw head counts to match the label.
# Marker sizes do not change: att_line_plot sizes them by Y / Y.mean(), which is scale-free.
FIG = att_line_plot(pop_under_british_df_clean['Census year'],
                    pop_under_british_df_clean['Population'] / 1e6,
                    'Population Growth under British Raj','Year','Population in Millions')
iplot(FIG)

- The population increased by about 63% in 70 years (1871–1941).

In [260]:
# The y axis is labelled in billions, so scale the raw head counts to match the label.
# Marker sizes do not change: att_line_plot sizes them by Y / Y.mean(), which is scale-free.
FIG = att_line_plot(pop_per_decade_df_clean['Census year'],
                    pop_per_decade_df_clean['Population'] / 1e9,
                    'Population Growth after Independence','Year','Population in Billions')
iplot(FIG)

- In the next 60 years the population increased by 235%.
- From 1970 to 2010 the population almost doubled, to 1.2 billion.
- The decade population growth rate was 1.77% (2001 to 2010).