In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
% matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup  
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True)
import plotly.graph_objs as go
from plotly import tools

## DATA GATHERING

In [2]:
url = 'https://en.wikipedia.org/wiki/Demographics_of_India'
response = requests.get(url)
response

<Response [200]>

In [3]:
#create beautifulsoup
soup = BeautifulSoup(response.content, 'lxml')

In [14]:
# get all tables data - id starts from 0.

all_tables = soup.find_all('table', class_ = 'wikitable sortable')
print(f"Number of tables : {len(all_tables)}")

Number of tables : 18


In [15]:
all_tables[0]

<table class="wikitable sortable" style="text-align:right">
<tbody><tr>
<th>Years</th>
<th>1880</th>
<th>1881</th>
<th>1882</th>
<th>1883</th>
<th>1884</th>
<th>1885</th>
<th>1886</th>
<th>1887</th>
<th>1888</th>
<th>1889</th>
<th>1890</th>
<th>1902<sup class="reference" id="cite_ref-ourworldindata.org_33-1"><a href="#cite_note-ourworldindata.org-33">[33]</a></sup>
</th></tr>
<tr>
<td align="left">Total Fertility Rate in India</td>
<td style="text-align:right; color:blue;">5.95</td>
<td style="text-align:right; color:blue;">5.92</td>
<td style="text-align:right; color:blue;">5.89</td>
<td style="text-align:right; color:blue;">5.86</td>
<td style="text-align:right; color:blue;">5.82</td>
<td style="text-align:right; color:blue;">5.79</td>
<td style="text-align:right; color:blue;">4.38</td>
<td style="text-align:right; color:blue;">5.76</td>
<td style="text-align:right; color:blue;">5.76</td>
<td style="text-align:right; color:blue;">5.75</td>
<td style="text-align:right; color:blue;">5.

#### FUNCTION -
*get_data* function returns a list with only text.


In [8]:
def get_data(all_data):
    df_list = []
    for data in all_data:
        df_list.append(data.text.strip())
    return df_list

In [49]:
'121121341'[0::3]

'113'

#### FUNCTION - 
*get_dataframe* function collects soup for **individual table.**
It gathers data under `td` then creates a list for each column in respective table.

To create dataframe we need *dictionary* structure, therefore create a dictionary *df_dict* which takes i in every column.

`sample` : it contains the table number returned by all_tables.

`num`    : it contains number of columns the table has, inspecting the page.

In [52]:
def get_dataframe(sample,num):
    all_data = sample.find_all('td')
    df_list = []
    for i in range(num):
        df_list.append(all_data[i::num])
        
    df_dict = {i:[] for i in range(num)}
    for i in range(len(df_dict)):
        data = get_data(df_list[i])
        df_dict[i] = data
    title = collect_col_names(sample)
    return df_dict,title

In [53]:
# collect_col_names function collectes column name of each table
def collect_col_names(sample):
    all_data = sample.find_all('th')
    col_title=  [] 
    for header in all_data:
        col_title.append(header.text.strip())
    return col_title

In [54]:
pop_under_british_dict,pop_under_british_cols = get_dataframe(all_tables[2],3)

pop_per_decade_dict,pop_per_decade_cols = get_dataframe(all_tables[3],3)

pop_dist_by_states_dict,pop_dist_by_states_cols = get_dataframe(all_tables[4],12)

pop_bet_age_0_6_dict,pop_bet_age_0_6_cols = get_dataframe(all_tables[7],6)

pop_abv_7_dict,pop_abv_7_cols = get_dataframe(all_tables[8],5)

literacy_rate_dict,literacy_rate_cols = get_dataframe(all_tables[9],5)

native_speakers_dict,native_speakers_cols = get_dataframe(all_tables[10],4)

un_stat_dict,un_stat_cols = get_dataframe(all_tables[11],9)

census_dict,census_cols = get_dataframe(all_tables[12],9)

pop_struc_dict,pop_struc_cols = get_dataframe(all_tables[13],5)

pop_struc_2016_dict,pop_struc_2016_cols = get_dataframe(all_tables[14],4)

fertility_rate_dict,fertility_rate_cols = get_dataframe(all_tables[15],7)

crude_birth_rate_dict,crude_birth_rate_cols = get_dataframe(all_tables[16],7)

regional_stats_dict,regional_stats_cols = get_dataframe(all_tables[17],13)

In [57]:
pop_under_british_dict

{0: ['1871[35]',
  '1881[36]',
  '1891[35]',
  '1901[35]',
  '1911[37]',
  '1921[37]',
  '1931[37]',
  '1941[37]'],
 1: ['238,830,958',
  '253,896,330',
  '287,223,431',
  '293,550,310',
  '315,156,396',
  '318,942,480',
  '352,837,778',
  '388,997,955'],
 2: ['–', '6.3', '13.1', '2.2', '7.4', '1.2', '10.6', '10.2']}

In [58]:
pop_under_british_cols

['Census year', 'Population', 'Growth (%)']

In [62]:
lc = soup.find_all('table', class_ = 'navbox')[0].find_all('td')[1:]
largest_cities = []
for i in lc:
    largest_cities.append(i.text.strip())

index = largest_cities.index('BangaloreHyderabad')

del largest_cities[index]

df_dict = {}
for i in range(1,4):
    df_dict[i] = largest_cities[i::4]

    
largest_cities_df = pd.DataFrame(df_dict)
largest_cities_df.columns = ['city','state','population']
largest_cities_df.head()

Unnamed: 0,city,state,population
0,Mumbai,Maharashtra,12478447
1,Kanpur,Uttar Pradesh,2920067
2,Delhi,Delhi,11007835
3,Lucknow,Uttar Pradesh,2901474
4,Bangalore,Karnataka,8425970


In [63]:
life_expec = soup.find_all('table',class_ = 'wikitable')[18]
life_dict = {i:[] for i in range(2)}
life_dict

life_expectaion = []
for i in range(0,2):
    life_expectaion.append(life_expec.find_all('td')[i::2])
    
for i in range(len(life_dict)):
    data = get_data(life_expectaion[i])
    life_dict[i] = data
    
life_expe_df = pd.DataFrame(life_dict)
life_expe_df.columns = ['period','life_expec_in_years']
life_expe_df.head()

Unnamed: 0,period,life_expec_in_years
0,1950–1955,36.6
1,1985–1990,56.7
2,1955–1960,39.7
3,1990–1995,59.1
4,1960–1965,42.7


## DATA CLEANING

*create_dataframe* function return a dataframe by a taking dictionary and column names. 

In [65]:
def create_dataframe(df,cols):
    df=pd.DataFrame(df)
    df.columns = cols
    return df

### Dataframe has population and growth % under british raj.

In [68]:
pop_under_british_df = create_dataframe(pop_under_british_dict,pop_under_british_cols)
print(f'Number of obseration:{pop_under_british_df.shape}')
pop_under_british_df.head()

Number of obseration:(8, 3)


Unnamed: 0,Census year,Population,Growth (%)
0,1871[35],238830958,–
1,1881[36],253896330,6.3
2,1891[35],287223431,13.1
3,1901[35],293550310,2.2
4,1911[37],315156396,7.4


### Dataframe has information about population growth per decade.

In [73]:
pop_per_decade_df = create_dataframe(pop_per_decade_dict, pop_per_decade_cols)
print(f'Number of observations:{pop_per_decade_df.shape}')
pop_per_decade_df.head()

Number of observations:(7, 3)


Unnamed: 0,Census year,Population,Change (%)
0,1951,361088000,–
1,1961,439235000,21.6
2,1971,548160000,24.8
3,1981,683329000,24.7
4,1991,846387888,23.9


### Dataframe has information about population distribution by states.

In [74]:
print(pop_dist_by_states_cols[:-12])

['Rank', 'State/UT', 'Population[51]', 'Percent (%)', 'Male', 'Female', 'Difference between male and female', 'Sex Ratio', 'Rural[52]', 'Urban[52]', 'Area[53] (km2)', 'Density (per km2)']


In [76]:
pop_dist_by_states_df = create_dataframe(pop_dist_by_states_dict, pop_dist_by_states_cols[:-12])
print(f'Number of observations:{pop_dist_by_states_df.shape}')
pop_dist_by_states_df.head()

Number of observations:(36, 12)


Unnamed: 0,Rank,State/UT,Population[51],Percent (%),Male,Female,Difference between male and female,Sex Ratio,Rural[52],Urban[52],Area[53] (km2),Density (per km2)
0,1,Uttar Pradesh,199812341,16.5,104480510,95331831,9148679,930,155111022,44470455,240928,828
1,2,Maharashtra,112374333,9.28,58243056,54131277,4111779,929,61545441,50827531,307713,365
2,3,Bihar,104099452,8.6,54278157,49821295,4456862,918,92075028,11729609,94163,1102
3,4,West Bengal,91276115,7.54,46809027,44467088,2341939,950,62213676,29134060,88752,1030
4,5,Madhya Pradesh,72626809,6.0,37612306,35014503,2597803,931,52537899,20059666,308245,236


### Dataframe has information about population distribution between age 0 and 6 by states.

In [87]:
pop_bet_age_0_6_cols[:-6]

['State or UT code', 'State or UT', 'Total', 'Male', 'Female', 'Difference']

In [89]:
pop_bet_age_0_6_df = create_dataframe(pop_bet_age_0_6_dict, pop_bet_age_0_6_cols[:-6])
print(f'Number of observations:{pop_bet_age_0_6_df.shape}')
pop_bet_age_0_6_df.head()

Number of observations:(35, 6)


Unnamed: 0,State or UT code,State or UT,Total,Male,Female,Difference
0,1,Jammu and Kashmir,2008670,1080662,927982,152680
1,2,Himachal Pradesh,763864,400681,363183,37498
2,3,Punjab,2941570,1593262,1348308,244954
3,4,Chandigarh,117953,63187,54766,8421
4,5,Uttarakhand,1328844,704769,624075,80694


### Dataframe has information about population distribution above age 7 by states.

In [90]:
pop_abv_7_cols[:-5]


['State or UT code', 'State or UT', 'Total', 'Male', 'Female']

In [91]:
pop_abv_7_df = create_dataframe(pop_abv_7_dict,pop_abv_7_cols[:-5])
print(f"Number of observations:{pop_abv_7_df.shape}")
pop_abv_7_df.head()

Number of observations:(35, 5)


Unnamed: 0,State or UT code,State or UT,Total,Male,Female
0,1,Jammu and Kashmir,–,–,–
1,2,Himachal Pradesh,–,–,–
2,3,Punjab,–,–,–
3,4,Chandigarh,–,–,–
4,5,Uttarakhand,–,–,–


### Dataframe has information about literacy rate by states.

In [92]:
literacy_rate_cols[:-5]

['State or UT code', 'State or UT', 'Overall (%)', 'Male (%)', 'Female (%)']

In [93]:
literacy_rate_df = create_dataframe(literacy_rate_dict,literacy_rate_cols[:-5])
print(f"Number of observations:{literacy_rate_df.shape}")
literacy_rate_df.head()

Number of observations:(35, 5)


Unnamed: 0,State or UT code,State or UT,Overall (%),Male (%),Female (%)
0,1,Jammu and Kashmir,86.61,87.26,86.23
1,2,Himachal Pradesh,83.78,90.83,76.6
2,3,Punjab,76.6,81.48,71.34
3,4,Chandigarh,86.43,90.54,81.38
4,5,Uttarakhand,79.63,88.33,70.7


### Dataframe has information about languages of india by number of native speakers at the 2001 census.

In [94]:
native_speakers_df  = create_dataframe(native_speakers_dict,native_speakers_cols)
print(f"Number of observations:{native_speakers_df.shape}")
native_speakers_df.head()

Number of observations:(29, 4)


Unnamed: 0,Rank,Language,Speakers,Percentage (%)
0,1,Hindi[74],422048642,41.03
1,2,Bengali,83369769,8.11
2,3,Telugu,74002856,7.19
3,4,Marathi,71936894,6.99
4,5,Tamil,60793814,5.91


### Dataframe has information about United Nations, World Population Prospects: The 2015 revision – India.

In [95]:
del un_stat_dict[0][-1]
un_stat_df = create_dataframe(un_stat_dict,un_stat_cols)
print(f"Number of observations:{un_stat_df.shape}")
un_stat_df.head()


Number of observations:(13, 9)


Unnamed: 0,Period,Births per year,Deaths per year,Natural change per year,CBR1,CDR1,NC1,TFR1,IMR1
0,1950–1955,16832000,9928000,6904000,43.3,25.5,17.7,5.9,165.0
1,1955–1960,17981000,9686000,8295000,42.1,22.7,19.4,5.9,153.1
2,1960–1965,19086000,9358000,9728000,40.4,19.8,20.6,5.82,140.1
3,1965–1970,20611000,9057000,11554000,39.2,17.2,22.0,5.69,128.5
4,1970–1975,22022000,8821000,13201000,37.5,15.0,22.5,5.26,118.0
