In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
% matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup  
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True)
import plotly.graph_objs as go
from plotly import tools

## DATA GATHERING

In [2]:
url = 'https://en.wikipedia.org/wiki/Demographics_of_India'
# requests waits indefinitely by default; a timeout keeps this cell from
# hanging forever if the server is slow or unreachable.
response = requests.get(url, timeout=30)
# Stop here with a clear HTTP error rather than parsing an error page later.
response.raise_for_status()
response

<Response [200]>

In [3]:
# Parse the downloaded page content into a BeautifulSoup tree using the lxml parser.
soup = BeautifulSoup(markup=response.content, features='lxml')

In [14]:
# Collect every <table> matched by the class filter 'wikitable sortable'.
# The result is indexed from 0, so all_tables[0] is the first match.
all_tables = soup.find_all(
    name='table',
    class_='wikitable sortable',
)
print(f"Number of tables : {len(all_tables)}")

Number of tables : 18


In [15]:
# Peek at the raw HTML of the first matched table to see the th/td markup being parsed.
all_tables[0]

<table class="wikitable sortable" style="text-align:right">
<tbody><tr>
<th>Years</th>
<th>1880</th>
<th>1881</th>
<th>1882</th>
<th>1883</th>
<th>1884</th>
<th>1885</th>
<th>1886</th>
<th>1887</th>
<th>1888</th>
<th>1889</th>
<th>1890</th>
<th>1902<sup class="reference" id="cite_ref-ourworldindata.org_33-1"><a href="#cite_note-ourworldindata.org-33">[33]</a></sup>
</th></tr>
<tr>
<td align="left">Total Fertility Rate in India</td>
<td style="text-align:right; color:blue;">5.95</td>
<td style="text-align:right; color:blue;">5.92</td>
<td style="text-align:right; color:blue;">5.89</td>
<td style="text-align:right; color:blue;">5.86</td>
<td style="text-align:right; color:blue;">5.82</td>
<td style="text-align:right; color:blue;">5.79</td>
<td style="text-align:right; color:blue;">4.38</td>
<td style="text-align:right; color:blue;">5.76</td>
<td style="text-align:right; color:blue;">5.76</td>
<td style="text-align:right; color:blue;">5.75</td>
<td style="text-align:right; color:blue;">5.

#### FUNCTION -
*get_data* takes a list of BeautifulSoup elements and returns a list containing only the stripped text of each element.


In [8]:
def get_data(all_data):
    """Return the whitespace-stripped text of each element in ``all_data``.

    Parameters
    ----------
    all_data : iterable
        Elements exposing a ``.text`` string attribute.

    Returns
    -------
    list of str
        One stripped string per element, in the same order as the input.
    """
    return [cell.text.strip() for cell in all_data]

In [49]:
# Scratch check of stride slicing: [0::3] keeps every third character starting
# at index 0 -- the same [i::num] pattern get_dataframe uses to pick one column.
'121121341'[0::3]

'113'

#### FUNCTION - 
*get_dataframe* processes the soup of **one individual table.**
It collects every `td` cell in that table, then builds one list per column.

A DataFrame is easiest to build from a *dictionary*, so the function returns *df_dict*, which maps each column index `i` to the list of cell texts in that column, together with the table's column titles.

`sample` : a single table element taken from `all_tables` (for example `all_tables[2]`).

`num`    : the number of columns in that table, found by inspecting the page.

In [52]:
def get_dataframe(sample, num):
    """Split one table's ``td`` cells into per-column lists of text.

    ``find_all('td')`` returns the cells as one flat sequence, so column
    ``k`` is taken as every ``num``-th cell starting at offset ``k``. This
    lines up with the table's real columns only when every row contributes
    exactly ``num`` ``td`` cells.

    Parameters
    ----------
    sample
        A single table element (an item of ``all_tables``).
    num : int
        Number of columns in the table.

    Returns
    -------
    tuple
        ``(columns, headers)``: ``columns`` maps each column index
        ``0 .. num - 1`` to its list of stripped cell texts, and ``headers``
        is the list returned by ``collect_col_names(sample)``.
    """
    cells = sample.find_all('td')
    columns = {col: get_data(cells[col::num]) for col in range(num)}
    headers = collect_col_names(sample)
    return columns, headers

In [53]:
def collect_col_names(sample):
    """Collect the column names of a table.

    Returns the stripped text of every ``th`` element found in ``sample``,
    in the order ``find_all`` yields them.
    """
    return [header.text.strip() for header in sample.find_all('th')]

In [54]:
# Extract each table of interest. Every call pairs the table's position in
# `all_tables` with its column count (both found by inspecting the page) and
# yields that table's per-column data dict plus its list of column titles.
pop_under_british_dict, pop_under_british_cols = get_dataframe(sample=all_tables[2], num=3)
pop_per_decade_dict, pop_per_decade_cols = get_dataframe(sample=all_tables[3], num=3)
pop_dist_by_states_dict, pop_dist_by_states_cols = get_dataframe(sample=all_tables[4], num=12)
pop_bet_age_0_6_dict, pop_bet_age_0_6_cols = get_dataframe(sample=all_tables[7], num=6)
pop_abv_7_dict, pop_abv_7_cols = get_dataframe(sample=all_tables[8], num=5)
literacy_rate_dict, literacy_rate_cols = get_dataframe(sample=all_tables[9], num=5)
native_speakers_dict, native_speakers_cols = get_dataframe(sample=all_tables[10], num=4)
un_stat_dict, un_stat_cols = get_dataframe(sample=all_tables[11], num=9)
census_dict, census_cols = get_dataframe(sample=all_tables[12], num=9)
pop_struc_dict, pop_struc_cols = get_dataframe(sample=all_tables[13], num=5)
pop_struc_2016_dict, pop_struc_2016_cols = get_dataframe(sample=all_tables[14], num=4)
fertility_rate_dict, fertility_rate_cols = get_dataframe(sample=all_tables[15], num=7)
crude_birth_rate_dict, crude_birth_rate_cols = get_dataframe(sample=all_tables[16], num=7)
regional_stats_dict, regional_stats_cols = get_dataframe(sample=all_tables[17], num=13)

In [57]:
# Spot-check the extraction: column index -> list of cell texts for this table.
pop_under_british_dict

{0: ['1871[35]',
  '1881[36]',
  '1891[35]',
  '1901[35]',
  '1911[37]',
  '1921[37]',
  '1931[37]',
  '1941[37]'],
 1: ['238,830,958',
  '253,896,330',
  '287,223,431',
  '293,550,310',
  '315,156,396',
  '318,942,480',
  '352,837,778',
  '388,997,955'],
 2: ['–', '6.3', '13.1', '2.2', '7.4', '1.2', '10.6', '10.2']}

In [58]:
# Column titles collected from the same table's th cells.
pop_under_british_cols

['Census year', 'Population', 'Growth (%)']