# Libraries

In [87]:
# import libraries
# ================

# for date and time operations
from datetime import datetime
# for file and folder operations
import os
# for regular expression operations
import re
# for listing files in a folder
import glob
# for getting web contents
import requests 
# storing and analysing data
import pandas as pd
# for scraping web contents
from bs4 import BeautifulSoup

# Data

In [88]:
# Wikipedia page that lists the Indian satellites
link = 'https://en.wikipedia.org/wiki/List_of_Indian_satellites'

# parse every HTML table found on that page into its own DataFrame
all_tables = pd.read_html(io=link)

# how many tables the page yielded
len(all_tables)

15

In [89]:
# first table
# all_tables[0]

In [90]:
# last of the tables used below (index 5; the page itself yields more tables)
# all_tables[5]

In [91]:
# Values that are parsed into the even-positioned row of every row pair,
# keyed by the column they really belong to:
#   {column to create: column under which the value is parsed}
FOLDED_COLUMNS = {
    'COSPAR ID': 'SatCat #',
    'Launch Mass': 'Dry Mass',
    'Note': 'Periapsis',
}


def unfold_row_pairs(raw_table):
    """Collapse the two parsed rows of each entry into a single row.

    The rows are treated as consecutive pairs: the row at an even position
    (0, 2, 4, ...) carries the values for the columns named by the keys of
    ``FOLDED_COLUMNS``, stored under the columns named by its values; the
    row at the following odd position carries everything else.
    NOTE(review): presumably this mirrors the two-line rows of the Wikipedia
    tables -- confirm against the current page layout.

    Parameters
    ----------
    raw_table : pandas.DataFrame
        One table as returned by ``pd.read_html``, with multi-level column
        headers. It is not modified, so this cell can safely be re-run.

    Returns
    -------
    pandas.DataFrame
        The odd-positioned rows (original index labels kept) with the folded
        columns appended and ``[n]`` reference markers removed from all
        string values.
    """
    # work on a copy: all_tables must stay untouched for re-runs
    table = raw_table.copy()

    # drop the two outer header levels, keeping the innermost column names
    table.columns = table.columns.droplevel([0, 1])

    # split each pair into its upper (even position) and lower (odd) row
    upper_rows = table.iloc[0::2]
    lower_rows = table.iloc[1::2].copy()

    # move the folded values next to the row they describe; an unpaired
    # trailing upper row (odd row count) has no partner and is ignored
    for new_column, parsed_column in FOLDED_COLUMNS.items():
        folded_values = upper_rows[parsed_column].to_numpy()
        lower_rows[new_column] = folded_values[:len(lower_rows)]

    # remove wikipedia reference markers such as '[12]'
    return lower_rows.replace({r'\[\d+\]': ''}, regex=True)


# container for all the cleaned tables
list_of_tables = []

# the first 6 tables contain the data that we want
for table_no in range(6):
    temp_table = unfold_row_pairs(all_tables[table_no])
    list_of_tables.append(temp_table)

In [92]:
# stack the per-table frames into one table with a fresh 0..n-1 row index
full_table = pd.concat(list_of_tables, ignore_index=True)

In [93]:
# first few rows
# full_table.head()

In [94]:
# names of columns
# full_table.columns

In [95]:
# summarise the combined table: row count, column names, non-null counts and dtypes
full_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   #                  122 non-null    object
 1   Name               122 non-null    object
 2   Discipline         122 non-null    object
 3   SatCat #           121 non-null    object
 4   Dry Mass           110 non-null    object
 5   On-board Power     117 non-null    object
 6   Launch Date        122 non-null    object
 7   Launch Vehicle     122 non-null    object
 8   Launch Site        122 non-null    object
 9   Periapsis          118 non-null    object
 10  Apoapsis           119 non-null    object
 11  Semi-Major Axis    117 non-null    object
 12  Period             119 non-null    object
 13  Inclination        119 non-null    object
 14  Longitude‡         120 non-null    object
 15  Eccentricity       112 non-null    object
 16  Epoch Start        120 non-null    object
 1

# Preprocessing

In [96]:
# the reference-link column is not needed for the analysis
full_table = full_table.drop(columns=['Refs(ISRO portal)'])

In [97]:
# desired column order (only the columns listed here are kept)
column_order = [
    '#', 'Name', 'Discipline', 'COSPAR ID', 'SatCat #', 'Launch Mass', 'Dry Mass',
    'On-board Power', 'Launch Date', 'Launch Vehicle', 'Launch Site', 'Periapsis',
    'Apoapsis', 'Semi-Major Axis', 'Period', 'Inclination', 'Longitude‡',
    'Eccentricity', 'Epoch Start', 'Decay Date', 'Note',
]

# reorder the columns, then strip the trailing '‡' marker from the longitude header
full_table = full_table[column_order].rename(columns={'Longitude‡': 'Longitude'})

full_table.head()

Unnamed: 0,#,Name,Discipline,COSPAR ID,SatCat #,Launch Mass,Dry Mass,On-board Power,Launch Date,Launch Vehicle,...,Periapsis,Apoapsis,Semi-Major Axis,Period,Inclination,Longitude,Eccentricity,Epoch Start,Decay Date,Note
0,1,Aryabhatta,Earth Sciences Space Physics,1975-033A,07752,360 kg (790 lb),–,46 W,"19 April 1975, 13:10:00 IST",Interkosmos-II,...,568 km (353 mi),611 km (380 mi),–,96.5 mins,50.7°,Not Applicable,0.00308,"19 April 1975, 01:30:00 IST",11 February 1992,Active technological experience in building an...
1,2,Bhaskara Sega-I,Astronomy Communications Engineering Earth Sci...,1979-051A,11392,444 kg (979 lb),–,47 W,"7 June 1979, 16:00:00 IST",Modified SS-5 (SKean IRBM) plus Upper Stage,...,512 km (318 mi),557 km (346 mi),–,95.2 mins,50.7°,Not Applicable,0.00325,"7 June 1979, 01:30:00 IST",17 February 1989,First experimental remote sensing satellite. C...
2,3,Rohini Technology Payload,Experimental,Not Applicable,Not Applicable,35 kg (77 lb),–,3 W,10 August 1979,SLV-3-E1,...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Intended for measuring in-flight performance o...
3,4,Rohini RS-1 (Rohini-1B),Earth Sciences,1980-062A,11899,35 kg (77 lb),–,16 W,"18 July 1980, 8:01:00 IST",SLV-3-E2,...,305 km (190 mi),919 km (571 mi),–,96.9 mins,44.7°,Not Applicable,0.04389,"18 July 1980, 1:30:00 IST",20 May 1981,Used for measuring in-flight performance of se...
4,5,Rohini RS-D1 (Rohini-2),Earth Sciences,1981-051A,12491,38 kg (84 lb),–,16 W,"31 May 1981, 10:30:00 IST",SLV-3-D1,...,186 km (116 mi),418 km (260 mi),–,90.5 mins,46.3°,Not Applicable,0.01735,"31 May 1981, 1:30:00 IST",8 June 1981,Used for conducting some remote sensing techno...


In [135]:
# launch mass in kg: keep the first number of strings such as '360 kg (790 lb)'.
# The pattern accepts thousands separators and decimals, so a value written
# like '2,650 kg' yields '2650' rather than being cut off at the comma.
# The result stays a string column (NaN where no number is found).
full_table['Launch Mass'] = (
    full_table['Launch Mass']
    .str.extract(r'(\d[\d,]*(?:\.\d+)?)', expand=False)
    .str.replace(',', '', regex=False)
)

# launch date: keep only the 'day month year' part and drop any time of day
# (e.g. '19 April 1975, 13:10:00 IST' -> '19 April 1975')
full_table['Launch Date'] = (
    full_table['Launch Date']
    .str.extract(r'(\d+ [A-Za-z]+ \d{4})', expand=False)
)

In [143]:
# first run of one or two digits in each launch date, i.e. the day of the month
# for dates shaped like '19 April 1975' (raw string avoids the invalid '\d' escape)
full_table['Launch Date'].str.extract(r'(\d{1,2})')

Unnamed: 0,0
0,19 April 1975
1,7 June 1979
2,10 August 1979
3,18 July 1980
4,31 May 1981
...,...
117,22 July 2019
118,27 November 2019
119,11 December 2019
120,16 January 2020


In [134]:
# rename launch mass to launch mass in kg