Here I will try to scrape the Wikipedia page for the Ariane 5 rocket, to get the table with the whole list of its launches.

In [1]:
# import what's apparently needed
import pandas as pd
import numpy as np

In [2]:
# where we are getting the table:
a5url = 'https://en.wikipedia.org/wiki/Ariane_5'

In [3]:
# read the Ariane 5 page and get the tables into a list of tables (apparently):
a5tables = pd.read_html(a5url)

In [4]:
# check which table is the one of interest
a5tables[2].head()

Unnamed: 0,#,Flight no.,DateTime (UTC),Rocket typeSerial no.,Payload,Total payload mass (including launch adapters and SYLDA),Orbit,Customers,Launchoutcome
0,1,V-88[69],4 June 199612:34,G501,Cluster,,,,Failure
1,1,V-88[69],,,,,,,
2,2,V-101,30 October 199713:43,G502,"MaqSat-H, TEAMSAT, MaqSat-B, YES",,,,Partial failure[70]
3,2,V-101,,,,,,,
4,3,V-112,21 October 199816:37,G503,"MaqSat 3, ARD","~6,800 kg",GTO,,Success


In [5]:
# extract the desired table from the list of them
a5lt = a5tables[2]

In [6]:
a5lt

Unnamed: 0,#,Flight no.,DateTime (UTC),Rocket typeSerial no.,Payload,Total payload mass (including launch adapters and SYLDA),Orbit,Customers,Launchoutcome
0,1,V-88[69],4 June 199612:34,G501,Cluster,,,,Failure
1,1,V-88[69],,,,,,,
2,2,V-101,30 October 199713:43,G502,"MaqSat-H, TEAMSAT, MaqSat-B, YES",,,,Partial failure[70]
3,2,V-101,,,,,,,
4,3,V-112,21 October 199816:37,G503,"MaqSat 3, ARD","~6,800 kg",GTO,,Success
...,...,...,...,...,...,...,...,...,...
207,108,VA-252,18 February 202022:18,ECA5111,JCSAT-17GEO-KOMPSAT 2B,"9,236 kg",GTO,SKY Perfect JSATKARI,Success
208,109,VA-253,15 August 202022:04,ECA5112,Galaxy 30MEV-2BSAT-4b,"10,468 kg[106]including 765 kg of support stru...",GTO,IntelsatNorthrop GrummanB-SAT,Success
209,110,VA-254,30 July 202121:00,ECA5113,Eutelsat QuantumStar One D2,"10,515 kg",GTO,EutelsatStar One,Success
210,111,VA-255,24 October 202102:10,ECA5115,SES-17Syracuse 4A,"11,210 kg[107]",GTO,SES S.A.DGA,Success


In [7]:
# rename columns with shorter names
a5lt.rename(columns={'Flight no.': 'flight', 'DateTime (UTC)': 'date', 'Rocket typeSerial no.': 'serial', 'Total payload mass (including launch adapters and SYLDA)': 'mass'}, inplace=True)

In [8]:
# rename columns with shorter names
a5lt.rename(columns={'Payload': 'payload', 'Orbit': 'orbit', 'Customers': 'customers', 'Launchoutcome': 'outcome'}, inplace=True)

In [9]:
a5lt

Unnamed: 0,#,flight,date,serial,payload,mass,orbit,customers,outcome
0,1,V-88[69],4 June 199612:34,G501,Cluster,,,,Failure
1,1,V-88[69],,,,,,,
2,2,V-101,30 October 199713:43,G502,"MaqSat-H, TEAMSAT, MaqSat-B, YES",,,,Partial failure[70]
3,2,V-101,,,,,,,
4,3,V-112,21 October 199816:37,G503,"MaqSat 3, ARD","~6,800 kg",GTO,,Success
...,...,...,...,...,...,...,...,...,...
207,108,VA-252,18 February 202022:18,ECA5111,JCSAT-17GEO-KOMPSAT 2B,"9,236 kg",GTO,SKY Perfect JSATKARI,Success
208,109,VA-253,15 August 202022:04,ECA5112,Galaxy 30MEV-2BSAT-4b,"10,468 kg[106]including 765 kg of support stru...",GTO,IntelsatNorthrop GrummanB-SAT,Success
209,110,VA-254,30 July 202121:00,ECA5113,Eutelsat QuantumStar One D2,"10,515 kg",GTO,EutelsatStar One,Success
210,111,VA-255,24 October 202102:10,ECA5115,SES-17Syracuse 4A,"11,210 kg[107]",GTO,SES S.A.DGA,Success


In [10]:
# drop duplicate rows that appear when there's a row partilly divided in the vertical direction (a part of the row becomes two)
a5lt.drop_duplicates(subset=['date', 'serial', 'payload'], keep=False, inplace = True)

In [11]:
a5lt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132 entries, 0 to 211
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   #          132 non-null    int64 
 1   flight     132 non-null    object
 2   date       132 non-null    object
 3   serial     132 non-null    object
 4   payload    132 non-null    object
 5   mass       128 non-null    object
 6   orbit      130 non-null    object
 7   customers  83 non-null     object
 8   outcome    132 non-null    object
dtypes: int64(1), object(8)
memory usage: 10.3+ KB


In [12]:
a5lt["notes"] = np.nan

In [13]:
a5lt.head()

Unnamed: 0,#,flight,date,serial,payload,mass,orbit,customers,outcome,notes
0,1,V-88[69],4 June 199612:34,G501,Cluster,,,,Failure,
2,2,V-101,30 October 199713:43,G502,"MaqSat-H, TEAMSAT, MaqSat-B, YES",,,,Partial failure[70],
4,3,V-112,21 October 199816:37,G503,"MaqSat 3, ARD","~6,800 kg",GTO,,Success,
6,4,V-119,10 December 199914:32,G504,XMM-Newton,"3,800 kg",HEO,,Success,
8,5,V-128,21 March 200023:28[71],G505,INSAT-3BAsiaStar,"~5,800 kg",GTO,,Success,


In [14]:
# this is 'a bit' crappy, because of the drop method changing the ammount of indeces
# there will come a day when I improve this, but it is not this day
for i in range(0, a5lt.shape[0]):
    if a5lt.iloc[i, 2] == a5lt.iloc[i, 3] == a5lt.iloc[i, 4]:
        a5lt.iloc[i-1, 9] = a5lt.iloc[i, 2]
        a5lt.drop(a5lt.index[i], inplace = True)

IndexError: single positional indexer is out-of-bounds

In [15]:
# cast every field into strings
a5lt = a5lt.astype(str)

In [16]:
a5lt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112 entries, 0 to 211
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   #          112 non-null    object
 1   flight     112 non-null    object
 2   date       112 non-null    object
 3   serial     112 non-null    object
 4   payload    112 non-null    object
 5   mass       112 non-null    object
 6   orbit      112 non-null    object
 7   customers  112 non-null    object
 8   outcome    112 non-null    object
 9   notes      112 non-null    object
dtypes: object(10)
memory usage: 9.6+ KB


But it still shows the data type as object...

In [17]:
# remove all citations:
for i in range(0, a5lt.shape[1]):
    a5lt.iloc[:,i] = a5lt.iloc[:,i].str.replace(r"\[.*?\]","")

  a5lt.iloc[:,i] = a5lt.iloc[:,i].str.replace(r"\[.*?\]","")


Right now regex means nothing to me, but I guess I will have to checks what this is

In [18]:
# remove all nans:
for i in range(0, a5lt.shape[1]):
    a5lt.iloc[:,i] = a5lt.iloc[:,i].str.replace(r"nan","")

In [19]:
a5lt

Unnamed: 0,#,flight,date,serial,payload,mass,orbit,customers,outcome,notes
0,1,V-88,4 June 199612:34,G501,Cluster,,,,Failure,
2,2,V-101,30 October 199713:43,G502,"MaqSat-H, TEAMSAT, MaqSat-B, YES",,,,Partial failure,
4,3,V-112,21 October 199816:37,G503,"MaqSat 3, ARD","~6,800 kg",GTO,,Success,
6,4,V-119,10 December 199914:32,G504,XMM-Newton,"3,800 kg",HEO,,Success,
8,5,V-128,21 March 200023:28,G505,INSAT-3BAsiaStar,"~5,800 kg",GTO,,Success,
...,...,...,...,...,...,...,...,...,...,...
207,108,VA-252,18 February 202022:18,ECA5111,JCSAT-17GEO-KOMPSAT 2B,"9,236 kg",GTO,SKY Perfect JSATKARI,Success,
208,109,VA-253,15 August 202022:04,ECA5112,Galaxy 30MEV-2BSAT-4b,"10,468 kgincluding 765 kg of support structures.",GTO,IntelsatNorthrop GrummanB-SAT,Success,
209,110,VA-254,30 July 202121:00,ECA5113,Eutelsat QuantumStar One D2,"10,515 kg",GTO,EutelsatStar One,Success,
210,111,VA-255,24 October 202102:10,ECA5115,SES-17Syracuse 4A,"11,210 kg",GTO,SES S.A.DGA,Success,


In [20]:
# Re-arrange date/time
for i in range(0, a5lt.shape[0]):
    tempsplit = a5lt.iloc[i, 2].split()
    a5lt.iloc[i, 2] = tempsplit[1]+' '+tempsplit[0]+', '+tempsplit[2][0:4]+' || '+tempsplit[2][4:]

In [21]:
a5lt.head()

Unnamed: 0,#,flight,date,serial,payload,mass,orbit,customers,outcome,notes
0,1,V-88,"June 4, 1996 || 12:34",G501,Cluster,,,,Failure,
2,2,V-101,"October 30, 1997 || 13:43",G502,"MaqSat-H, TEAMSAT, MaqSat-B, YES",,,,Partial failure,
4,3,V-112,"October 21, 1998 || 16:37",G503,"MaqSat 3, ARD","~6,800 kg",GTO,,Success,
6,4,V-119,"December 10, 1999 || 14:32",G504,XMM-Newton,"3,800 kg",HEO,,Success,
8,5,V-128,"March 21, 2000 || 23:28",G505,INSAT-3BAsiaStar,"~5,800 kg",GTO,,Success,


### Save as CSV

In [22]:
a5lt.to_csv('ariane5LaunchTable.csv', index = False)

### Now to get the HTML code needed for the RR

In [23]:
# create table html code
this_table_name = 'Ariane 5 Full Launch Table'
with open(f'{this_table_name}.txt', 'w') as f:
    f.write('')

In [24]:
# open figure tab and table tab
with open(f'{this_table_name}.txt', 'a') as f:
    f.write('<figure class="wp-block-table">')
    f.write('\n	<table>')

In [25]:
# create header from DF header
# open header tag:
with open(f'{this_table_name}.txt', 'a') as f:
    f.write('\n		<thead>')
# open row tag:
    f.write('\n			<tr>')
# create each header row element
for cheader in [*a5lt]:
    with open(f'{this_table_name}.txt', 'a') as f:
        f.write(f'\n				<th>{cheader}</th>')
# close row tag:
with open(f'{this_table_name}.txt', 'a') as f:
    f.write('\n			</tr>')
# close header tag:
    f.write('\n		</thead>')

In [26]:
# create body from DF rows
# open body tag:
with open(f'{this_table_name}.txt', 'a') as f:
    f.write('\n		<tbody>')

for i in range(0, a5lt.shape[0]):
# open row tag:
    with open(f'{this_table_name}.txt', 'a') as f:
        f.write('\n			<tr>')
# create each body row element
    for relement in [*a5lt.iloc[i, :]]:
        with open(f'{this_table_name}.txt', 'a') as f:
            f.write(f'\n				<td>{relement}</td>')
# close row tag:
    with open(f'{this_table_name}.txt', 'a') as f:
        f.write('\n			</tr>')

# close body tag:
with open(f'{this_table_name}.txt', 'a') as f:
    f.write('\n		</tbody>')

In [27]:
# close table tab and figure tab
with open(f'{this_table_name}.txt', 'a') as f:
    f.write('\n	</table>')
    f.write('\n</figure>')