scrapping `The 100 Most Highly-Ranked Films` data from wiki website \
```
https://web.archive.org/web/20230902185655/https://en.everybodywiki.com/100_Most_Highly-Ranked_Films
```
then load it into table in sqlite3 database \

the headers we will ingest is `Average Rank`, `Film`, `Year`, `'Rotten Tomatoes' Top 100` \

we will take only the top 25 films of the 2000s

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import sqlite3

In [2]:
url = "https://web.archive.org/web/20230902185655/https://en.everybodywiki.com/100_Most_Highly-Ranked_Films"
db_name = "movies.db"
table_name = "top_25_2000s"

In [3]:
df = pd.DataFrame(columns=["Average Rank", "Film", "Year", "Rotten Tomatoes Top 100"])

In [4]:
page = requests.get(url).text
data = BeautifulSoup(page, "html.parser")

In [5]:
table = data.find_all("tbody")[0] # we need the first tbody element in the web page
rows = table.find_all("tr")

In [6]:
for row in rows:
    col = row.find_all("td")
    if len(col) == 0:
        continue
    film_dict = {
        "Average Rank": float(col[0].contents[0]),
        "Film": str(col[1].text),
        "Year": int(col[2].text) if col[2].text.isnumeric() == True else 0,
        "Rotten Tomatoes Top 100": str(col[3].text)
    }
    df1 = pd.DataFrame(film_dict, index=[0])
    df = pd.concat([df, df1], ignore_index=True)

  df = pd.concat([df, df1], ignore_index=True)


In [7]:
df.head()

Unnamed: 0,Average Rank,Film,Year,Rotten Tomatoes Top 100
0,1.0,The Godfather,1972,17
1,2.0,Citizen Kane,1941,2
2,3.0,Casablanca,1942,8
3,4.0,"The Godfather, Part II",1974,99
4,5.0,Singin' in the Rain,1952,52


In [8]:
df.dtypes

Unnamed: 0,0
Average Rank,float64
Film,object
Year,object
Rotten Tomatoes Top 100,object


In [9]:
df["Year"] = df["Year"].convert_dtypes()

In [10]:
filter = (df["Year"] >= 2000)

In [11]:
df_2000 = df[filter]

In [12]:
df_2000.head()

Unnamed: 0,Average Rank,Film,Year,Rotten Tomatoes Top 100
16,17.0,Parasite,2019,6
18,19.0,Lord of the Rings: The Fellowship of the Ring,2001,unranked
22,23.0,Avengers: Endgame,2019,7
33,34.0,Spider-Man: Into the Spider-verse,2018,19
36,37.0,The Dark Knight,2008,unranked


In [13]:
df_2000.reset_index(drop=True, inplace=True)

In [14]:
df_2000.head()

Unnamed: 0,Average Rank,Film,Year,Rotten Tomatoes Top 100
0,17.0,Parasite,2019,6
1,19.0,Lord of the Rings: The Fellowship of the Ring,2001,unranked
2,23.0,Avengers: Endgame,2019,7
3,34.0,Spider-Man: Into the Spider-verse,2018,19
4,37.0,The Dark Knight,2008,unranked


In [15]:
df_2000_25 = df_2000.iloc[:26]

In [16]:
df_2000_25

Unnamed: 0,Average Rank,Film,Year,Rotten Tomatoes Top 100
0,17.0,Parasite,2019,6
1,19.0,Lord of the Rings: The Fellowship of the Ring,2001,unranked
2,23.0,Avengers: Endgame,2019,7
3,34.0,Spider-Man: Into the Spider-verse,2018,19
4,37.0,The Dark Knight,2008,unranked
5,41.0,Mad Max: Fury Road,2015,18
6,42.0,Inception,2010,unranked
7,43.0,Lord of the Rings: Return of the King,2003,unranked
8,49.0,Lord of the Rings: The Two Towers,2002,unranked
9,55.0,Interstellar,2014,unranked


In [17]:
conn = sqlite3.connect(db_name)
df_2000_25.to_sql(table_name, conn, if_exists="replace", index=False)
conn.close()

In [18]:
conn = sqlite3.connect(db_name)
conn.execute("select * from top_25_2000s").fetchall()

[(17.0, 'Parasite', 2019, '6'),
 (19.0, 'Lord of the Rings: The Fellowship of the Ring', 2001, 'unranked'),
 (23.0, 'Avengers: Endgame', 2019, '7'),
 (34.0, 'Spider-Man: Into the Spider-verse', 2018, '19'),
 (37.0, 'The Dark Knight', 2008, 'unranked'),
 (41.0, 'Mad Max: Fury Road', 2015, '18'),
 (42.0, 'Inception', 2010, 'unranked'),
 (43.0, 'Lord of the Rings: Return of the King', 2003, 'unranked'),
 (49.0, 'Lord of the Rings: The Two Towers', 2002, 'unranked'),
 (55.0, 'Interstellar', 2014, 'unranked'),
 (59.0, 'Avengers: Infinity War', 2018, 'unranked'),
 (60.0, 'Gladiator', 2000, 'unranked'),
 (62.0, 'Get Out', 2017, '15'),
 (63.0, 'Moonlight', 2016, '20'),
 (64.0, 'Spirited Away', 2001, 'unranked'),
 (65.0, 'Lady Bird', 2017, '12'),
 (69.0, 'Black Panther', 2018, '5'),
 (70.0, 'Whiplash', 2014, 'unranked'),
 (76.0, 'Mulholland Drive', 2001, 'unranked'),
 (77.0, 'In the Mood for Love', 2000, 'unranked'),
 (78.0, 'A Quiet Place', 2018, '29'),
 (80.0, 'Coco', 2017, '37'),
 (86.0, 'Lo

In [19]:
conn.close()