# CSE621 Web Crawling Examples
---
Prepared by: Kyle Spurlock

Spring 2023

University of Louisville

---

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from IPython.display import display, Image
import pickle
import time
import json
import re
import sys

sys.path.append("../")

In [6]:
movies_df = pd.read_csv("../data/auxiliary_dataset.csv", encoding="latin-1")
movies_df["ReleaseYear"] = movies_df["ReleaseYear"].astype("str")
movies_df

Unnamed: 0,MovieTitle,ReleaseYear,Genres
0,Show Me Love,1998,"Comedy,Drama,Romance"
1,Mad Max: Fury Road,2015,"Action,Adventure,Sci-Fi"
2,Love Me If You Dare,2003,"Comedy,Drama,Romance"
3,"Black Cat, White Cat",1998,"Comedy,Crime,Romance"
4,Central Station,1998,Drama
...,...,...,...
258,End of Watch,2012,"Action,Adventure,Crime"
259,The Perks of Being a Wallflower,2012,Drama
260,The Double Life of Veronique,1991,"Drama,Fantasy,Music"
261,The Princess and the Warrior,2000,"Drama,Mystery,Romance"


To crawl Wikipedia we will mainly be using the [requests library](https://pypi.org/project/requests/) with the Wikipedia API endpoint. Requests is similar to Python's std library urllib4, or wget. Of course there are many tools that also make this process even simpler. [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) is another helpful library for parsing retrieved HTML from requests. 

In [9]:
S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"
PARAMS = {
    "action": "query",
    "generator": "embeddedin",
    "format": "json",
    "geititle": "Template:Infobox_film",  # Per the API docs, some parameters must be prepended with a g when using a generator
}

all_movie_pages = {}

while True:
    response = S.get(url=URL, params=PARAMS)
    result = response.json()
    if "error" in result:
        raise SystemError(result["error"])
    if "warnings" in result:
        print(result["warnings"])
    if "query" in result:
        all_movie_pages.update(result["query"]["pages"])
    if "continue" not in result:
        break
    else:
        PARAMS["geicontinue"] = result["continue"]["geicontinue"]
    
    print(f"Items collected: {len(all_movie_pages)}", end="\r")

Items collected: 155110

Saving these for later:

In [10]:
with open("../data/pickle_jar/all_movie_pages.pickle", "wb") as handle:
    pickle.dump(all_movie_pages, handle)

In [8]:
with open("../data/pickle_jar/all_movie_pages.pickle", "rb") as handle:
    all_movie_pages = pickle.load(handle)

In [11]:
len(all_movie_pages)

155115

In [12]:
S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"
PARAMS = {
    "action": "query",
    "format": "json",
    "list": "search",
    "srsearch": "", # Fill this in in the loop
}

wiki_page_ids = {}

for i, row in movies_df.iterrows():
    title, year, genres = row.values
    PARAMS["srsearch"] = title + year
    
    response = S.get(url=URL, params=PARAMS)
    result = response.json()
    
    if "error" in result:
        raise SystemError(result["error"])
    if "warnings" in result:
        print(result["warnings"])
    if "query" in result: 
        if result["query"]["searchinfo"]["totalhits"] > 0:
            queries = result["query"]["search"]
            for query in queries:
                try:
                    pageid = query["pageid"]
                    all_movie_pages[str(pageid)] # The lookup table part
                    # If we go past this, we know that the query is a movie
                    #print(f"Match found for: {title} \t Pageid: {pageid}")
                    entry = {"title": title, "year": year, "pageid": pageid}
                    wiki_page_ids[title] = entry
                    break
                except KeyError as e:
                    continue

In [13]:
len(wiki_page_ids)

238

Saving for later again:

In [14]:
with open("../data/pickle_jar/aux_movie_wiki_id.pickle", "wb") as handle:
    pickle.dump(wiki_page_ids, handle)

In [16]:
with open("../data/pickle_jar/aux_movie_wiki_id.pickle", "rb") as handle:
    wiki_page_ids = pickle.load(handle)

In [15]:
wiki_page_ids

{'Show Me Love': {'title': 'Show Me Love', 'year': '1998', 'pageid': 11585},
 'Mad Max: Fury Road': {'title': 'Mad Max: Fury Road',
  'year': '2015',
  'pageid': 36426373},
 'Love Me If You Dare': {'title': 'Love Me If You Dare',
  'year': '2003',
  'pageid': 1511944},
 'Black Cat, White Cat': {'title': 'Black Cat, White Cat',
  'year': '1998',
  'pageid': 1031231},
 'Central Station': {'title': 'Central Station',
  'year': '1998',
  'pageid': 1223479},
 'Hero': {'title': 'Hero', 'year': '2002', 'pageid': 318542},
 'Sicario': {'title': 'Sicario', 'year': '2015', 'pageid': 43221463},
 'Blue Is the Warmest Color': {'title': 'Blue Is the Warmest Color',
  'year': '2013',
  'pageid': 39159786},
 'The Road Warrior': {'title': 'The Road Warrior',
  'year': '1981',
  'pageid': 262465},
 'Hotel Mumbai': {'title': 'Hotel Mumbai', 'year': '2018', 'pageid': 51315175},
 'Talk to Her': {'title': 'Talk to Her', 'year': '2002', 'pageid': 1946017},
 'The Hangover Part II': {'title': 'The Hangover Part

In [17]:
import copy


wiki_page_ids_plots = copy.deepcopy(wiki_page_ids)

S = requests.Session()

URL = "https://en.wikipedia.org/w/index.php" # Note this is different from before!

PARAMS = {
    "curid": None
}

counter = 1
size = len(wiki_page_ids_plots)

for title, attr in wiki_page_ids_plots.items():
    PARAMS["curid"] = attr["pageid"]
    
    response = S.get(url=URL, params=PARAMS)
    soup = BeautifulSoup(response.content)
    
    try:
        current_element = soup.find("table", {"class": "infobox"})
    except AttributeError as e:
        print(attr["title"])
        pass
    
    all_paragraph_string = ""
    h_count = 0
    print(f"{np.round(100*(counter / size),3)}%", end="\r")
    counter += 1
    while True:
        current_element = current_element.next_sibling
        if current_element == "\n":
            pass
        else:
            if current_element.name == "p":
                paragraph_string = ""
                for string in current_element.strings:
                    paragraph_string += string
                all_paragraph_string += paragraph_string
                
                
            elif current_element.name == "meta":
                current_element=current_element.find_next("h2")
                all_paragraph_string += "\nPlot: "
                pass
            elif current_element.name == "h2":
                break
            else:
                break
            
    wiki_page_ids_plots[title]["Description"] = all_paragraph_string

100.0%%

In [19]:
wiki_page_ids_plots

{'Show Me Love': {'title': 'Show Me Love',
  'year': '1998',
  'pageid': 11585,
  'Description': 'Fucking Åmål (released in some countries as Show me Love) is a 1998 Swedish romantic comedy-drama film written and directed by Lukas Moodysson in his feature-length directorial debut. It stars Rebecka Liljeberg and Alexandra Dahlström as two seemingly disparate teenage girls who begin a tentative romantic relationship. The film was released theatrically in Sweden on 23 October 1998,[2] and first premiered internationally at the 1998 Cannes Film Festival.\nIt received an overwhelmingly-positive reception[3] and won four Guldbagge Awards (Sweden\'s official film awards) at the 1999 ceremony. Its international awards include the Teddy Award at the 1999 Berlin International Film Festival,[4] and the Special Jury Prize at the 34th Karlovy Vary International Film Festival.\nThe Swedish title refers to the small town of Åmål in Västra Götaland County, western Sweden. However, only a few scenes we

In [21]:
with open("../data/pickle_jar/aux_wiki_page_ids_plots.pickle", "wb") as f:
    pickle.dump(wiki_page_ids_plots, f)

In [31]:
wiki_df = pd.DataFrame.from_dict(wiki_page_ids_plots, orient="index").reset_index(drop=True)
wiki_df = wiki_df.rename({"title": "MovieTitle", "year": "ReleaseYear"}, axis=1)
wiki_df = wiki_df.drop("pageid", axis=1)
wiki_df = movies_df.merge(wiki_df.drop("ReleaseYear", axis=1), on="MovieTitle", how="outer")
wiki_df = wiki_df.fillna("")
wiki_df

Unnamed: 0,MovieTitle,ReleaseYear,Genres,Description
0,Show Me Love,1998,"Comedy,Drama,Romance",Fucking Åmål (released in some countries as Sh...
1,Mad Max: Fury Road,2015,"Action,Adventure,Sci-Fi",Mad Max: Fury Road is a 2015 Australian post-a...
2,Love Me If You Dare,2003,"Comedy,Drama,Romance","Love Me If You Dare (French: Jeux d'enfants, l..."
3,"Black Cat, White Cat",1998,"Comedy,Crime,Romance","Black Cat, White Cat (Serbian: Црна мачка, бел..."
4,Central Station,1998,Drama,Central Station (Portuguese: Central do Brasil...
...,...,...,...,...
258,End of Watch,2012,"Action,Adventure,Crime",End of Watch is a 2012 American action thrille...
259,The Perks of Being a Wallflower,2012,Drama,The Perks of Being a Wallflower is a 2012 Amer...
260,The Double Life of Veronique,1991,"Drama,Fantasy,Music",The Double Life of Veronique (French: La doubl...
261,The Princess and the Warrior,2000,"Drama,Mystery,Romance",The Princess and the Warrior (German: Der Krie...


In [55]:
old_aux = pd.read_csv(r"../data/old_aux_dataset/auxiliary_dataset_wiki.csv")
old_aux = old_aux.rename({"title": "MovieTitle", "year": "ReleaseYear", "genres":"Genres"}, axis=1)
aux_full = pd.concat((old_aux, wiki_df), axis=0)

In [56]:
aux_full

Unnamed: 0,MovieTitle,ReleaseYear,Genres,Description
0,The Social Network,2010,"Biography,Drama",The Social Network is a 2010 American biograph...
1,Inception,2010,"Action,Adventure,Sci-Fi",Inception is a 2010 science fiction action fi...
2,The Empire Strikes Back,1992,"Action,Adventure,Sci-Fi",The Empire Strikes Back (also known as Star Wa...
3,Star Wars: Episode IV - A New Hope,1977,"Action,Adventure,Fantasy",Star Wars (retroactively titled Star Wars: Epi...
4,The Grand Budapest Hotel,2014,"Adventure,Comedy,Crime",The Grand Budapest Hotel is a 2014 comedy-dram...
...,...,...,...,...
258,End of Watch,2012,"Action,Adventure,Crime",End of Watch is a 2012 American action thrille...
259,The Perks of Being a Wallflower,2012,Drama,The Perks of Being a Wallflower is a 2012 Amer...
260,The Double Life of Veronique,1991,"Drama,Fantasy,Music",The Double Life of Veronique (French: La doubl...
261,The Princess and the Warrior,2000,"Drama,Mystery,Romance",The Princess and the Warrior (German: Der Krie...


In [54]:
aux_full.to_csv("../data/auxiliary_dataset_wiki.csv", index=False)