In [14]:
import pandas as pd
import numpy as np
import requests
import os
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor

In [2]:
def fetch_page_titles(letter):
    """Collect the titles of every page linked from "List of films: <letter>".

    Queries the MediaWiki API with ``prop=links`` and keeps requesting
    follow-up batches for as long as the response carries a ``continue``
    object.

    Args:
        letter: Suffix of the Wikipedia list page, e.g. ``"A"`` or ``"J-K"``.

    Returns:
        A ``(page_titles, letter)`` tuple: the list of linked page titles in
        the order the API returned them, and the ``letter`` that was passed in.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        KeyError: If the response has no ``query``/``pages`` payload.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'titles': 'List of films: ' + letter,
        'prop': 'links',
        'pllimit': 'max',
        'redirects': '',
    }

    page_titles = []
    while True:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url=url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        for page in data['query']['pages'].values():
            # A page entry without links (e.g. a missing page) has no 'links' key.
            for link in page.get('links', []):
                page_titles.append(link['title'])

        if 'continue' not in data:
            break
        # Send back every continuation value, not only 'plcontinue'.
        params.update(data['continue'])

    return page_titles, letter


# Suffixes of the Wikipedia "List of films: <suffix>" index pages to crawl.
letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I",
           "J-K", "L", "M", "N-O", "P", "Q-R", "S", "T",
           "U-W", "X-Z", "numbers"]

# The fetches are network-bound, so run one per index page on worker threads.
with ThreadPoolExecutor() as executor:
    results = list(executor.map(fetch_page_titles, letters))

# Gather every row first and build the frame once: enlarging a DataFrame with
# `.loc[label] = row` copies the data on each insert, which is quadratic here.
index_labels = []
records = []
for page_titles, letter in results:
    for n, title in enumerate(page_titles):
        index_labels.append(letter + "_" + str(n))
        # `np.nan` rather than `np.NaN`: the capitalised alias is gone in NumPy 2.0.
        records.append((title, letter, np.nan))

    # Running (rows, columns) total after each index page.
    print((len(records), 3))

movie_list = pd.DataFrame(records,
                          columns=['title', 'letter', 'plot'],
                          index=index_labels)

(2368, 3)
(5545, 3)
(8065, 3)
(10472, 3)
(11504, 3)
(13316, 3)
(14741, 3)
(16733, 3)
(18307, 3)
(23102, 3)
(25052, 3)
(28818, 3)
(34512, 3)
(36554, 3)
(39944, 3)
(44084, 3)
(46987, 3)
(53909, 3)
(55422, 3)
(55979, 3)


In [3]:
# Save the title list (index dropped) for the plot-fetching stage.
# Relative path: same file the later `pd.read_csv("movies.csv")` loads, and it
# does not require a Colab-style /content directory to exist.
movie_list.to_csv("movies.csv", index=False)

# Getting Plots

In [15]:
from multiprocessing import Pool, cpu_count
import concurrent.futures
from tqdm.auto import tqdm
# Registers tqdm's progress-bar methods (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [17]:
# Rebind `movie_list` to the contents of movies.csv in the working directory.
movie_list = pd.read_csv("movies.csv")

In [18]:
def fetch_plot(title):
    """Return the text that follows a Wikipedia article's "Plot" heading.

    The article is rendered to HTML through the MediaWiki ``parse`` API,
    flattened to text, and searched for a line starting with "Plot"; the text
    between that line and the next blank line is returned.

    Args:
        title: Wikipedia page title (redirects are followed).

    Returns:
        The extracted plot text, or ``np.nan`` when the request fails, the
        response carries no parsed HTML, or no "Plot" section is found.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'parse',
        'page': title,
        'format': 'json',
        'prop': 'text',
        'redirects': '',
    }
    # Group 1 swallows the rest of the heading line (e.g. "[edit]"); group 2 is
    # everything up to the first blank line after it.
    plot_regex = re.compile(r"\nPlot(.*?)\n(.+?)\n\n", re.DOTALL)

    try:
        # A timeout keeps one stalled connection from blocking the whole crawl.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        raw_html = response.json()['parse']['text']['*']

        soup = BeautifulSoup(raw_html, 'html.parser')

        # Drop footnote markers so they do not end up inside the plot text.
        for tag in soup.find_all(class_="reference"):
            tag.decompose()

        # NOTE(review): find_all() with no arguments yields every tag, so nested
        # text is repeated in the result. Kept unchanged so the regex sees the
        # same input as before.
        text = ''.join(
            element.text
            for element in soup.find_all()
            if not element.text.startswith('Cite error: ')
        )
        match = plot_regex.search(text)
    except Exception:
        # Best effort: a failed or malformed page becomes a missing value, while
        # KeyboardInterrupt/SystemExit still propagate (a bare `except:` would not
        # let them).
        return np.nan

    # `np.nan` rather than `np.NaN`: the capitalised alias is gone in NumPy 2.0.
    return match.group(2) if match else np.nan

In [None]:
import asyncio
import aiohttp

async def fetch_plots_async(titles, max_concurrency=200):
    """Fetch the plot for every title concurrently, preserving input order.

    ``fetch_plot`` is a blocking function that takes only a title, so each
    call is run on a worker thread instead of being awaited directly.

    Args:
        titles: Iterable of Wikipedia page titles.
        max_concurrency: Upper bound on the number of requests in flight.

    Returns:
        A list with one ``fetch_plot`` result per title, in the same order.
    """
    loop = asyncio.get_running_loop()
    # Acquired per request: holding it around the whole batch would limit nothing.
    sem = asyncio.Semaphore(max_concurrency)

    with ThreadPoolExecutor(max_workers=max_concurrency) as pool:

        async def fetch_one(title):
            async with sem:
                return await loop.run_in_executor(pool, fetch_plot, title)

        return await asyncio.gather(*(fetch_one(title) for title in titles))

titles = movie_list['title'].tolist()

try:
    asyncio.get_running_loop()
except RuntimeError:
    # No loop is running in this thread (plain script): start one directly.
    plots = asyncio.run(fetch_plots_async(titles))
else:
    # A notebook kernel already runs an event loop in this thread, where
    # `run_until_complete` raises "This event loop is already running".
    # Drive the coroutine on its own loop in a helper thread instead.
    with ThreadPoolExecutor(max_workers=1) as runner:
        plots = runner.submit(asyncio.run, fetch_plots_async(titles)).result()

movie_list['plot'] = plots

In [None]:
# Write the frame, including its `plot` column, to CSV without the index.
movie_list.to_csv("movies_plot.csv", index=False)

In [None]:
# Number of rows whose `plot` value is missing.
movie_list["plot"].isna().sum()