In [1]:
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import time
import re
import random

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://cli.github.com/packages stable InRelease [3,917 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [60.9 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security/universe

In [None]:
# Chrome launch flags: run headless, without the sandbox, without /dev/shm,
# and with a desktop user-agent string.
options = Options()
for chrome_flag in (
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
):
    options.add_argument(chrome_flag)

# One shared browser instance; the scraping functions below read this global.
driver = webdriver.Chrome(options=options)

def parser(driver):
    """Extract film fields from the page currently loaded in ``driver``.

    Every ``<li class="ipc-metadata-list-summary-item">`` in the page source
    is treated as one film. Returns six parallel lists, in this order:
    titles, years, ratings, durations, age limits, movie ids. A field that
    cannot be found for a film is stored as ``None`` so the lists stay aligned.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    titles, years, ratings = [], [], []
    durations, age_limits, movie_ids = [], [], []

    for item in soup.find_all('li', class_='ipc-metadata-list-summary-item'):
        # Title, with the leading "12. " ranking prefix removed.
        heading = item.find('h3', class_='ipc-title__text')
        titles.append(re.sub(r'^\d+\.\s*', '', heading.text.strip()) if heading else None)

        # The metadata spans are positional: year, duration, age limit.
        meta = [span.text.strip() for span in item.find_all('span', class_='sc-caa65599-7')]
        meta.extend([None] * (3 - len(meta)))  # pad missing positions with None
        years.append(meta[0])
        durations.append(meta[1])
        age_limits.append(meta[2])

        star = item.find('span', class_='ipc-rating-star--rating')
        ratings.append(star.text.strip() if star else None)

        # The movie id is the "tt1234567" part of the title link.
        anchor = item.find('a', class_='ipc-title-link-wrapper')
        id_match = None
        if anchor and 'href' in anchor.attrs:
            id_match = re.search(r'/title/(tt\d+)/', anchor['href'])
        movie_ids.append(id_match.group(1) if id_match else None)

    return titles, years, ratings, durations, age_limits, movie_ids

def _scroll_to_bottom(browser, pause=2):
    """Scroll ``browser`` down until the document height stops growing."""
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height


def get_movies(year, num=250):
    """Collect up to ``num`` feature films released in ``year`` from IMDb search.

    Loads the search page (sorted by number of votes, descending) in the
    module-level ``driver``, then alternates between parsing the page with
    ``parser`` and clicking the "see more" button, for at most 10 clicks.

    Args:
        year: Release year used for both ends of the release-date filter.
        num: Maximum number of films to return.

    Returns:
        Six parallel lists ``(titles, years, ratings, durations, age_limits,
        movie_ids)``, each truncated to ``num`` items.
    """
    url = f"https://www.imdb.com/search/title/?title_type=feature&release_date={year}-01-01,{year}-12-31&sort=num_votes,desc"
    driver.get(url)
    time.sleep(3)
    _scroll_to_bottom(driver)

    def row_key(row):
        # Prefer the IMDb id: two different films may share a title, so
        # deduplicating on the title alone would silently drop one of them.
        title, movie_id = row[0], row[5]
        return movie_id if movie_id is not None else title

    rows = []  # tuples in parser() order: title, year, rating, duration, age, id
    clicks = 0

    while len(rows) < num and clicks < 10:
        # parser() re-reads the whole page, so skip rows collected earlier.
        seen = {row_key(row) for row in rows}
        for row in zip(*parser(driver)):
            if row_key(row) not in seen:
                rows.append(row)

        if len(rows) >= num:
            break

        try:
            more = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.ipc-see-more__button"))
            )
            driver.execute_script("arguments[0].click();", more)
            time.sleep(3)
            _scroll_to_bottom(driver)
            clicks += 1
        except Exception:
            # No clickable "see more" button (or the click failed): keep what
            # we have. `Exception` rather than a bare except, so Ctrl-C and
            # SystemExit still propagate.
            break

    rows = rows[:num]
    if not rows:
        return [], [], [], [], [], []
    return tuple(list(column) for column in zip(*rows))

# One dict per scraped film. NOTE: the name shadows the builtin `all`; it is
# kept because the cell that builds `imdb_df` reads this variable.
all = []

try:
    for year in tqdm(range(1974, 2025)):
        titles, years_list, ratings, durations, age_limits, movie_ids = get_movies(year, 200)

        for title, year_val, rating, duration, age_limit, movie_id in zip(titles, years_list, ratings, durations, age_limits, movie_ids):
            all.append({
                'title': title,
                'year': year_val,
                'rating_IMDB': rating,
                'duration': duration,
                'age_limit': age_limit,
                'movie_id': movie_id
            })
        # Pause between years to avoid hammering the site.
        time.sleep(2)
finally:
    # Always release the browser, even when a year fails or the run is
    # interrupted; otherwise the Chrome process is left running.
    driver.quit()

  8%|▊         | 4/51 [02:15<26:25, 33.74s/it]

In [None]:
# Build the IMDb frame from the scraped records (one dict per film) and show it.
imdb_df = pd.DataFrame(all)
imdb_df

In [None]:
# Scraped values are text: convert the year to int and the rating to float.
imdb_df = imdb_df.astype({'year': int, 'rating_IMDB': float})

In [None]:
# Turn strings such as "2h 15m" into total minutes; a missing part counts as 0.
hours = imdb_df['duration'].str.extract(r'(\d+)h')[0].fillna(0).astype(int)
minutes = imdb_df['duration'].str.extract(r'(\d+)m')[0].fillna(0).astype(int)
imdb_df['duration'] = (hours * 60 + minutes).astype(int)
imdb_df

In [None]:
# Number of missing values per column of the IMDb frame.
imdb_df.isna().sum()

In [None]:
# Checkpoint the scraped IMDb data to CSV (without the index column).
imdb_df.to_csv('временный_датасет.csv', index=False)

In [None]:
# The TMDB key is a credential and must not be stored in the notebook:
# read it from the TMDB_API_KEY environment variable, falling back to a
# hidden interactive prompt. A key that was committed earlier should be
# considered leaked and be rotated.
import os
from getpass import getpass

API_KEY = os.environ.get("TMDB_API_KEY") or getpass("TMDB API key: ")

def get_tmdb(year, page=1, timeout=10):
    """Fetch one page of TMDB's discover endpoint for a release year.

    Results are requested in English, sorted by vote count (descending).

    Args:
        year: Value sent as ``primary_release_year``.
        page: Page number of the result set.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection problems or timeout.
    """
    url = "https://api.themoviedb.org/3/discover/movie"
    params = {
        'api_key': API_KEY,
        'language': 'en-US',
        'sort_by': 'vote_count.desc',
        'primary_release_year': year,
        'page': page
    }
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()

def details(movie_id, timeout=10):
    """Fetch the TMDB detail record of one movie, including release dates.

    Args:
        movie_id: TMDB movie id, inserted into the request path.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The decoded JSON body; ``release_dates`` is requested through
        ``append_to_response``.

    Raises:
        requests.RequestException: On connection problems or timeout.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {
        'api_key': API_KEY,
        'language': 'en-US',
        'append_to_response': 'release_dates'
    }
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()

def age(release_dates):
    """Pick an age certification from a TMDB ``release_dates`` payload.

    A non-empty certification from any US release is preferred. If the US has
    none, the first non-empty certification of any country (in payload order)
    is used.

    Args:
        release_dates: Dict with a ``"results"`` list whose items carry
            ``"iso_3166_1"`` and a ``"release_dates"`` list of dicts with a
            ``"certification"`` key; may be ``None`` or empty.

    Returns:
        The certification string, or ``None`` when nothing is available.
    """
    if not release_dates or "results" not in release_dates:
        return None
    results = release_dates["results"] or []

    def first_certification(entries):
        for entry in entries:
            for release in entry.get("release_dates", []):
                cert = release.get("certification")
                if cert:
                    return cert
        return None

    # Look at every US release, not only the first one: the first entry is
    # often a premiere without a rating while a later one carries it.
    us_entries = [entry for entry in results if entry.get("iso_3166_1") == "US"]
    return first_certification(us_entries) or first_certification(results)

tmdb_data = []
# Requests made in the current rate-limit window and the window's start time.
count = 0
start_time = time.time()

period = 2025 - 1974
total_movies = period * 200  # 200 films per year; used as the progress-bar total


def _wait_for_rate_limit():
    """Keep the request rate at no more than 40 requests per 10 seconds.

    Once 40 requests have been counted, sleeps for the remainder of the
    10-second window (if any) and starts a new window.
    """
    global count, start_time
    if count >= 40:
        time_passed = time.time() - start_time
        if time_passed < 10:
            time.sleep(10 - time_passed)
        count = 0
        start_time = time.time()


with tqdm(total=total_movies) as pbar:
    for year in range(1974, 2025):
        try:
            collected = 0
            page = 1

            while collected < 200:
                _wait_for_rate_limit()
                data = get_tmdb(year, page)
                count += 1

                # No results (or an error payload): nothing more for this year.
                if not data.get('results'):
                    break

                for movie in data['results']:
                    if collected >= 200:
                        break

                    _wait_for_rate_limit()
                    movie_details = details(movie['id'])
                    count += 1

                    countries = [country['name'] for country in movie_details.get('production_countries', [])]

                    budget = movie_details.get('budget')
                    revenue = movie_details.get('revenue')
                    duration = movie_details.get('runtime')

                    # Zero budget / revenue / runtime is stored as missing.
                    tmdb_data.append({
                        'title': movie['title'],
                        'year': year,
                        'country': ', '.join(countries) if countries else None,
                        'rating_TMDB': movie['vote_average'],
                        'age_rating': age(movie_details.get('release_dates')),
                        'genres': ', '.join([genre['name'] for genre in movie_details.get('genres', [])]) if movie_details.get('genres') else None,
                        'revenue': revenue if revenue != 0 else None,
                        'budget': budget if budget != 0 else None,
                        'duration': duration if duration != 0 else None
                    })

                    collected += 1
                    pbar.update(1)
                    time.sleep(0.1)

                page += 1
                if page > data['total_pages']:
                    break

        except Exception as e:
            # Best effort: give up on this year and continue with the next,
            # but report it so an incomplete year is not silently accepted.
            tqdm.write(f"TMDB: year {year} aborted after {collected} movies: {e!r}")
            time.sleep(1)

tmdb_df = pd.DataFrame(tmdb_data)

In [None]:
# Checkpoint the collected TMDB data to CSV (without the index column).
tmdb_df.to_csv('временный_датасет2.0.csv', index=False)

In [None]:
# Number of missing values per column of the TMDB frame.
tmdb_df.isna().sum()

In [None]:
# Remove every character that is not a word character, whitespace or '-' from
# the TMDB titles.
# NOTE(review): only the TMDB side is normalised here; confirm that the IMDb
# titles used as the merge key are treated consistently.
tmdb_df['title'] = tmdb_df['title'].str.replace(r'[^\w\s-]', '', regex=True)

In [None]:
# Display the TMDB frame.
tmdb_df

In [None]:
def _title_key(titles):
    """Normalise titles for joining: drop everything except word characters,
    whitespace and '-'."""
    return titles.str.replace(r'[^\w\s-]', '', regex=True)


# The TMDB titles were stripped of punctuation, the IMDb titles were not, so
# joining on the raw `title` column never matches titles such as
# "One Flew Over the Cuckoo's Nest". Join on a key normalised the same way on
# both sides and keep the original IMDb title in the result.
imdb_part = imdb_df[['title', 'year', 'rating_IMDB', 'duration', 'age_limit', 'movie_id']].assign(
    title_key=lambda frame: _title_key(frame['title'])
)
tmdb_part = (
    tmdb_df[['title', 'country', 'rating_TMDB', 'age_rating', 'genres', 'revenue', 'budget']]
    .assign(title_key=lambda frame: _title_key(frame['title']))
    .drop(columns='title')
)

# Left join: every IMDb row is kept; TMDB columns are NaN where nothing matches.
data = pd.merge(imdb_part, tmdb_part, on='title_key', how='left').drop(columns='title_key')
data

In [None]:
# Number of rows whose title already appeared in an earlier row.
data['title'].duplicated().sum()

In [None]:
# Keep only the first row for each title.
# NOTE(review): the key is the title alone, so different films that share a
# title (e.g. remakes from different years) are collapsed into one row —
# confirm that this is intended.
data = data.drop_duplicates(subset=['title'], keep='first')
data

In [None]:
# Average the IMDb and TMDB ratings (ignoring a missing one), then drop the
# two source columns.
rating_sources = ['rating_IMDB', 'rating_TMDB']
data = data.assign(
    rating=data[rating_sources].mean(axis=1, skipna=True)
).drop(rating_sources, axis=1)
data

In [None]:
# Use the TMDB certification where the IMDb age limit is missing, then drop
# the TMDB column.
data = data.assign(
    age_limit=data['age_limit'].fillna(data['age_rating'])
).drop(['age_rating'], axis=1)

In [None]:
# Missing values per column after the merge.
data.isna().sum()

In [None]:
def get_countries(movie_id, delay_range=(1, 3), timeout=15):
    """Scrape the countries of origin from a film's IMDb title page.

    Args:
        movie_id: IMDb id (e.g. ``"tt0111161"``) inserted into the page URL.
        delay_range: ``(low, high)`` bounds, in seconds, of the random pause
            taken before the request.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the whole run.

    Returns:
        List of country names; an empty list when the section is missing or
        the request/parsing fails (best effort, errors are not raised).
    """
    url = f"https://www.imdb.com/title/{movie_id}/"

    time.sleep(random.uniform(*delay_range))

    try:
        # The context manager closes the session's connections on exit.
        with requests.Session() as session:
            session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            })
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            html = response.content

        soup = BeautifulSoup(html, 'html.parser')

        strana = soup.find('li', {
            'data-testid': 'title-details-origin'
        })
        if strana is None:
            return []

        strana_links = strana.find_all('a', {
            'class': 'ipc-metadata-list-item__list-content-item--link'
        })
        return [link.get_text(strip=True) for link in strana_links]

    except Exception:
        # Covers requests.RequestException as well; both cases yield "no data".
        return []

def parser_stran(data, country_col='country', movie_id_col='movie_id', delay_range=(1, 3)):
    """Fill missing countries in a copy of ``data`` using ``get_countries``.

    For each row whose ``country_col`` is NaN and whose ``movie_id_col`` is
    present and non-empty, the scraped countries are joined with ", ";
    when nothing is scraped the cell is set to ``'Не найдено'``.
    The input frame is left untouched; the filled copy is returned.
    """
    df = data.copy()

    missing_rows = df.index[df[country_col].isna()]

    for row_label in tqdm(missing_rows):
        imdb_id = df.loc[row_label, movie_id_col]

        # Rows without a usable id cannot be looked up.
        if pd.isna(imdb_id) or imdb_id == '':
            continue

        found = get_countries(imdb_id, delay_range)
        df.loc[row_label, country_col] = ', '.join(found) if found else 'Не найдено'

    return df


# Fill missing countries from IMDb title pages, pausing 1-2 s before each request.
data = parser_stran(
    data,
    country_col='country',
    movie_id_col='movie_id',
    delay_range=(1, 2)
)

In [None]:
# Missing values per column after filling the countries.
data.isna().sum()

In [None]:
def get_genres(movie_id, delay=(1, 3), timeout=15):
    """Scrape the genres from a film's IMDb title page.

    The storyline "genres" list item is tried first; if it yields nothing,
    every link pointing at ``/search/title/?genres=`` is used instead
    (de-duplicated, in page order).

    Args:
        movie_id: IMDb id inserted into the page URL.
        delay: ``(low, high)`` bounds, in seconds, of the random pause taken
            before the request.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the whole run.

    Returns:
        List of genre names; empty when nothing is found or on any error.
    """
    url = f"https://www.imdb.com/title/{movie_id}/"

    time.sleep(random.uniform(*delay))

    try:
        # The context manager closes the session's connections on exit.
        with requests.Session() as session:
            session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            html = response.content

        soup = BeautifulSoup(html, 'html.parser')

        genres_elem = soup.find('li', {'data-testid': 'storyline-genres'})
        if genres_elem is not None:
            genre_links = genres_elem.find_all('a', class_='ipc-metadata-list-item__list-content-item--link')
            genres = [link.get_text(strip=True) for link in genre_links]
            if genres:
                return genres

        # Fallback: any genre search link on the page.
        genres = []
        for link in soup.find_all('a', href=lambda href: href and '/search/title/?genres=' in href):
            genre_text = link.get_text(strip=True)
            if genre_text and genre_text not in genres:
                genres.append(genre_text)
        return genres

    except Exception:
        # Best effort: a failed request or parse simply yields no genres.
        return []

def parse_genres(data, genre_col='genres', movie_id_col='movie_id', delay=(1, 3)):
    """Fill missing genres in a copy of ``data`` using ``get_genres``.

    Rows whose ``genre_col`` is NaN and whose ``movie_id_col`` is present and
    non-empty are looked up; scraped genres are joined with ", ". Rows for
    which nothing is scraped stay NaN. The input frame is not modified.
    """
    df = data.copy()

    missing_rows = df.index[df[genre_col].isna()]

    for row_label in tqdm(missing_rows):
        imdb_id = df.loc[row_label, movie_id_col]

        # Rows without a usable id cannot be looked up.
        if pd.isna(imdb_id) or imdb_id == '':
            continue

        found = get_genres(imdb_id, delay)
        if found:
            df.loc[row_label, genre_col] = ', '.join(found)

    return df

# Fill missing genres from IMDb title pages, pausing 1-2 s before each request.
data = parse_genres(
    data,
    genre_col='genres',
    movie_id_col='movie_id',
    delay=(1, 2)
)

In [None]:
# Missing values per column after filling the genres.
data.isna().sum()

In [None]:
# Display the current state of the merged frame.
data

In [None]:
def parse_money(text):
    """Convert a money string such as ``"$1,234,567"`` to a float.

    Everything except digits and dots is discarded before conversion.

    Args:
        text: The string to parse; falsy values (``None``, ``""``) are accepted.

    Returns:
        The amount as ``float``, or ``None`` when ``text`` is empty, is not a
        string, contains no digits, or does not form a valid number.
    """
    if not text:
        return None
    try:
        clean = re.sub(r'[^\d.]', '', text)
        return float(clean) if clean else None
    except (TypeError, ValueError):
        # TypeError: non-string input; ValueError: e.g. "1.2.3".
        # Narrower than a bare except, so KeyboardInterrupt still propagates.
        return None

def _money_near_label(soup, labels, max_depth):
    """Return the first usable amount found next to one of ``labels``.

    For every text node matching a label (case-insensitive), walks up at most
    ``max_depth`` ancestors and parses the first ``<span class="money">``
    found below one of them. Returns ``None`` when no non-zero amount is found.
    """
    for label in labels:
        for text_node in soup.find_all(string=re.compile(label, re.IGNORECASE)):
            node = text_node.parent
            for _ in range(max_depth):
                if node is None:
                    break
                money_span = node.find('span', class_='money')
                if money_span is not None:
                    amount = parse_money(money_span.get_text(strip=True))
                    if amount:
                        return amount
                    break  # unusable amount here: try the next text node
                node = node.parent
    return None


def get_box_office(movie_id, delay=(1, 3), timeout=15):
    """Scrape worldwide gross and budget from a Box Office Mojo title page.

    Args:
        movie_id: IMDb id inserted into the Box Office Mojo URL.
        delay: ``(low, high)`` bounds, in seconds, of the random pause taken
            before the request.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the whole run.

    Returns:
        ``{'revenue': float | None, 'budget': float | None}``; both values
        are ``None`` when the request or parsing fails.
    """
    url = f"https://www.boxofficemojo.com/title/{movie_id}/"

    time.sleep(random.uniform(*delay))

    try:
        # The context manager closes the session's connections on exit.
        with requests.Session() as session:
            session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            html = response.content

        soup = BeautifulSoup(html, 'html.parser')

        # Revenue is searched up to 5 ancestors above the label, budget up to 3.
        revenue = _money_near_label(soup, ['Worldwide', 'Worldwide Gross'], 5)
        budget = _money_near_label(soup, ['Budget', 'Production Budget'], 3)

        return {'revenue': revenue, 'budget': budget}

    except Exception:
        # Best effort: any failure means "no data" for this film.
        return {'revenue': None, 'budget': None}

def parse_box_office(data, revenue_col='revenue', budget_col='budget', movie_id_col='movie_id', delay=(1, 3)):
    """Fill missing revenue/budget in a copy of ``data`` via ``get_box_office``.

    Every row where revenue or budget is NaN and the movie id is present and
    non-empty is looked up once; only cells that are still NaN are
    overwritten, and only with truthy scraped values.
    """
    df = data.copy()

    needs_fill = df[revenue_col].isna() | df[budget_col].isna()

    for row_label in tqdm(df.index[needs_fill], desc=""):
        imdb_id = df.loc[row_label, movie_id_col]

        # Rows without a usable id cannot be looked up.
        if pd.isna(imdb_id) or imdb_id == '':
            continue

        scraped = get_box_office(imdb_id, delay)

        for column, key in ((revenue_col, 'revenue'), (budget_col, 'budget')):
            if pd.isna(df.loc[row_label, column]) and scraped[key]:
                df.loc[row_label, column] = scraped[key]

    return df


# Fill missing revenue/budget from Box Office Mojo, pausing 1-2 s before each request.
data = parse_box_office(
    data,
    revenue_col='revenue',
    budget_col='budget',
    movie_id_col='movie_id',
    delay=(1, 2)
)

In [None]:
# Missing values per column after the Box Office Mojo pass.
data.isna().sum()

In [None]:
def get_imdb_box_office(movie_id, delay=(1, 3), timeout=15):
    """Scrape worldwide gross and budget texts from an IMDb title page.

    Args:
        movie_id: IMDb id inserted into the page URL.
        delay: ``(low, high)`` bounds, in seconds, of the random pause taken
            before the request.
        timeout: Seconds to wait for the server; without it a stalled
            connection would block the whole run.

    Returns:
        ``{'revenue': str | None, 'budget': str | None}``. The values are the
        raw page texts (not numbers); both are ``None`` on any failure.
    """
    url = f"https://www.imdb.com/title/{movie_id}/"

    time.sleep(random.uniform(*delay))

    def first_item_text(list_item):
        # Text of the first content span inside a metadata list item, if any.
        if list_item is None:
            return None
        span = list_item.find('span', {
            'class': 'ipc-metadata-list-item__list-content-item'
        })
        return span.get_text(strip=True) if span is not None else None

    try:
        # The context manager closes the session's connections on exit.
        with requests.Session() as session:
            session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            html = response.content

        soup = BeautifulSoup(html, 'html.parser')

        # Revenue is only looked up inside the box-office section ...
        revenue = None
        box_office = soup.find('div', {'data-testid': 'title-boxoffice-section'})
        if box_office is not None:
            revenue = first_item_text(box_office.find('li', {
                'data-testid': 'title-boxoffice-cumulativeworldwidegross'
            }))

        # ... while the budget item is searched in the whole page.
        budget = first_item_text(soup.find('li', {'data-testid': 'title-boxoffice-budget'}))

        return {'revenue': revenue, 'budget': budget}

    except Exception:
        # Best effort: any failure means "no data" for this film.
        return {'revenue': None, 'budget': None}

def parse_imdb_box_office(data, revenue_col='revenue', budget_col='budget', movie_id_col='movie_id', delay=(1, 3)):
    """Fill missing revenue/budget in a copy of ``data`` via ``get_imdb_box_office``.

    Every row where revenue or budget is NaN and the movie id is present and
    non-empty is looked up once; only cells that are still NaN are
    overwritten, and only with truthy scraped values. The scraped values are
    raw strings (e.g. with a currency symbol), so both columns are returned
    with ``object`` dtype.
    """
    df = data.copy()

    # The scraper returns text. Writing a string into a float64 column is
    # deprecated in pandas 2.1+ (and an error later), so make the target
    # columns object-typed up front; NaN cells stay NaN.
    for column in (revenue_col, budget_col):
        df[column] = df[column].astype(object)

    needs_fill = df[revenue_col].isna() | df[budget_col].isna()

    for idx in tqdm(df.index[needs_fill], desc=""):
        movie_id = df.loc[idx, movie_id_col]

        # Rows without a usable id cannot be looked up.
        if pd.isna(movie_id) or movie_id == '':
            continue

        result = get_imdb_box_office(movie_id, delay)

        for column, key in ((revenue_col, 'revenue'), (budget_col, 'budget')):
            if pd.isna(df.loc[idx, column]) and result[key]:
                df.loc[idx, column] = result[key]

    return df


# Fill the remaining revenue/budget gaps from IMDb title pages, pausing 1-2 s
# before each request.
data = parse_imdb_box_office(
    data,
    revenue_col='revenue',
    budget_col='budget',
    movie_id_col='movie_id',
    delay=(1, 2)
)

In [None]:
# Approximate exchange rates, keyed by currency marker and year, expressed as
# units of the foreign currency per 1 US dollar (amount / rate = USD).
exc = {
    '€': {2000: 0.94, 2004: 0.80, 2008: 0.68, 2010: 0.75, 2015: 0.90, 2020: 0.88, 2024: 0.92},
    # The pound figures are quoted as USD per GBP, so they are inverted here to
    # match the "units per dollar" convention of every other row.
    '£': {1980: 1 / 2.33, 1990: 1 / 1.65, 2000: 1 / 1.52, 2010: 1 / 1.54, 2020: 1 / 1.29},
    'DEM': {1975: 2.50, 1985: 2.90, 1990: 1.65, 1995: 1.45, 1999: 1.06},
    'FRF': {1980: 4.20, 1990: 5.40, 1995: 5.05, 2000: 6.56},
    'ITL': {1980: 880.0, 1990: 1200.0, 1995: 1600.0},
    'ESP': {1980: 75.0, 1990: 100.0},
    'NLG': {1977: 2.35, 1990: 1.82},
    'IEP': {1990: 0.77},
    'FIM': {1980: 3.86, 1990: 3.91, 2000: 5.95},
    'DKK': {1990: 6.4, 2000: 7.0},
    'SEK': {1980: 4.23, 1990: 6.0, 2000: 9.0},
    'NOK': {1980: 5.0, 1990: 6.3, 2000: 8.0},
    '¥': {1980: 225, 1990: 145, 2000: 108, 2010: 88, 2020: 106},
    'CN¥': {1980: 1.5, 1990: 4.7, 2000: 8.3, 2010: 6.8, 2020: 6.9},
    '₹': {1980: 8.0, 1990: 17.5, 2000: 45.0, 2010: 45.7, 2020: 74.0},
}

def convert(row):
    """Convert a row's budget to a float amount in US dollars.

    The budget may be a number or a text such as ``"£1,000,000 (estimated)"``.
    If the text contains one of the currency markers in ``exc``, the amount is
    divided by the rate of the table year closest to ``row['year']``;
    otherwise the digits are returned unchanged.

    Args:
        row: Mapping (dict or DataFrame row) with ``'budget'`` and ``'year'``.

    Returns:
        The amount as ``float``, or ``None`` when the budget or year is
        missing or the budget contains no parsable number.
    """
    budget = row['budget']
    # Test for missing values before str(): str(nan) is the string 'nan',
    # which pd.isna() no longer recognises.
    if pd.isna(budget) or pd.isna(row['year']):
        return None
    year = int(row['year'])

    val = str(budget)
    digits = re.sub(r"[^\d.]", "", val)
    if not digits:
        return None
    try:
        amount = float(digits)
    except ValueError:  # e.g. several dots left after stripping
        return None

    # Longest markers first, so that 'CN¥' is not mistaken for '¥'.
    for cur in sorted(exc, key=len, reverse=True):
        if cur in val:
            rates = exc[cur]
            near = min(rates, key=lambda y: abs(y - year))
            return amount / rates[near]

    # No known foreign marker: the amount is taken as is.
    return amount

# Replace the budget column with the row-wise result of convert().
data['budget'] = data.apply(convert, axis=1)


In [None]:
def clean_money(value):
    """Turn a money value into a float where possible.

    Missing values are returned unchanged. Otherwise the value is converted to
    text, ``$`` and ``,`` are removed, and the rest is parsed as a float. If
    parsing fails, the (uncleaned) text form of the value is returned.
    """
    if pd.isna(value):
        return value

    text = str(value)
    digits = text.replace('$', '').replace(',', '')
    try:
        number = float(digits)
    except ValueError:
        return text
    return number

# Normalise both money columns, skipping any that is absent from the frame.
for col in ('budget', 'revenue'):
    if col not in data.columns:
        continue
    data[col] = data[col].apply(clean_money)

In [None]:
# Write the final dataset to CSV (without the index column).
data.to_csv("итоговый_датасет.csv", index=False)


In [None]:
import pandas as pd
import numpy as np
from google.colab import files
import io

# Upload a file from the local machine
uploaded = files.upload()

# Take the name of the (first) uploaded file
filename = list(uploaded.keys())[0]
print(f"Загружен файл: {filename}")

# Read the uploaded bytes into a pandas DataFrame
data = pd.read_csv(io.BytesIO(uploaded[filename]))

# Show the first rows. The frame is bound to `data`; the previous `df` was
# never defined at module level and raised NameError on a fresh kernel.
print(data.head())