# Разведочный анализ данных IMDb

## Предполагается исследовать рейтинги, бюджет и доходы в эквиваленте usd

### Импорт библиотек, загрузка данных

In [49]:
import kagglehub as kh
import pandas as pd

### В работе будет использован объединенный набор данных из открытых источников (ресурс kaggle)

<a href="https://www.kaggle.com/datasets/hetbabariya/imdb-movies-data-collection-5000-records">IMDb Movies Data
Collection (5000 Records)</a>

Описание данных датасета:
1. Title - Наименование фильма;
2. Average Rating - Усредненный рейтинг IMDb;
3. Director - Режиссер;
4. Writer - Сценарист;
5. Metascore - Оценки из <a href="https://www.metacritic.com/">Metacritic</a>;
6. Cast - Ключевые актеры;
7. Release Date - Дата выхода.
8. Country of Origin - Страна происхождения;
9. Languages - Используемые языки.
10. Budget - Бюджет;
11. Worldwide Gross - Кассовые сборы;
12. Runtime - Длительность.

<a href="https://www.kaggle.com/datasets/gayu14/tv-and-movie-metadata-with-genres-and-ratings-imbd">TV & Movie Metadata
with Genres and Ratings (2023) 130000 Records)</a>

Описание данных датасета:
1. movie - Наименование фильма;
2. genre - Жанр;
3. runtime - Длительность;
4. certificate - Возрастное ограничение;
5. rating - Усредненный рейтинг;
6. stars - Ключевые актеры;
7. description - Описание.
8. votes - Голоса или рейтинг среди оценщиков;
9. director - Режиссер.

## Объединенный датасет будет содержать следующие поля:


In [50]:
# (~5000 rows)
# Download latest version
path_5 = kh.dataset_download("hetbabariya/imdb-movies-data-collection-5000-records")
print("Path to dataset files:", path_5)
ds_5 = pd.read_csv(path_5 + '/IMDB_Movies_Dataset.csv')

Path to dataset files: /home/kvs/.cache/kagglehub/datasets/hetbabariya/imdb-movies-data-collection-5000-records/versions/1


In [51]:
# (~130000 rows)
# Download latest version
path_130 = kh.dataset_download("gayu14/tv-and-movie-metadata-with-genres-and-ratings-imbd")
print("Path to dataset files:", path_130)
ds_130 = pd.read_csv(path_130 + '/IMBD.csv')

Path to dataset files: /home/kvs/.cache/kagglehub/datasets/gayu14/tv-and-movie-metadata-with-genres-and-ratings-imbd/versions/1


In [53]:
ds_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4989 entries, 0 to 4988
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         4989 non-null   int64  
 1   Title              4989 non-null   object 
 2   Average Rating     4989 non-null   float64
 3   Director           4989 non-null   object 
 4   Writer             4988 non-null   object 
 5   Metascore          3055 non-null   float64
 6   Cast               4982 non-null   object 
 7   Release Date       4989 non-null   object 
 8   Country of Origin  4986 non-null   object 
 9   Languages          4968 non-null   object 
 10  Budget             2651 non-null   object 
 11  Worldwide Gross    3895 non-null   object 
 12  Runtime            4989 non-null   object 
dtypes: float64(2), int64(1), object(10)
memory usage: 506.8+ KB


In [52]:
ds_130.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129891 entries, 0 to 129890
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   movie        129891 non-null  object 
 1   genre        129891 non-null  object 
 2   runtime      109005 non-null  object 
 3   certificate  23850 non-null   object 
 4   rating       114381 non-null  float64
 5   stars        124676 non-null  object 
 6   description  129891 non-null  object 
 7   votes        114393 non-null  object 
 8   director     88611 non-null   object 
dtypes: float64(1), object(8)
memory usage: 8.9+ MB


In [54]:
ds_130.director

0                               NaN
1         ['Christopher McQuarrie']
2          ['Alejandro Monteverde']
3                               NaN
4                               NaN
                    ...            
129886          ['Diego Al Romero']
129887          ['Harry Joe Brown']
129888               ['Tom Forman']
129889              ['Al Christie']
129890        ['Milton J. Fahrney']
Name: director, Length: 129891, dtype: object

In [55]:
ds_130['director'] = ds_130['director'].replace("'", "", regex=True)
ds_130['director'] = ds_130['director'].replace("\\[", "", regex=True)
ds_130['director'] = ds_130['director'].replace("\\]", "", regex=True)
ds_130.director

0                           NaN
1         Christopher McQuarrie
2          Alejandro Monteverde
3                           NaN
4                           NaN
                  ...          
129886          Diego Al Romero
129887          Harry Joe Brown
129888               Tom Forman
129889              Al Christie
129890        Milton J. Fahrney
Name: director, Length: 129891, dtype: object

In [56]:
ds_inner = ds_5.merge(ds_130, left_on=['Title', 'Director'], right_on=['movie', 'director'], how='inner')

In [57]:
ds_inner.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3547 entries, 0 to 3546
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         3547 non-null   int64  
 1   Title              3547 non-null   object 
 2   Average Rating     3547 non-null   float64
 3   Director           3547 non-null   object 
 4   Writer             3546 non-null   object 
 5   Metascore          2335 non-null   float64
 6   Cast               3545 non-null   object 
 7   Release Date       3547 non-null   object 
 8   Country of Origin  3545 non-null   object 
 9   Languages          3535 non-null   object 
 10  Budget             2079 non-null   object 
 11  Worldwide Gross    2799 non-null   object 
 12  Runtime            3547 non-null   object 
 13  movie              3547 non-null   object 
 14  genre              3547 non-null   object 
 15  runtime            3523 non-null   object 
 16  certificate        2242 

In [28]:
ds_inner.columns

Index(['Title', 'Average Rating', 'Director', 'Writer', 'Metascore', 'Cast',
       'Release Date', 'Country of Origin', 'Languages', 'Budget',
       'Worldwide Gross', 'Runtime', 'movie', 'genre', 'runtime',
       'certificate', 'rating', 'stars', 'description', 'votes', 'director'],
      dtype='object')

In [62]:
ds_inner.dropna(axis=0)
ds_inner.reset_index(drop=True, inplace=True)
ds_inner = ds_inner.drop(columns=['Unnamed: 0'])

KeyError: "['Unnamed: 0'] not found in axis"

In [63]:
ds = ds_inner.where(ds_inner.Budget.str.contains('$') == True)

In [64]:
ds.shape

(3547, 21)

In [65]:
ds.to_csv('~/.cache/kagglehub/datasets/ds.csv')