In [1]:
import pandas as pd
import numpy as np
import re

# Обзор и предобработка данных

In [2]:
# Load the raw movies table from 'movies.csv' (path is relative to the working directory)
dataset = pd.read_csv('movies.csv')

In [3]:
dataset.head(n=2)

Unnamed: 0,poster,title,certificate,runtime,genre,rating,about,director,stars,votes,gross_earn
0,https://m.media-amazon.com/images/S/sash/4Fyxw...,The Shawshank Redemption,15,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,Frank Darabont,"('Tim Robbins',), ('Morgan Freeman',), ('Bob G...",2626905,$28.34M
1,https://m.media-amazon.com/images/S/sash/4Fyxw...,The Dark Knight,12A,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,Christopher Nolan,"('Christian Bale',), ('Heath Ledger',), ('Aaro...",2598173,$534.86M


In [4]:
dataset.shape

(5000, 11)

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   poster       5000 non-null   object 
 1   title        5000 non-null   object 
 2   certificate  5000 non-null   object 
 3   runtime      5000 non-null   object 
 4   genre        4970 non-null   object 
 5   rating       5000 non-null   float64
 6   about        4986 non-null   object 
 7   director     5000 non-null   object 
 8   stars        5000 non-null   object 
 9   votes        5000 non-null   object 
 10  gross_earn   4585 non-null   object 
dtypes: float64(1), object(10)
memory usage: 429.8+ KB


In [6]:
dataset['title'].nunique(), "уникальных фильмов из", len(dataset)

(4891, 'уникальных фильмов из', 5000)

In [7]:
dataset.describe(include='all')

Unnamed: 0,poster,title,certificate,runtime,genre,rating,about,director,stars,votes,gross_earn
count,5000,5000,5000.0,5000,4970,5000.0,4986,5000,5000,5000.0,4585
unique,1,4891,39.0,178,346,,4976,1969,4969,4918.0,3372
top,https://m.media-amazon.com/images/S/sash/4Fyxw...,Godzilla,15.0,97 min,"Comedy, Drama, Romance",,The story of,Woody Allen,"('William Shatner',), ('Leonard Nimoy',), ('De...",35423.0,$0.01M
freq,5000,3,1843.0,139,242,,8,38,6,2.0,27
mean,,,,,,6.6923,,,,,
std,,,,,,0.973649,,,,,
min,,,,,,1.5,,,,,
25%,,,,,,6.1,,,,,
50%,,,,,,6.8,,,,,
75%,,,,,,7.4,,,,,


In [8]:
# Flag rows that repeat an earlier row in every column (full-row duplicates only;
# rows that merely share a title are not counted here), then report the total
duplicates = dataset.duplicated()
print(f"Количество дубликатов: {duplicates.sum()}")


Количество дубликатов: 0


In [9]:
# Count missing values in each column
missing_data = dataset.isnull().sum()
print(f"Пропущенные значения:\n{missing_data}")

Пропущенные значения:
poster           0
title            0
certificate      0
runtime          0
genre           30
rating           0
about           14
director         0
stars            0
votes            0
gross_earn     415
dtype: int64


In [10]:
dataset.isna().sum()

poster           0
title            0
certificate      0
runtime          0
genre           30
rating           0
about           14
director         0
stars            0
votes            0
gross_earn     415
dtype: int64

# Вывод
1. Размеры датасета:
    - Датасет содержит 5000 записей и 11 признаков;
    - Всего в датасете представлено 5000 записей, в которых встречается 4891 уникальное название фильма (столбец `title`);
2. Типы данных:
    - Основные данные представлены в виде строковых (object) и числовых (float64) типов.
    - Некоторые столбцы требуют приведения типов:
        - runtime (время) — преобразовать в целочисленный формат;
        - votes (голоса) — преобразовать из строки с запятыми в числовой формат;
        - gross_earn (доход) — содержит символ $ и суффикс M (например, $28.34M), необходимо привести к числовому формату.
3. Дубликаты:
    - Полных дубликатов нет;
    - Выявлено 109 записей с повторяющимся названием фильма (`title`): 5000 − 4891 = 109, что составляет 2,18% от общего числа записей.
4. Пропущенные значения:
    - Пропущенные значения в столбцах:
        - genre (жанр) - 30 пропущенных значений (0,6%);
        - about - 14 пропущенных значений (0,28%);
        - gross_earn - 415 пропущенных значений (8,3%)
    - Пропуски в `genre` и `about` можно заполнить строковым значением 'Unknown' или удалить, если их доля мала.
    - Для gross_earn пропуски необходимо заполнить либо удалить строки с ними.
5. Возможные проблемы при анализе:
    - Неправильные типы данных (например, votes и gross_earn) могут привести к ошибкам в расчетах.
    - Пропущенные значения в gross_earn могут исказить результаты анализа доходов фильмов.
    - Некоторые строковые признаки, такие как genre, требуют дополнительной обработки для анализа (например, разбиение на подкатегории).
    - Малая доля пропусков в about и genre вряд ли критична, но требует внимания.

# Преобразование типов колонок

In [11]:
# Work on a copy so the cleaning steps below do not modify the raw `dataset` frame
df_copy = dataset.copy()

In [12]:
df_copy.columns

Index(['poster', 'title', 'certificate', 'runtime', 'genre', 'rating', 'about',
       'director', 'stars', 'votes', 'gross_earn'],
      dtype='object')

## Обработка колонки `runtime`

In [13]:
df_copy['runtime']

0              142 min
1              152 min
2              148 min
3              139 min
4              154 min
             ...      
4995           102 min
4996            94 min
4997    Drama, Romance
4998            92 min
4999           120 min
Name: runtime, Length: 5000, dtype: object

In [14]:
# Summing an object array of strings concatenates them, so this prints every
# distinct `runtime` value in one long string; it reveals entries that are not
# durations at all (genre text such as 'Comedy' appears among the values).
df_copy['runtime'].unique().sum()

'142 min152 min148 min139 min154 min178 min136 min201 min175 min164 min179 min127 min143 min155 min140 min165 min169 min121 min118 min162 min153 min151 min195 min113 min130 min124 min122 min194 min132 min202 min138 min119 min189 min180 min137 min106 min111 min116 min145 min110 min98 min107 min126 min100 min131 min133 min103 min108 min96 min99 min115 min88 min120 min81 min135 min146 min117 min104 min92 min91 min102 min149 min170 min112 min101 min141 min144 min114 min150 min90 min161 min128 min93 min129 min166 min147 min134 min109 min105 min125 min156 min123 min95 min157 min163 min158 min89 min187 min167 min84 min94 min85 min172 min160 min168 min86 min97 min87 min183 min188 min207 min159 min76 min229 min238 min83 min218 min77 min181 min191 min80 min212 min78 min79 min82 min74 min185 min75 min197 min69 min64 min224 min68 min210 min177 min193 min67 min174 min70 min220 min321 minComedy186 minAction, Crime, Mystery198 min58 minComedy, Drama, RomanceComedy, Romance192 min242 minHorror204 minC

In [15]:
import re

In [16]:
def clear_runtime(runtime):
    """Extract the first run of digits from a runtime value as an integer.

    The value is converted to ``str`` before searching, so non-string inputs
    are accepted as well.

    Returns
    -------
    int or str
        The first group of consecutive digits as ``int`` (``'142 min'`` -> ``142``),
        or an empty string ``''`` when the text contains no digits.
    """
    digit_runs = re.findall(r'\d+', str(runtime))
    if not digit_runs:
        # No digits at all: signal "no runtime" with an empty string
        return ''
    return int(digit_runs[0])

In [17]:
df_copy['runtime_clear'] = df_copy['runtime'].apply(clear_runtime)

In [18]:
df_copy['runtime_clear']

0       142
1       152
2       148
3       139
4       154
       ... 
4995    102
4996     94
4997       
4998     92
4999    120
Name: runtime_clear, Length: 5000, dtype: object

In [19]:
df_copy['runtime_clear'][4997]

''

In [20]:
type(df_copy['runtime_clear'][4997])

str

In [21]:
# Keep only the rows where a runtime could be parsed (rows whose runtime_clear is
# the empty string are dropped). The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning or depend on view-vs-copy semantics.
df_copy = df_copy[df_copy['runtime_clear'] != ''].copy()

In [22]:
# Cast the runtime_clear column to int
df_copy['runtime_clear'] = df_copy['runtime_clear'].astype(int)

In [23]:
df_copy.isna().sum()

poster             0
title              0
certificate        0
runtime            0
genre              0
rating             0
about             14
director           0
stars              0
votes              0
gross_earn       396
runtime_clear      0
dtype: int64

In [24]:
df_copy.shape

(4970, 12)

In [25]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4970 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   poster         4970 non-null   object 
 1   title          4970 non-null   object 
 2   certificate    4970 non-null   object 
 3   runtime        4970 non-null   object 
 4   genre          4970 non-null   object 
 5   rating         4970 non-null   float64
 6   about          4956 non-null   object 
 7   director       4970 non-null   object 
 8   stars          4970 non-null   object 
 9   votes          4970 non-null   object 
 10  gross_earn     4574 non-null   object 
 11  runtime_clear  4970 non-null   int64  
dtypes: float64(1), int64(1), object(10)
memory usage: 504.8+ KB


## Обработка колонки `gross_earn`

In [26]:
df_copy['gross_earn']

0        $28.34M
1       $534.86M
2       $292.58M
3        $37.03M
4       $107.93M
          ...   
4994         NaN
4995      $9.40M
4996         NaN
4998      $6.67M
4999      $8.20M
Name: gross_earn, Length: 4970, dtype: object

In [27]:
df_copy['gross_earn'].unique()

array(['$28.34M', '$534.86M', '$292.58M', ..., '$6.24M', '$23.00M',
       '$8.20M'], shape=(3367,), dtype=object)

In [28]:
# Strip everything except digits and the decimal point (e.g. '$28.34M' -> '28.34'),
# treating missing or non-numeric values as NaN
def clear_gross_earn(gross_earn):
    """Reduce a gross-earnings value to its numeric characters.

    Parameters
    ----------
    gross_earn : str or missing value
        Raw value such as ``'$28.34M'``. Other scalars are converted with
        ``str()`` before cleaning.

    Returns
    -------
    str or float
        The digits and dots of the input as a string (``'28.34'``), or
        ``np.nan`` when the input is missing or contains no digits.

    Notes
    -----
    The ``M`` suffix is removed without rescaling, so the number presumably
    stays in millions of dollars — confirm against the data source.
    """
    if pd.isna(gross_earn):
        return np.nan
    cleaned = re.sub(r'[^\d\.]', '', str(gross_earn))
    # A value without any digit (e.g. text from a misaligned row) would become ''
    # or bare dots and break the later cast to float, so report it as missing.
    if re.search(r'\d', cleaned) is None:
        return np.nan
    return cleaned


In [29]:
df_copy['gross_earn_clear'] = df_copy['gross_earn'].apply(clear_gross_earn)

In [30]:
df_copy['gross_earn_clear']

0        28.34
1       534.86
2       292.58
3        37.03
4       107.93
         ...  
4994       NaN
4995      9.40
4996       NaN
4998      6.67
4999      8.20
Name: gross_earn_clear, Length: 4970, dtype: object

In [31]:
df_copy['gross_earn_clear'] = df_copy['gross_earn_clear'].astype(float)

In [33]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4970 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   poster            4970 non-null   object 
 1   title             4970 non-null   object 
 2   certificate       4970 non-null   object 
 3   runtime           4970 non-null   object 
 4   genre             4970 non-null   object 
 5   rating            4970 non-null   float64
 6   about             4956 non-null   object 
 7   director          4970 non-null   object 
 8   stars             4970 non-null   object 
 9   votes             4970 non-null   object 
 10  gross_earn        4574 non-null   object 
 11  runtime_clear     4970 non-null   int64  
 12  gross_earn_clear  4574 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 543.6+ KB


# Сортировка по рейтингу

In [34]:
# Order the movies from highest to lowest rating; sort_values returns a new frame,
# so df_copy keeps its original order
sorted_dataset = df_copy.sort_values(by='rating', ascending=False)

In [35]:
sorted_dataset.head()

Unnamed: 0,poster,title,certificate,runtime,genre,rating,about,director,stars,votes,gross_earn,runtime_clear,gross_earn_clear
0,https://m.media-amazon.com/images/S/sash/4Fyxw...,The Shawshank Redemption,15,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,Frank Darabont,"('Tim Robbins',), ('Morgan Freeman',), ('Bob G...",2626905,$28.34M,142,28.34
9,https://m.media-amazon.com/images/S/sash/4Fyxw...,The Godfather,X,175 min,"Crime, Drama",9.2,The aging patriarch of an organized crime dyna...,Francis Ford Coppola,"('Marlon Brando',), ('Al Pacino',), ('James Ca...",1819641,$134.97M,175,134.97
3788,https://m.media-amazon.com/images/S/sash/4Fyxw...,Hababam Sinifi,12A,87 min,"Comedy, Drama",9.2,"Lazy, uneducated students share a very close b...",Ertem Egilmez,"('Kemal Sunal',), ('Münir Özkul',), ('Halit Ak...",40492,,87,
8,https://m.media-amazon.com/images/S/sash/4Fyxw...,The Lord of the Rings: The Return of the King,12A,201 min,"Action, Adventure, Drama",9.0,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,"('Elijah Wood',), ('Viggo Mortensen',), ('Ian ...",1801662,$377.85M,201,377.85
24,https://m.media-amazon.com/images/S/sash/4Fyxw...,Schindler's List,15,195 min,"Biography, Drama, History",9.0,"In German-occupied Poland during World War II,...",Steven Spielberg,"('Liam Neeson',), ('Ralph Fiennes',), ('Ben Ki...",1333407,$96.90M,195,96.9


In [41]:
# Replace the index left over from sorting with a fresh 0..n-1 range, discarding
# the old labels. Reassignment is used instead of inplace=True so the cell reads
# as a plain transformation.
sorted_dataset = sorted_dataset.reset_index(drop=True)
sorted_dataset.head()

Unnamed: 0,poster,title,certificate,runtime,genre,rating,about,director,stars,votes,gross_earn,runtime_clear,gross_earn_clear
0,https://m.media-amazon.com/images/S/sash/4Fyxw...,The Shawshank Redemption,15,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,Frank Darabont,"('Tim Robbins',), ('Morgan Freeman',), ('Bob G...",2626905,$28.34M,142,28.34
1,https://m.media-amazon.com/images/S/sash/4Fyxw...,The Godfather,X,175 min,"Crime, Drama",9.2,The aging patriarch of an organized crime dyna...,Francis Ford Coppola,"('Marlon Brando',), ('Al Pacino',), ('James Ca...",1819641,$134.97M,175,134.97
2,https://m.media-amazon.com/images/S/sash/4Fyxw...,Hababam Sinifi,12A,87 min,"Comedy, Drama",9.2,"Lazy, uneducated students share a very close b...",Ertem Egilmez,"('Kemal Sunal',), ('Münir Özkul',), ('Halit Ak...",40492,,87,
3,https://m.media-amazon.com/images/S/sash/4Fyxw...,The Lord of the Rings: The Return of the King,12A,201 min,"Action, Adventure, Drama",9.0,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,"('Elijah Wood',), ('Viggo Mortensen',), ('Ian ...",1801662,$377.85M,201,377.85
4,https://m.media-amazon.com/images/S/sash/4Fyxw...,Schindler's List,15,195 min,"Biography, Drama, History",9.0,"In German-occupied Poland during World War II,...",Steven Spielberg,"('Liam Neeson',), ('Ralph Fiennes',), ('Ben Ki...",1333407,$96.90M,195,96.9


In [40]:
sorted_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4970 entries, 0 to 4969
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   poster            4970 non-null   object 
 1   title             4970 non-null   object 
 2   certificate       4970 non-null   object 
 3   runtime           4970 non-null   object 
 4   genre             4970 non-null   object 
 5   rating            4970 non-null   float64
 6   about             4956 non-null   object 
 7   director          4970 non-null   object 
 8   stars             4970 non-null   object 
 9   votes             4970 non-null   object 
 10  gross_earn        4574 non-null   object 
 11  runtime_clear     4970 non-null   int64  
 12  gross_earn_clear  4574 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 504.9+ KB


In [42]:
# Save the cleaned, rating-sorted table to 'movies_clean.csv' without writing the index column
sorted_dataset.to_csv('movies_clean.csv', index=False)