In [13]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [22]:

# Замените 'movies.csv' на путь к вашему датасету
df = pd.read_csv('movies.csv')


# Оставляем только нужные колонки
df = df[['genre', 'director', 'star', 'budget', 'runtime', 'score']]
df.head()

Unnamed: 0,genre,director,star,budget,runtime,score
0,Drama,Stanley Kubrick,Jack Nicholson,19000000.0,146.0,8.4
1,Adventure,Randal Kleiser,Brooke Shields,4500000.0,104.0,5.8
2,Action,Irvin Kershner,Mark Hamill,18000000.0,124.0,8.7
3,Comedy,Jim Abrahams,Robert Hays,3500000.0,88.0,7.7
4,Comedy,Harold Ramis,Chevy Chase,6000000.0,98.0,7.3


In [15]:
# смотрим количество пропусков
df.isnull().sum()

Unnamed: 0,0
genre,0
director,0
star,1
budget,2171
runtime,4
score,3


In [16]:
# Заменим пропуски на медиану и удалим другие пропуски
df["budget"].fillna(df["budget"].median(), inplace=True)
df = df.dropna()
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["budget"].fillna(df["budget"].median(), inplace=True)


Unnamed: 0,0
genre,0
director,0
star,0
budget,0
runtime,0
score,0


In [23]:
# Определим топ-15 актеров и режиссеров
N = 15
top_directors = df['director'].value_counts().nlargest(N).index.tolist()
top_actors = df['star'].value_counts().nlargest(N).index.tolist()
print(top_directors)
print(top_actors)
# Заменим редких на 'other'
df['director'] = df['director'].apply(lambda x: x if x in top_directors else 'other')
df['star'] = df['star'].apply(lambda x: x if x in top_actors else 'other')
df.head()

['Woody Allen', 'Clint Eastwood', 'Directors', 'Steven Spielberg', 'Ron Howard', 'Ridley Scott', 'Steven Soderbergh', 'Joel Schumacher', 'Barry Levinson', 'Martin Scorsese', 'Tim Burton', 'Garry Marshall', 'Oliver Stone', 'Richard Linklater', 'Spike Lee']
['Nicolas Cage', 'Robert De Niro', 'Tom Hanks', 'Denzel Washington', 'Bruce Willis', 'Tom Cruise', 'Johnny Depp', 'Sylvester Stallone', 'John Travolta', 'Mel Gibson', 'Kevin Costner', 'Steve Martin', 'Adam Sandler', 'Jeff Bridges', 'Eddie Murphy']


Unnamed: 0,genre,director,star,budget,runtime,score
0,Drama,other,other,19000000.0,146.0,8.4
1,Adventure,other,other,4500000.0,104.0,5.8
2,Action,other,other,18000000.0,124.0,8.7
3,Comedy,other,other,3500000.0,88.0,7.7
4,Comedy,other,other,6000000.0,98.0,7.3


In [18]:

# Логарифмируем бюджет (с добавлением 1 чтобы избежать log(0))
df['budget'] = np.log1p(df['budget'])

# Определим признаки и целевую переменную
features = ['genre', 'director', 'star', 'budget', 'runtime']
X = df[features]
y = df['score']


# Убедимся, что нет NaN
df = df.dropna(subset=['budget', 'runtime', 'genre', 'director', 'star', 'score'])

In [19]:

# Категориальные и числовые признаки
categorical = ['genre', 'director', 'star']
numerical = ['budget', 'runtime']

# Препроцессинг
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numerical)
])

# Модель
model = Pipeline([
    ('preprocessing', preprocessor),
    ('regression', LinearRegression())
])

# Разделим выборку
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обучим модель
model.fit(X_train, y_train)

# Предсказания и оценка
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.4f}")
print("RMSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))


MAE: 0.6539


In [26]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Категориальные и числовые признаки
categorical = ['genre', 'director', 'star']
numerical = ['budget', 'runtime']

# Препроцессинг
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numerical)
])

# Разделим выборку
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Список моделей для сравнения
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Support Vector Regression', SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1))
]

# Обучим и оценим каждую модель
results = []
for name, model in models:
    # Создаем pipeline
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regression', model)
    ])

    # Обучение модели
    pipeline.fit(X_train, y_train)

    # Предсказания
    y_pred = pipeline.predict(X_test)

    # Оценка качества
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Сохраняем результаты
    results.append({
        'Model': name,
        'MAE': mae,
        'RMSE': rmse,
        'R²': r2
    })

# Создаем DataFrame с результатами
results_df = pd.DataFrame(results)
print(results_df.sort_values(by='RMSE'))

                       Model       MAE      RMSE        R²
0          Linear Regression  0.653887  0.853067  0.263021
2  Support Vector Regression  0.667426  0.875242  0.224207
1              Random Forest  0.691524  0.906501  0.167804


Берем Линейную регрессию как лучшую модель

In [10]:
from joblib import dump

# Сохранение модели
dump(model, 'moveies_model.pkl')  # или 'model.pkl'

print("Модель сохранена как 'movies_model.pkl'")

Модель сохранена как 'movies_model.pkl'


In [11]:
# Обучение скаллера и one_hot_encoder`а
import joblib

# Загрузка данных
df = pd.read_csv('movies.csv')
df = df[['genre', 'director', 'star', 'budget', 'runtime', 'score']]

# Предобработка данных (как в вашем ноутбуке)
df["budget"].fillna(df["budget"].median(), inplace=True)
df = df.dropna()

# Логарифмирование бюджета
df['budget'] = np.log1p(df['budget'])

# Обработка категориальных признаков
top_directors = ['Woody Allen', 'Clint Eastwood', 'Steven Spielberg', 'Joel Schumacher',
                'Ron Howard', 'Barry Levinson', 'Steven Soderbergh', 'Sidney Lumet',
                'Ridley Scott', 'Oliver Stone', 'Tony Scott', 'Wes Craven',
                'Garry Marshall', 'Martin Scorsese']
top_actors = ['Nicolas Cage', 'Robert De Niro', 'Tom Hanks', 'Bruce Willis',
             'John Travolta', 'Denzel Washington', 'Steve Martin', 'Mel Gibson',
             'Sylvester Stallone', 'Robin Williams', 'Tom Cruise', 'Johnny Depp',
             'Eddie Murphy', 'Jeff Bridges', 'Arnold Schwarzenegger']

df['director'] = df['director'].apply(lambda x: x if x in top_directors else 'other')
df['star'] = df['star'].apply(lambda x: x if x in top_actors else 'other')

# Определение категориальных и числовых признаков
categorical = ['genre', 'director', 'star']
numerical = ['budget', 'runtime']

# Создание и обучение OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(df[categorical])

# Создание и обучение StandardScaler
scaler = StandardScaler()
scaler.fit(df[numerical])

# Сохранение кодировщиков
joblib.dump(ohe, 'one_hot_encoder.joblib')
joblib.dump(scaler, 'scaler.joblib')

print("Кодировщики успешно созданы и сохранены!")

Кодировщики успешно созданы и сохранены!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["budget"].fillna(df["budget"].median(), inplace=True)
