In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Read the two source CSVs; both live under ../data relative to the notebook.
imdb_top_1000 = pd.read_csv(filepath_or_buffer='../data/imdb_top_1000.csv')
movies_data = pd.read_csv(filepath_or_buffer='../data/movies_data.csv')

# Preview the head of each frame; the cell output is a tuple of both previews.
tuple(frame.head() for frame in (imdb_top_1000, movies_data))

(                                         Poster_Link  \
 0  https://m.media-amazon.com/images/M/MV5BMDFkYT...   
 1  https://m.media-amazon.com/images/M/MV5BM2MyNj...   
 2  https://m.media-amazon.com/images/M/MV5BMTMxNT...   
 3  https://m.media-amazon.com/images/M/MV5BMWMwMG...   
 4  https://m.media-amazon.com/images/M/MV5BMWU4N2...   
 
                Series_Title Released_Year Certificate  Runtime  \
 0  The Shawshank Redemption          1994           A  142 min   
 1             The Godfather          1972           A  175 min   
 2           The Dark Knight          2008          UA  152 min   
 3    The Godfather: Part II          1974           A  202 min   
 4              12 Angry Men          1957           U   96 min   
 
                   Genre  IMDB_Rating  \
 0                 Drama          9.3   
 1          Crime, Drama          9.2   
 2  Action, Crime, Drama          9.0   
 3          Crime, Drama          9.0   
 4          Crime, Drama          9.0   
 
    

In [4]:
# Report dtypes and non-null counts for both frames.
# DataFrame.info() prints its report and returns None, so each call is its own
# statement: bundling them into a tuple expression would make the cell echo a
# meaningless "(None, None)" after the two reports.
imdb_top_1000.info()
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB
<class 'pandas.core.frame.DataFrame'>

(None, None)

In [5]:
def _digits_to_float(column):
    """Return ``column`` as floats, keeping the first run of digits per value.

    Text values such as ``'142 min'`` become ``142.0``; values without any
    digits (and missing values) become NaN. A column that is already numeric
    is only cast to float, so re-running this cell does not fail on ``.str``.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)
    # Raw string avoids the invalid-escape warning for ``\d``; expand=False
    # makes extract() return a Series rather than a one-column DataFrame.
    return column.str.extract(r'(\d+)', expand=False).astype(float)


def _strip_commas_to_float(column):
    """Return ``column`` as floats after removing thousands separators.

    A column that is already numeric is only cast to float, which keeps the
    cell safe to re-run.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)
    # regex=False: the comma is a literal character, not a pattern.
    return column.str.replace(',', '', regex=False).astype(float)


# Convert 'Released_Year' to a numeric (float) column; float rather than int
# because values with no digits turn into NaN.
imdb_top_1000['Released_Year'] = _digits_to_float(imdb_top_1000['Released_Year'])
movies_data['Released_Year'] = _digits_to_float(movies_data['Released_Year'])

# Convert 'Runtime' (e.g. '142 min') to a numeric (float) column
imdb_top_1000['Runtime'] = _digits_to_float(imdb_top_1000['Runtime'])

# Convert 'Gross' to numeric, removing commas
imdb_top_1000['Gross'] = _strip_commas_to_float(imdb_top_1000['Gross'])
movies_data['Gross'] = movies_data['Gross'].astype(float)

# Fill missing values in 'Meta_score' with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and leaves the
# frame unchanged when pandas Copy-on-Write is active.
imdb_top_1000['Meta_score'] = imdb_top_1000['Meta_score'].fillna(
    imdb_top_1000['Meta_score'].mean()
)
movies_data['Meta_score'] = movies_data['Meta_score'].fillna(
    movies_data['Meta_score'].mean()
)

# Check the cleaned data
imdb_top_1000.head(), movies_data.head()

(                                         Poster_Link  \
 0  https://m.media-amazon.com/images/M/MV5BMDFkYT...   
 1  https://m.media-amazon.com/images/M/MV5BM2MyNj...   
 2  https://m.media-amazon.com/images/M/MV5BMTMxNT...   
 3  https://m.media-amazon.com/images/M/MV5BMWMwMG...   
 4  https://m.media-amazon.com/images/M/MV5BMWU4N2...   
 
                Series_Title  Released_Year Certificate  Runtime  \
 0  The Shawshank Redemption         1994.0           A    142.0   
 1             The Godfather         1972.0           A    175.0   
 2           The Dark Knight         2008.0          UA    152.0   
 3    The Godfather: Part II         1974.0           A    202.0   
 4              12 Angry Men         1957.0           U     96.0   
 
                   Genre  IMDB_Rating  \
 0                 Drama          9.3   
 1          Crime, Drama          9.2   
 2  Action, Crime, Drama          9.0   
 3          Crime, Drama          9.0   
 4          Crime, Drama          9.0   


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Assemble the feature matrix and the target; NaNs still present in the
# selected feature columns are replaced with 0.
features = ['Released_Year', 'Runtime', 'Meta_score', 'No_of_Votes']
X = imdb_top_1000.loc[:, features].fillna(value=0)
y = imdb_top_1000.loc[:, 'IMDB_Rating']

# Hold out 20% of the rows for evaluation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training split.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X=X_train, y=y_train)

# Score the held-out rows; the cell displays the root mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
rmse

np.float64(0.21005631387796958)

In [8]:
# save model

import pickle
import pathlib

# Target file for the pickled model; create its folder (and any missing
# parents) first, without complaining if it already exists.
model_path = pathlib.Path('../model/v1.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)

# Serialize `model` to the target file in binary mode.
with open(model_path, mode='wb') as f:
    pickle.dump(model, f)