## Data Loading

In [1]:
# Import nessecarry dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Filter the warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Set the data path

DATA_PATH = "./data/movies_data.csv"

In [3]:
# Read the dataset

data = pd.read_csv(DATA_PATH, encoding="latin-1")
data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [4]:
# No. of rows and columns

data.shape

(15509, 10)

In [5]:
# Some information about columns

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [6]:
# No of null values in each column

data.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [7]:
# Calculate and display the percentage of missing values in each column of the dataset.

null_counts = data.isnull().sum()
total_counts = len(data)
null_percentage = (null_counts / total_counts) * 100

for column, count, percentage in zip(null_counts.index, null_counts.values, null_percentage.values):
    print(f'{column}: {percentage:.2f}%')

Name: 0.00%
Year: 3.40%
Duration: 53.32%
Genre: 12.10%
Rating: 48.94%
Votes: 48.93%
Director: 3.39%
Actor 1: 10.43%
Actor 2: 15.37%
Actor 3: 20.27%


In [8]:
data.duplicated().sum()

6

## Data Cleaning

Let's first drop duplicates.

In [9]:
data.drop_duplicates(inplace=True)

In [10]:
data.duplicated().sum()

0

Now let's analyze and clean each vairable one-by-one.

In [11]:
data.Name

0                                          
1        #Gadhvi (He thought he was Gandhi)
2                               #Homecoming
3                                   #Yaaram
4                         ...And Once Again
                        ...                
15504                   Zulm Ko Jala Doonga
15505                                 Zulmi
15506                             Zulmi Raj
15507                         Zulmi Shikari
15508                          Zulm-O-Sitam
Name: Name, Length: 15503, dtype: object

In [12]:
len(data.Name.unique())

13838

It's unique for each movie, so we'll drop it.

In [13]:
cleaned_data = data.drop("Name", axis=1)
cleaned_data.head()

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [14]:
cleaned_data.Duration

0            NaN
1        109 min
2         90 min
3        110 min
4        105 min
          ...   
15504        NaN
15505    129 min
15506        NaN
15507        NaN
15508    130 min
Name: Duration, Length: 15503, dtype: object

In [15]:
# Extract the numeric part and convert to int

cleaned_data['Duration_Min'] = cleaned_data['Duration'].str.extract('(\d+)').astype(float)

In [16]:
cleaned_data.Duration_Min

0          NaN
1        109.0
2         90.0
3        110.0
4        105.0
         ...  
15504      NaN
15505    129.0
15506      NaN
15507      NaN
15508    130.0
Name: Duration_Min, Length: 15503, dtype: float64

Let's fill the null value in the `Duration_Min` column with the average value.

In [17]:
cleaned_data.Duration_Min.mean()

128.1279182207487

In [18]:
cleaned_data.Duration_Min.fillna(np.round(cleaned_data.Duration_Min.mean()), inplace=True)

In [19]:
cleaned_data.drop("Duration", axis=1, inplace=True)

In [20]:
cleaned_data.head()

Unnamed: 0,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Duration_Min
0,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia,128.0
1,(2019),Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,109.0
2,(2021),"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana,90.0
3,(2019),"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,110.0
4,(2010),Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali,105.0


In [21]:
data.Year

0           NaN
1        (2019)
2        (2021)
3        (2019)
4        (2010)
          ...  
15504    (1988)
15505    (1999)
15506    (2005)
15507    (1988)
15508    (1998)
Name: Year, Length: 15503, dtype: object

Since the `Year` column has only 3% missing values, we can drop them.

In [22]:
cleaned_data.dropna(subset=["Year"], inplace=True)

In [23]:
# Remove parentheses and convert to integer
cleaned_data['Year'] = cleaned_data['Year'].str.replace('(', '').str.replace(')', '').astype(int)

In [24]:
cleaned_data.head()

Unnamed: 0,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Duration_Min
1,2019,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,109.0
2,2021,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana,90.0
3,2019,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,110.0
4,2010,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali,105.0
5,1997,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,147.0


In [25]:
cleaned_data.Year.unique()

array([2019, 2021, 2010, 1997, 2005, 2008, 2012, 2014, 2004, 2016, 1991,
       1990, 2018, 1987, 1948, 1958, 2017, 2020, 2009, 2002, 1993, 1946,
       1994, 2007, 2013, 2003, 1998, 1979, 1951, 1956, 1974, 2015, 2006,
       1981, 1985, 2011, 2001, 1967, 1988, 1995, 1959, 1996, 1970, 1976,
       2000, 1999, 1973, 1968, 1943, 1953, 1986, 1983, 1989, 1982, 1977,
       1957, 1950, 1992, 1969, 1975, 1947, 1972, 1971, 1935, 1978, 1960,
       1944, 1963, 1940, 1984, 1934, 1955, 1936, 1980, 1966, 1949, 1962,
       1964, 1952, 1933, 1942, 1939, 1954, 1945, 1961, 1965, 1938, 1941,
       1931, 1937, 2022, 1932, 1923, 1915, 1928, 1922, 1917, 1913, 1930,
       1926, 1914, 1924])

In [26]:
cleaned_data.Genre.value_counts()

Drama                         2646
Action                        1264
Thriller                       732
Romance                        689
Drama, Romance                 517
                              ... 
Action, Musical, War             1
Action, Fantasy, Horror          1
Horror, Crime, Thriller          1
Crime, Horror, Thriller          1
Adventure, Fantasy, Sci-Fi       1
Name: Genre, Length: 474, dtype: int64

In [27]:
# Split the 'Genre' column into multiple columns
cleaned_data[['First_Genre', 'Second_Genre', 'Third_Genre']] = cleaned_data['Genre'].str.split(', ', expand=True)

In [28]:
cleaned_data.head()

Unnamed: 0,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Duration_Min,First_Genre,Second_Genre,Third_Genre
1,2019,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,109.0,Drama,,
2,2021,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana,90.0,Drama,Musical,
3,2019,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,110.0,Comedy,Romance,
4,2010,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali,105.0,Drama,,
5,1997,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,147.0,Comedy,Drama,Musical


In [29]:
cleaned_data.First_Genre.value_counts()

Drama          4356
Action         3424
Comedy         1494
Romance         741
Thriller        739
Crime           434
Horror          373
Documentary     370
Adventure       239
Fantasy         189
Musical         161
Family          152
Biography       141
Mystery         138
Animation       119
History          26
Music            15
Sport             9
Sci-Fi            9
War               5
Reality-TV        2
Name: First_Genre, dtype: int64

In [30]:
cleaned_data[['Second_Genre', 'Third_Genre']] = cleaned_data[['Second_Genre', 'Third_Genre']].fillna("Empty")

In [31]:
cleaned_data.head()

Unnamed: 0,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Duration_Min,First_Genre,Second_Genre,Third_Genre
1,2019,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,109.0,Drama,Empty,Empty
2,2021,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana,90.0,Drama,Musical,Empty
3,2019,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,110.0,Comedy,Romance,Empty
4,2010,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali,105.0,Drama,Empty,Empty
5,1997,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,147.0,Comedy,Drama,Musical


In [32]:
cleaned_data.Third_Genre.value_counts()

Empty        11815
Drama          827
Romance        750
Thriller       482
Musical        188
Mystery        170
Family         162
Crime          140
Fantasy        123
History         81
Comedy          57
Sport           36
Music           34
Horror          26
Sci-Fi          26
War             24
Action          15
Adventure       13
News             3
Western          3
Biography        1
Name: Third_Genre, dtype: int64

In [33]:
cleaned_data.First_Genre.isnull().sum()

1840

In [34]:
cleaned_data.Second_Genre.isnull().sum()

0

In [35]:
cleaned_data.Third_Genre.isnull().sum()

0

In [36]:
cleaned_data.dropna(subset=['First_Genre'], inplace=True)

In [37]:

cleaned_data.First_Genre.isnull().sum()

0

In [38]:
cleaned_data.drop(["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"], axis=1, inplace=True)

In [39]:
cleaned_data.head()

Unnamed: 0,Year,Rating,Votes,Duration_Min,First_Genre,Second_Genre,Third_Genre
1,2019,7.0,8.0,109.0,Drama,Empty,Empty
2,2021,,,90.0,Drama,Musical,Empty
3,2019,4.4,35.0,110.0,Comedy,Romance,Empty
4,2010,,,105.0,Drama,Empty,Empty
5,1997,4.7,827.0,147.0,Comedy,Drama,Musical


In [40]:
cleaned_data.Rating.unique()

array([ 7. ,  nan,  4.4,  4.7,  7.4,  5.6,  4. ,  6.2,  5.9,  6.5,  5.7,
        6.3,  7.2,  6.6,  7.3,  7.1,  6.9,  3.5,  5. ,  4.5,  6.4,  4.1,
        4.8,  8.1,  5.5,  6.8,  6.1,  7.7,  5.1,  3.1,  3.3,  7.8,  8.4,
        5.2,  4.3,  5.8,  4.6,  7.5,  6.7,  3.6,  3.9,  4.2,  5.3,  3.4,
        5.4,  3. ,  8. ,  6. ,  3.8,  7.9,  2.7,  4.9,  2.4,  7.6,  3.7,
        3.2,  2.5,  2.8,  2.6,  2.9,  8.2,  8.7,  8.3,  9.3,  8.8,  2.1,
        2.3,  8.5,  8.6,  9. ,  9.6,  1.7,  9.1,  2. ,  1.4,  8.9,  1.9,
        9.4,  9.7,  1.8,  9.2,  1.6, 10. ,  2.2,  1.1])

In [41]:
cleaned_data.isnull().sum()

Year               0
Rating          5319
Votes           5318
Duration_Min       0
First_Genre        0
Second_Genre       0
Third_Genre        0
dtype: int64

In [42]:
cleaned_data.shape

(13136, 7)

In [43]:
print(cleaned_data.Rating.mean())
print(cleaned_data.Rating.median())

5.839567609057182
6.0


In [44]:
cleaned_data.dropna(subset=['Rating', 'Votes'], inplace=True)

In [45]:
cleaned_data.isnull().sum()

Year            0
Rating          0
Votes           0
Duration_Min    0
First_Genre     0
Second_Genre    0
Third_Genre     0
dtype: int64

In [46]:
cleaned_data.shape

(7817, 7)

In [47]:
cleaned_data.head()

Unnamed: 0,Year,Rating,Votes,Duration_Min,First_Genre,Second_Genre,Third_Genre
1,2019,7.0,8,109.0,Drama,Empty,Empty
3,2019,4.4,35,110.0,Comedy,Romance,Empty
5,1997,4.7,827,147.0,Comedy,Drama,Musical
6,2005,7.4,1086,142.0,Drama,Romance,War
8,2012,5.6,326,82.0,Horror,Mystery,Thriller


In [48]:
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7817 entries, 1 to 15508
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Year          7817 non-null   int32  
 1   Rating        7817 non-null   float64
 2   Votes         7817 non-null   object 
 3   Duration_Min  7817 non-null   float64
 4   First_Genre   7817 non-null   object 
 5   Second_Genre  7817 non-null   object 
 6   Third_Genre   7817 non-null   object 
dtypes: float64(2), int32(1), object(4)
memory usage: 458.0+ KB


In [49]:
cleaned_data.Votes.dtype

dtype('O')

In [50]:
# Remove commas and convert the 'Votes' column to int
cleaned_data['Votes'] = cleaned_data['Votes'].str.replace(',', '').astype(int)

In [51]:
cleaned_data.duplicated().sum()

7

In [52]:
cleaned_data.drop_duplicates(inplace=True)

## Auto EDA

In [53]:
profile = ProfileReport(cleaned_data, title="IMDb Indian Movies Rating", explorative=True, dark_mode=True, minimal=False)

In [54]:
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

## Data Preprocessing

In [55]:
cleaned_data.head()

Unnamed: 0,Year,Rating,Votes,Duration_Min,First_Genre,Second_Genre,Third_Genre
1,2019,7.0,8,109.0,Drama,Empty,Empty
3,2019,4.4,35,110.0,Comedy,Romance,Empty
5,1997,4.7,827,147.0,Comedy,Drama,Musical
6,2005,7.4,1086,142.0,Drama,Romance,War
8,2012,5.6,326,82.0,Horror,Mystery,Thriller


In [56]:
cleaned_data.Year.min()

1917

In [57]:
cleaned_data.Year.max()

2021

In [58]:
transformed_data = cleaned_data.copy()

In [59]:
scaler = StandardScaler()
transformed_data['Year'] = scaler.fit_transform(transformed_data[['Year']])

In [60]:
transformed_data.Year

1        1.259664
3        1.259664
5        0.159538
6        0.559584
8        0.909624
           ...   
15501   -0.090490
15503   -0.240508
15504   -0.290513
15505    0.259550
15508    0.209544
Name: Year, Length: 7810, dtype: float64

In [61]:
print(transformed_data.Rating.min())
print(transformed_data.Rating.max())

1.1
10.0


In [62]:
transformed_data['Votes'] = scaler.fit_transform(transformed_data[['Votes']])

In [63]:
transformed_data.Votes

1       -0.167584
3       -0.165272
5       -0.097465
6       -0.075290
8       -0.140358
           ...   
15501   -0.156711
15503   -0.164502
15504   -0.167327
15505   -0.112190
15508   -0.166556
Name: Votes, Length: 7810, dtype: float64

In [64]:
transformed_data.head()

Unnamed: 0,Year,Rating,Votes,Duration_Min,First_Genre,Second_Genre,Third_Genre
1,1.259664,7.0,-0.167584,109.0,Drama,Empty,Empty
3,1.259664,4.4,-0.165272,110.0,Comedy,Romance,Empty
5,0.159538,4.7,-0.097465,147.0,Comedy,Drama,Musical
6,0.559584,7.4,-0.07529,142.0,Drama,Romance,War
8,0.909624,5.6,-0.140358,82.0,Horror,Mystery,Thriller


In [65]:
# Combine all unique genres from the three columns
unique_genres = set(transformed_data['First_Genre'].unique()) | set(transformed_data['Second_Genre'].unique()) | set(transformed_data['Third_Genre'].unique())

# Create a mapping dictionary that includes all unique genres
genre_mapping = {genre: i for i, genre in enumerate(unique_genres)}

# Apply the mapping to all three columns
transformed_data['First_Genre'] = transformed_data['First_Genre'].map(genre_mapping)
transformed_data['Second_Genre'] = transformed_data['Second_Genre'].map(genre_mapping)
transformed_data['Third_Genre'] = transformed_data['Third_Genre'].map(genre_mapping)


In [66]:
transformed_data.head()

Unnamed: 0,Year,Rating,Votes,Duration_Min,First_Genre,Second_Genre,Third_Genre
1,1.259664,7.0,-0.167584,109.0,16,14,14
3,1.259664,4.4,-0.165272,110.0,21,6,14
5,0.159538,4.7,-0.097465,147.0,21,16,13
6,0.559584,7.4,-0.07529,142.0,16,6,9
8,0.909624,5.6,-0.140358,82.0,17,12,20


In [67]:
X = transformed_data.drop("Rating", axis=1)
y = transformed_data["Rating"]

In [68]:
X

Unnamed: 0,Year,Votes,Duration_Min,First_Genre,Second_Genre,Third_Genre
1,1.259664,-0.167584,109.0,16,14,14
3,1.259664,-0.165272,110.0,21,6,14
5,0.159538,-0.097465,147.0,21,16,13
6,0.559584,-0.075290,142.0,16,6,9
8,0.909624,-0.140358,82.0,17,12,20
...,...,...,...,...,...,...
15501,-0.090490,-0.156711,128.0,10,5,16
15503,-0.240508,-0.164502,125.0,10,5,16
15504,-0.290513,-0.167327,128.0,10,14,14
15505,0.259550,-0.112190,129.0,10,16,14


In [69]:
y

1        7.0
3        4.4
5        4.7
6        7.4
8        5.6
        ... 
15501    5.3
15503    5.8
15504    4.6
15505    4.5
15508    6.2
Name: Rating, Length: 7810, dtype: float64

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [71]:
# # Initialize the Decision Tree Regressor model
# model = DecisionTreeRegressor(random_state=42)

# # Train the model on the training data
# model.fit(X_train, y_train)

# # Make predictions on the test data
# y_pred = model.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
# r2 = r2_score(y_test, y_pred)            # R-squared

# print("Mean Squared Error:", mse)
# print("R-squared:", r2)

In [72]:
# Create a list of regression models to evaluate
models = [
    ("Linear Regression", LinearRegression()),
    ("Ridge Regression", Ridge()),
    ("Lasso Regression", Lasso()),
    ("Decision Tree Regressor", DecisionTreeRegressor()),
    ("Random Forest Regressor", RandomForestRegressor())
]

# Evaluate each model using cross-validation and print the results
for name, model in models:
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_mean_squared_error")
    rmse_scores = np.sqrt(-scores)
    print(f"{name}:")
    print(f"Mean RMSE: {rmse_scores.mean()}")
    print(f"Standard Deviation of RMSE: {rmse_scores.std()}")
    print()

# Select the best model based on cross-validation results
best_model_name = min(models, key=lambda x: np.sqrt(-cross_val_score(x[1], X_train, y_train, cv=5, scoring="neg_mean_squared_error")).mean())[0]

# Fit the best model on the training data
best_model = [model[1] for model in models if model[0] == best_model_name][0]
best_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Best Model: {best_model_name}")
print(f"Test RMSE: {test_rmse}")

Linear Regression:
Mean RMSE: 1.3484961620325961
Standard Deviation of RMSE: 0.031667951292980855

Ridge Regression:
Mean RMSE: 1.3484875793539237
Standard Deviation of RMSE: 0.031670634484959166

Lasso Regression:
Mean RMSE: 1.3829863269771512
Standard Deviation of RMSE: 0.044695785252681565

Decision Tree Regressor:
Mean RMSE: 1.6104586726914878
Standard Deviation of RMSE: 0.01943882302626807

Random Forest Regressor:
Mean RMSE: 1.170658864033514
Standard Deviation of RMSE: 0.022456109320973305

Best Model: Random Forest Regressor
Test RMSE: 1.1155915778940833


In [73]:
# Define the hyperparameters and their possible values to search
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Create a Random Forest Regressor
rf_model = RandomForestRegressor()

# Create GridSearchCV with the model and parameter grid
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and corresponding RMSE
best_params = grid_search.best_params_
best_rmse = np.sqrt(-grid_search.best_score_)
print("Best Hyperparameters:", best_params)
print("Best RMSE:", best_rmse)

# Get the best model with the tuned hyperparameters
best_rf_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_rf_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE with Best Model: {test_rmse}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Best RMSE: 1.1432507075238962
Test RMSE with Best Model: 1.110721843482234


In [74]:
import pickle

# Save the trained Random Forest model to a file
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)