<a href="https://colab.research.google.com/github/kaviya2478/Rotten_Tomatoes_Prediction_Model/blob/main/Rotten_Tomatoes_Movies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Loading Data
def load_data(file_path):
    df = pd.read_excel(file_path)
    print(df.info())  # To inspect data
    return df

# Preprocessing the Data
def preprocess_data(df):
    # Droping columns that are not necessary for prediction
    df = df.drop(columns=['movie_title', 'movie_info', 'critics_consensus'], axis=1)

    # Filling the missing values
    df['runtime_in_minutes'].fillna(df['runtime_in_minutes'].median(), inplace=True)
    df['studio_name'].fillna('Unknown', inplace=True)
    df = df.dropna(subset=['audience_rating'])

    # Converting datetime columns to numeric values
    reference_date = pd.to_datetime("2020-01-01")
    df['in_theaters_date'] = (df['in_theaters_date'] - reference_date).dt.days
    df['on_streaming_date'] = (df['on_streaming_date'] - reference_date).dt.days

    # Encoding categorical features
    categorical_cols = ['rating', 'genre', 'directors', 'writers', 'cast', 'studio_name', 'tomatometer_status']
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

    # Defining features (X) and target (y)
    X = df.drop(columns=['audience_rating'])
    y = df['audience_rating']

    return X, y, label_encoders

# Train-Test Split
def train_test_split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

# Training Random Forest Model
def train_random_forest(X_train, y_train, X_test, y_test):
    print("\nTraining Random Forest Model...")
    model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"Random Forest RMSE: {rmse:.2f}")
    print(f"Random Forest R2 Score: {r2:.2f}")
    return model

# Hyperparameter Tuning with Grid Search
def tune_hyperparameters(X_train, y_train):
    print("\nTuning Hyperparameters...")
    param_grid = {
        'n_estimators': [50, 100],  # Reduced grid size for faster execution
        'max_depth': [10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                               cv=3, n_jobs=-1, verbose=2,
                               scoring='neg_root_mean_squared_error')
    grid_search.fit(X_train, y_train)
    print(f"Best Parameters: {grid_search.best_params_}")
    return grid_search.best_estimator_

# Saving the Model
def save_model(model, file_name):
    joblib.dump(model, file_name)
    print(f"Model saved as '{file_name}'.")

# Loading the Model
def load_model(file_name):
    return joblib.load(file_name)

# Main function
def main():
    # Step 1: Loading Data
    file_path = "Rotten_Tomatoes_Movies.xls"
    df = load_data(file_path)

    # Step 2: Preprocessing Data
    X, y, _ = preprocess_data(df)

    # Step 3: Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split_data(X, y)

    # Step 4: Training Random Forest Model
    base_model = train_random_forest(X_train, y_train, X_test, y_test)

    # Step 5: Hyperparameter Tuning
    optimized_model = tune_hyperparameters(X_train, y_train)

    # Step 6: Saving the Optimized Model
    save_model(optimized_model, 'optimized_random_forest.pkl')

    # Step 7: Loading and Using the Model
    loaded_model = load_model('optimized_random_forest.pkl')
    example_data = pd.DataFrame({
        'rating': [1],
        'genre': [3],
        'runtime_in_minutes': [120],
        'studio_name': [5],
        'tomatometer_status': [2],
        'tomatometer_rating': [80],
        'tomatometer_count': [200],
        'in_theaters_date': [730],
        'on_streaming_date': [1095],
        'cast': [0],
        'directors': [0],
        'writers': [0]
    })

    # Ensuring all columns match model's training columns
    trained_features = loaded_model.feature_names_in_
    missing_cols = set(trained_features) - set(example_data.columns)
    for col in missing_cols:
      example_data[col] = 0

    # Reordering the columns to match the training data
    example_data = example_data[trained_features]


    prediction = loaded_model.predict(example_data)
    print(f"Predicted Audience Rating: {prediction[0]:.2f}")

if __name__ == "__main__":
    main()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16638 entries, 0 to 16637
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   movie_title         16638 non-null  object        
 1   movie_info          16614 non-null  object        
 2   critics_consensus   8309 non-null   object        
 3   rating              16638 non-null  object        
 4   genre               16621 non-null  object        
 5   directors           16524 non-null  object        
 6   writers             15289 non-null  object        
 7   cast                16354 non-null  object        
 8   in_theaters_date    15823 non-null  datetime64[ns]
 9   on_streaming_date   16636 non-null  datetime64[ns]
 10  runtime_in_minutes  16483 non-null  float64       
 11  studio_name         16222 non-null  object        
 12  tomatometer_status  16638 non-null  object        
 13  tomatometer_rating  16638 non-null  int64     

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['runtime_in_minutes'].fillna(df['runtime_in_minutes'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['studio_name'].fillna('Unknown', inplace=True)



Training Random Forest Model...
Random Forest RMSE: 13.77
Random Forest R2 Score: 0.54

Tuning Hyperparameters...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Model saved as 'optimized_random_forest.pkl'.
Predicted Audience Rating: 67.33
