In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the dataset
# NOTE: absolute, machine-specific path; adjust it when running elsewhere.
file_path = 'C:/Users/VENUGOPAL BADRI/Downloads/IMDb Movies India.csv/IMDb Movies India.csv'
data = pd.read_csv(file_path, encoding='latin1')

# Data Preprocessing
# Drop rows where 'Rating' (target) is missing
data = data.dropna(subset=['Rating'])

# Clean 'Year' column: keep the first run of four digits found in the text.
# expand=False makes extract() return a Series instead of a one-column
# DataFrame; rows without a four-digit match become NaN.
data['Year'] = data['Year'].str.extract(r'(\d{4})', expand=False).astype(float)

# Convert 'Duration' to numeric: strip the literal ' min' suffix
# (regex=False -> plain substring replacement). Any cell that is still not a
# plain number is coerced to NaN instead of aborting the script; the
# numerical imputer further down fills those gaps.
data['Duration'] = pd.to_numeric(
    data['Duration'].str.replace(' min', '', regex=False),
    errors='coerce',
).astype(float)

# Convert 'Votes' to numeric: drop thousands separators, then coerce anything
# that is not a plain number to NaN (same rationale as 'Duration').
data['Votes'] = pd.to_numeric(
    data['Votes'].str.replace(',', '', regex=False),
    errors='coerce',
).astype(float)

# Select features and target
features = ['Year', 'Duration', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Votes']
X = data[features]
y = data['Rating']

# Column groups: each group gets its own preprocessing branch below.
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
numerical_features = ['Year', 'Duration', 'Votes']

# Numeric branch: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps prediction from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch (numeric first, then categorical).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full estimator: preprocessing followed by a 100-tree random forest with a
# fixed seed.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', forest),
])

# Hold out 20% of the rows for testing; the fixed seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows, then predict ratings for the held-out rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report the error as RMSE (square root of the mean squared error).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Random Forest Regressor - Root Mean Squared Error: {rmse}')

# Function to predict rating for new data
def predict_movie_rating(input_data):
    """Return the rating the module-level ``pipeline`` predicts for one movie.

    ``input_data`` describes a single movie (for example a dict keyed by the
    names in the module-level ``features`` list). It is wrapped in a one-row
    DataFrame whose columns are ``features`` and passed to
    ``pipeline.predict``; the first (only) prediction is returned.
    """
    single_row = pd.DataFrame([input_data], columns=features)
    predictions = pipeline.predict(single_row)
    return predictions[0]

# Example prediction: a made-up movie described by one value per feature.
new_movie = {
    'Year': 2023, 'Duration': 120, 'Votes': 1500,
    'Genre': 'Drama',
    'Director': 'Example Director',
    'Actor 1': 'Example Actor 1',
    'Actor 2': 'Example Actor 2',
    'Actor 3': 'Example Actor 3',
}

# Run the helper on the sample and show the result.
predicted_rating = predict_movie_rating(new_movie)
print(f'Predicted Rating for the new movie: {predicted_rating}')


Random Forest Regressor - Root Mean Squared Error: 1.0827057917270941
Predicted Rating for the new movie: 7.1330000000000044
