In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
import pandas as pd

# Load the dataset
file_path = '/Users/kaivalyasatav/Documents/Codesoft/Task2 Movie Rating Prediction/IMDb Movies India.csv' 
movie_data = pd.read_csv(file_path, encoding='latin1')

# Step 1: Handle missing values for categorical columns
categorical_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
movie_data[categorical_columns] = movie_data[categorical_columns].fillna('Unknown')

# Step 2: Handle missing values in Rating
movie_data['Rating'] = movie_data['Rating'].fillna(movie_data['Rating'].mean())

# Step 3: Handle non-numeric and missing values in Votes
movie_data['Votes'] = pd.to_numeric(movie_data['Votes'], errors='coerce').fillna(0)

# Step 4: Convert and clean the Duration column
# Extract numeric values from Duration and handle missing values
if 'Duration' in movie_data.columns:
    movie_data['Duration'] = movie_data['Duration'].astype(str).str.extract('(\d+)').astype(float)
    movie_data['Duration'] = movie_data['Duration'].fillna(movie_data['Duration'].mean())
else:
    print("Duration column not found in the dataset.")
    movie_data['Duration'] = 0  # Default value if missing

# Step 5: Check for remaining missing values
print("Missing values per column:")
print(movie_data.isnull().sum())

# Optional: Display the first few rows to confirm the changes
print(movie_data.head())

Missing values per column:
Name          0
Year        528
Duration      0
Genre         0
Rating        0
Votes         0
Director      0
Actor 1       0
Actor 2       0
Actor 3       0
dtype: int64
                                 Name    Year    Duration            Genre  \
0                                         NaN  128.126519            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109.000000            Drama   
2                         #Homecoming  (2021)   90.000000   Drama, Musical   
3                             #Yaaram  (2019)  110.000000  Comedy, Romance   
4                   ...And Once Again  (2010)  105.000000            Drama   

     Rating  Votes            Director       Actor 1             Actor 2  \
0  5.841621    0.0       J.S. Randhawa      Manmauji              Birbal   
1  7.000000    8.0       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2  5.841621    0.0  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3  4.400000   35.0         

In [4]:
# Define features and target
features = ['Duration', 'Genre', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
target = 'Rating'

# Extract features (X) and target variable (y)
X = movie_data[features]
y = movie_data[target]

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

# Define categorical and numeric columns
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
numeric_features = ['Duration', 'Votes']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),  # Keep numeric columns as is
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)  # Encode categorical columns
    ]
)

# Combine preprocessing with a regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [6]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Train the pipeline
pipeline.fit(X_train, y_train)

In [8]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 2.4183952551614216
R^2 Score: -1.512677515208706


In [9]:
import joblib

# Save the pipeline as a file
joblib.dump(pipeline, 'movie_rating_model.pkl')

# To load the model later:
# pipeline = joblib.load('movie_rating_model.pkl')

['movie_rating_model.pkl']

In [10]:
# Example new movie
new_movie = pd.DataFrame({
    'Duration': [120],
    'Genre': ['Action'],
    'Votes': [50000],
    'Director': ['Christopher Nolan'],
    'Actor 1': ['Leonardo DiCaprio'],
    'Actor 2': ['Joseph Gordon-Levitt'],
    'Actor 3': ['Ellen Page']
})

# Predict rating
predicted_rating = pipeline.predict(new_movie)
print(f"Predicted Rating: {predicted_rating[0]}")

Predicted Rating: -2.0115565468122103
