In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [60]:
df = pd.read_csv('/content/drive/MyDrive/CodSoft Datasets/IMDb Movies India.csv', encoding='latin-1')

In [61]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [63]:
df=df.dropna(subset=['Rating'])

In [64]:
df['Director']=df['Director'].fillna('unkown')
df['Genre']=df['Genre'].fillna('unkown')
df['Actors']=df['Actor 1'].fillna('unkown')

In [65]:
print(df.head())

                                 Name    Year Duration  \
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min   
3                             #Yaaram  (2019)  110 min   
5                ...Aur Pyaar Ho Gaya  (1997)  147 min   
6                           ...Yahaan  (2005)  142 min   
8                  ?: A Question Mark  (2012)   82 min   

                       Genre  Rating  Votes        Director          Actor 1  \
1                      Drama     7.0      8   Gaurav Bakshi     Rasika Dugal   
3            Comedy, Romance     4.4     35      Ovais Khan          Prateik   
5     Comedy, Drama, Musical     4.7    827    Rahul Rawail       Bobby Deol   
6        Drama, Romance, War     7.4  1,086  Shoojit Sircar  Jimmy Sheirgill   
8  Horror, Mystery, Thriller     5.6    326   Allyson Patel        Yash Dave   

                  Actor 2          Actor 3           Actors  
1          Vivek Ghamande    Arvind Jangid     Rasika Dugal  
3              Ishita Raj  Siddhant Kapoor    

In [66]:
features = ['Director', 'Genre', 'Actors']
target = 'Rating'
X=df[features]
y=df[target]

In [67]:
X_train, y_train, X_test, y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [68]:
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (6335, 3) (1584, 3)
Testing set shape: (6335,)


In [69]:
features=['Director', 'Genre', 'Actors']
target='Rating'
X=df[features]
y=df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), features)
    ])
model=Pipeline(steps=[('preprocessor',preprocessor),
                      ('classifier',RandomForestRegressor(n_estimators=100, random_state=42))])
model.fit(X_train, y_train.values)

In [70]:
y_pred=model.predict(X_test)

In [71]:
mse=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)

In [72]:
print("Mean Squared Error:", mse)
print("R-Squared:", r2)

Mean Squared Error: 1.5198242468240297
R-Squared: 0.18251447205527727


In [73]:
comparison_df=pd.DataFrame({'Actual Rating':y_test,'Predicted Rating':y_pred})

In [74]:
print("Actual vs Predicted Ratings:")
print(comparison_df.head())

Actual vs Predicted Ratings:
       Actual Rating  Predicted Rating
9456             3.3          4.719000
14816            5.3          6.199000
3213             5.7          4.895267
3778             7.2          6.442000
5775             3.5          4.489000
