In [5]:
import pandas as pd
import numpy as np

df = pd.read_csv("C:/Users/JAANYA RAHEJA/Downloads/archive (10)/movie_dataset.csv")


In [2]:
df.isnull().sum()  # Identify missing values
df.fillna('', inplace=True)  # Simple approach to fill missing values with empty strings


In [3]:
df['release_date'] = pd.to_datetime(df['release_date'])


In [7]:
df['release_date'] = pd.to_datetime(df['release_date'], format='%Y-%m-%d')


In [8]:
df['release_year'] = df['release_date'].dt.year
df['log_vote_count'] = df['vote_count'].apply(lambda x: np.log1p(x))
df['log_popularity'] = df['popularity'].apply(lambda x: np.log1p(x))


In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

df['genres'] = df['genres'].apply(lambda x: x.split(','))
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df['genres'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)
df = pd.concat([df, genres_df], axis=1)


In [10]:
language_encoded = pd.get_dummies(df['original_language'], prefix='lang')
df = pd.concat([df, language_encoded], axis=1)


In [11]:
threshold = df['vote_average'].quantile(0.90)
df['top_rated'] = (df['vote_average'] >= threshold).astype(int)


In [12]:
features = df.drop(['id', 'title', 'release_date', 'genres', 'original_language', 'overview', 'popularity', 'vote_count', 'vote_average', 'top_rated'], axis=1)
target = df['top_rated']


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)



In [14]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


RandomForestClassifier(random_state=42)

In [15]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.92      0.99      0.96      1678
           1       0.74      0.27      0.40       189

    accuracy                           0.92      1867
   macro avg       0.83      0.63      0.68      1867
weighted avg       0.90      0.92      0.90      1867

Accuracy: 0.9164434922335297


In [16]:
importances = model.feature_importances_
feature_names = features.columns
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values(by='importance', ascending=False)
print(feature_importance_df)


           feature    importance
1   log_vote_count  2.469285e-01
0     release_year  2.074552e-01
2   log_popularity  1.652537e-01
47         lang_en  2.918777e-02
26           Drama  1.869291e-02
..             ...           ...
79         lang_tn  3.683535e-06
41         lang_bs  2.898744e-06
64         lang_km  2.071262e-07
54         lang_ga  0.000000e+00
81         lang_uk  0.000000e+00

[83 rows x 2 columns]


In [20]:
def find_top_rated_movies(df, threshold=0.9):
    """
    Function to find the top-rated movies in the DataFrame.

    Parameters:
    - df (DataFrame): Input DataFrame containing movie data.
    - threshold (float): Threshold for defining top-rated movies based on vote_average.
                         Should be between 0 and 1. Default is 0.9.

    Returns:
    - top_rated_df (DataFrame): DataFrame containing top-rated movies.
    """
    # Calculate threshold for vote_average
    vote_average_threshold = df['vote_average'].quantile(threshold)

    # Filter movies above the threshold
    top_rated_df = df[df['vote_average'] >= vote_average_threshold]

    return top_rated_df[['title', 'release_date', 'genres', 'vote_average']]

# Example usage:
top_rated_movies = find_top_rated_movies(df)
print(top_rated_movies)


                        title release_date  \
0    The Shawshank Redemption   1994-09-23   
1               The Godfather   1972-03-14   
2       The Godfather Part II   1974-12-20   
3            Schindler's List   1993-12-15   
4                12 Angry Men   1957-04-10   
..                        ...          ...   
951          A Man Called Ove   2015-12-25   
952                  Deadpool   2016-02-09   
955       Steamboat Bill, Jr.   1928-05-09   
956                The Killer   1989-03-24   
957           Children of Men   2006-09-22   

                                            genres  vote_average  
0                                  [Drama,  Crime]         8.703  
1                                  [Drama,  Crime]         8.695  
2                                  [Drama,  Crime]         8.577  
3                          [Drama,  History,  War]         8.567  
4                                          [Drama]         8.500  
..                                           