In [1]:
import numpy as np 
import pandas as pd

In [2]:
df = pd.read_csv('/kaggle/input/finaldata/processed-movie.csv')
df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,...,company,runtime,profit_ratio,performance_class,score_cat,budget_cat,director_success_score,actor_success_score,director_category,actor_category
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,...,Warner Bros.,146.0,2.473620,Hit,Excellent,Mid Budget,6.0,25.0,Medium-Director,High-Performer-Actor
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,...,Columbia Pictures,104.0,13.078468,All-Time Blockbuster,Average,Low Budget,20.0,17.0,High-Performer-Director,Mid-Actor
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,...,Lucasfilm,124.0,29.909726,All-Time Blockbuster,Excellent,Mid Budget,19.0,30.0,High-Performer-Director,High-Performer-Actor
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,...,Paramount Pictures,88.0,23.843868,All-Time Blockbuster,Very Good,Low Budget,36.0,22.0,Legendary-Director,High-Performer-Actor
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,...,Orion Pictures,98.0,6.641057,Blockbuster,Very Good,Mid Budget,16.0,34.0,High-Performer-Director,High-Performer-Actor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7657,Aloha Surf Hotel,Unknown Rating,Comedy,2020,"November 5, 2020 (United States)",7.1,14.0,Stefan C. Schaefer,Stefan C. Schaefer,Augie Tulba,...,Abominable Pictures,90.0,0.985647,Flop,Very Good,Mid Budget,0.0,0.0,Low-Director,Low-Actor
7658,More to Life,Unknown Rating,Drama,2020,"October 23, 2020 (United States)",3.1,18.0,Joseph Ebanks,Joseph Ebanks,Shannon Bond,...,Unknown Company,90.0,2886.536714,All-Time Blockbuster,Poor,Low Budget,15.0,15.0,High-Performer-Director,Mid-Actor
7659,Dream Round,Unknown Rating,Comedy,2020,"February 7, 2020 (United States)",4.7,36.0,Dusty Dukatz,Lisa Huston,Michael Saquella,...,Cactus Blue Entertainment,90.0,0.985647,Flop,Poor,Mid Budget,0.0,0.0,Low-Director,Low-Actor
7660,Saving Mbango,Unknown Rating,Drama,2020,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,...,Embi Productions,104.0,343.927779,All-Time Blockbuster,Average,Low Budget,15.0,15.0,High-Performer-Director,Mid-Actor


In [3]:
df['performance_class'].value_counts()

performance_class
Flop                    3607
Hit                     1452
Super Hit                883
Average                  738
All-Time Blockbuster     502
Blockbuster              480
Name: count, dtype: int64

In [4]:
df_balanced = df.copy()
def merge_performance(r):
    if r in ['Flop', 'Average']:
        return 'Low/Flop'
    elif r in ['Hit', 'Super Hit']:
        return 'Hit/Success'
    elif r in ['Blockbuster', 'All-Time Blockbuster']:
        return 'Big Hit'

df_balanced['performance_class'] = df_balanced['performance_class'].apply(merge_performance)
df_balanced['performance_class'].value_counts()


performance_class
Low/Flop       4345
Hit/Success    2335
Big Hit         982
Name: count, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt


features = [
    'budget_cat', 'score_cat', 'director_success_score', 'actor_success_score',
    'genre', 'runtime'
]

target = 'performance_class'



X = df_balanced[features].copy()
y = df_balanced[target].copy()


cat_cols = ['budget_cat', 'score_cat', 'genre']
num_cols = ['director_success_score', 'actor_success_score', 'runtime']


ohe = OneHotEncoder(sparse_output=False, drop='first') #encoding
X_cat = ohe.fit_transform(X[cat_cols])
X_cat_df = pd.DataFrame(X_cat, columns=ohe.get_feature_names_out(cat_cols), index=X.index)

X = X.drop(columns=cat_cols)
X = pd.concat([X, X_cat_df], axis=1)


scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

train_accs = []
test_accs = []
depths = range(1, 11)  # depths 1–20

print("Depth | Train Accuracy | Test Accuracy")
print("----------------------------------------")

for d in depths:
    model = DecisionTreeClassifier(
    max_depth=d,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42
    )
    
    model.fit(X_train, y_train)

    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)

    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"{d:>5} | {train_acc:.4f}        | {test_acc:.4f}")

# Best depth based on test accuracy
best_depth = depths[test_accs.index(max(test_accs))]
best_test_acc = max(test_accs)

print("\nBest depth based on test accuracy:", best_depth)
print("Best test accuracy:", best_test_acc)

model = DecisionTreeClassifier(max_depth=best_depth, min_samples_leaf=5,random_state=42)
dt = model.fit(X_train, y_train)

y_pred = dt.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Depth | Train Accuracy | Test Accuracy
----------------------------------------
    1 | 0.5655        | 0.5714
    2 | 0.6616        | 0.6732
    3 | 0.5668        | 0.5897
    4 | 0.5668        | 0.5897
    5 | 0.6081        | 0.6354
    6 | 0.6644        | 0.6791
    7 | 0.6673        | 0.6765
    8 | 0.7096        | 0.7110
    9 | 0.7058        | 0.7078
   10 | 0.7137        | 0.7097

Best depth based on test accuracy: 8
Best test accuracy: 0.7110241356816699
[[ 73  94  24]
 [ 13 346  93]
 [ 10 223 657]]
              precision    recall  f1-score   support

     Big Hit       0.76      0.38      0.51       191
 Hit/Success       0.52      0.77      0.62       452
    Low/Flop       0.85      0.74      0.79       890

    accuracy                           0.70      1533
   macro avg       0.71      0.63      0.64      1533
weighted avg       0.74      0.70      0.70      1533



In [6]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[136  55   0]
 [ 83 369   0]
 [ 62 247 581]]
              precision    recall  f1-score   support

     Big Hit       0.48      0.71      0.58       191
 Hit/Success       0.55      0.82      0.66       452
    Low/Flop       1.00      0.65      0.79       890

    accuracy                           0.71      1533
   macro avg       0.68      0.73      0.67      1533
weighted avg       0.80      0.71      0.72      1533



In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from difflib import get_close_matches

# Step 1: One-Hot Encode categorical features
genre_ohe = pd.get_dummies(df['genre'], prefix='genre')
director_ohe = pd.get_dummies(df['director_category'], prefix='dir')
actor_ohe = pd.get_dummies(df['actor_category'], prefix='actor')

# Step 2: Combine all OHE features
feature_matrix = pd.concat([genre_ohe, director_ohe, actor_ohe], axis=1)

# Step 3: Compute cosine similarity using inbuilt function
similarity_matrix = cosine_similarity(feature_matrix)

# Step 4: Recommendation function
def recommend(movie_title, top_n=5):
    # Find closest match
    matches = get_close_matches(movie_title, df['name'].values)
    if not matches:
        return f"Movie '{movie_title}' not found."
    
    idx = df[df['name'] == matches[0]].index[0]
    
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    top_indices = [i[0] for i in sim_scores[1:top_n+1]]  # top N similar movies
    return df['name'].iloc[top_indices]

# Test the recommender
print("Recommendations for 'Inception':")
print(recommend("Inception"))

print("\nRecommendations for 'Titanic':")
print(recommend("Titanic"))


print("\nRecommendations for 'Star Wars: Episode V - The Empire Strikes Back':")
print(recommend("Star Wars: Episode V - The Empire Strikes Back"))

Recommendations for 'Inception':
92     Indiana Jones and the Raiders of the Lost Ark
206                                     Blade Runner
233                                          Firefox
359                                    Sudden Impact
475                                   The Terminator
Name: name, dtype: object

Recommendations for 'Titanic':
225        Rocky III
339    Staying Alive
503      The Natural
656       Pale Rider
663         Rocky IV
Name: name, dtype: object

Recommendations for 'Star Wars: Episode V - The Empire Strikes Back':
254                                 Death Wish II
333    Star Wars: Episode VI - Return of the Jedi
394                                Uncommon Valor
524                                    Dreamscape
700                                     Tuff Turf
Name: name, dtype: object
