IMPORTING THE LIBRARIES

In [2]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

LOAD THE MERGED DATASET

In [3]:
df = pd.read_csv("merged_movies.csv")

In [None]:
DATA PREPROCESSING

In [4]:
df = df[['movie_id','title_x','overview','genres','keywords','cast','crew']]

# --- Step 2: Handle missing values ---
df.dropna(inplace=True)

# --- Step 3: Convert JSON-like strings into lists ---
def convert(obj):
    L = []
    try:
        for i in ast.literal_eval(obj):
            L.append(i['name'])
    except:
        pass
    return L

df['genres'] = df['genres'].apply(convert)
df['keywords'] = df['keywords'].apply(convert)

# --- Step 4: Extract top 3 cast members ---
def convert_cast(obj):
    L = []
    try:
        count = 0
        for i in ast.literal_eval(obj):
            if count < 3:  # take top 3 actors
                L.append(i['name'])
                count += 1
            else:
                break
    except:
        pass
    return L

df['cast'] = df['cast'].apply(convert_cast)

# --- Step 5: Extract director from crew ---
def fetch_director(obj):
    L = []
    try:
        for i in ast.literal_eval(obj):
            if i['job'] == 'Director':
                L.append(i['name'])
                break
    except:
        pass
    return L

df['crew'] = df['crew'].apply(fetch_director)

# --- Step 6: Clean overview text ---
df['overview'] = df['overview'].apply(lambda x: x.split())

# --- Step 7: Normalize text (remove spaces + lowercase) ---
def clean_text(x):
    return [i.replace(" ", "").lower() for i in x]

df['genres'] = df['genres'].apply(clean_text)
df['keywords'] = df['keywords'].apply(clean_text)
df['cast'] = df['cast'].apply(clean_text)
df['crew'] = df['crew'].apply(clean_text)
df['overview'] = df['overview'].apply(lambda x: [i.lower() for i in x])

# --- Step 8: Create tags column ---
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']

# --- Step 9: Final dataset ---
new_df = df[['movie_id','title_x','tags']]
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
new_df.rename(columns={'title_x': 'title'}, inplace=True)

# --- Step 10: Save preprocessed dataset ---
new_df.to_csv("preprocessed_movies.csv", index=False)
print("✅ Preprocessing complete! Saved as preprocessed_movies.csv")

✅ Preprocessing complete! Saved as preprocessed_movies.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.rename(columns={'title_x': 'title'}, inplace=True)


LOAD THE PREPROCESSED DATASET

In [5]:
df = pd.read_csv("preprocessed_movies.csv")

In [6]:
def convert_to_string(x):
    if isinstance(x, str):
        try:
            x_eval = ast.literal_eval(x)
            if isinstance(x_eval, list):
                return " ".join(x_eval)
            else:
                return x
        except:
            return x
    else:
        if isinstance(x, list):
            return " ".join(x)
        else:
            return str(x)

df['tags'] = df['tags'].apply(convert_to_string)

# --- Step 11: Vectorize tags using CountVectorizer ---
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(df['tags']).toarray()

# --- Step 12: Compute cosine similarity matrix ---
similarity = cosine_similarity(vectors)

# --- Step 13: Recommendation function ---
def recommend(movie):
    if movie not in df['title'].values:
        return f" Movie '{movie}' not found."

    index = df[df['title'] == movie].index[0]
    distances = list(enumerate(similarity[index]))
    movies = sorted(distances, key=lambda x: x[1], reverse=True)[1:11]

    recommended = [df.iloc[i[0]].title for i in movies]
    print(f"\n🎬 Top 10 recommendations for '{movie}':")
    for i, title in enumerate(recommended, 1):
        print(f"{i}. {title}")
    return recommended

# --- Step 14: Example usage ---
recommend("Avatar")  # Try with any movie in your dataset

# --- Step 15: Save model and similarity matrix ---
pickle.dump(df, open('movies.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))

print("✅ Recommendation model saved!")


🎬 Top 10 recommendations for 'Avatar':
1. Titan A.E.
2. Ender's Game
3. Battle: Los Angeles
4. Independence Day
5. The Lovers
6. Edge of Tomorrow
7. Jupiter Ascending
8. The Host
9. The Fifth Element
10. Star Trek Into Darkness
✅ Recommendation model saved!
