<a href="https://colab.research.google.com/github/hira-14/movie_recommender/blob/main/02_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Standard library
import pickle

# Third-party
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer

# Colab runtime helper used to attach Google Drive
from google.colab import drive

# Make Drive contents available under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory under the Drive mount point that the cells below read their input
# from and write their outputs to.
DATA_PATH = '/content/drive/MyDrive/ml-1m/ml-1m'

In [None]:

# Read the pickled merged frame stored in DATA_PATH and report its dimensions.
merged_pickle = DATA_PATH + '/raw_merged.pkl'
df = pd.read_pickle(merged_pickle)
print("Initial shape:", df.shape)

Initial shape: (1000209, 12)


In [None]:
# Show the column labels of the loaded frame.
df.columns

Index(['user_id', 'movie_id', 'rating', 'timestamp', 'gender', 'age',
       'occupation', 'zipcode', 'title', 'genres', 'year', 'age_group'],
      dtype='object')

In [None]:
# Genre Encoding & Release Year Extraction
# One-hot encode the pipe-separated genre labels into one 0/1 column per genre.
print("Original genres:", df['genres'].iloc[0])
genre_dummies = df['genres'].str.get_dummies(sep='|')

# Drop indicator columns left behind by an earlier run of this cell before
# appending the fresh ones: a plain concat would add a second copy of every
# genre column each time the cell is re-executed.
df = pd.concat(
    [df.drop(columns=genre_dummies.columns, errors='ignore'), genre_dummies],
    axis=1,
)

# Extract release year from title: the first parenthesised 4-digit number.
# NOTE(review): df already has a `year` column when loaded; confirm that
# `release_year` is not redundant with it.
df['release_year'] = (
    df['title']
    .str.extract(r'\((\d{4})\)')[0]
    .astype(float)  # float first so titles without a match can hold NaN
)

# Handle missing years (if any). Titles without a match get the sentinel 0;
# the recorded run reports none, but downstream code should treat 0 as
# "unknown" rather than as a real year.
missing_year_count = df['release_year'].isnull().sum()
print(f"Movies with missing release year: {missing_year_count}")
df['release_year'] = df['release_year'].fillna(0).astype(int)

print("\nAfter feature engineering:")
print("Genre columns:", [col for col in df.columns if col in genre_dummies.columns][:5], "...")
print("Release years:", df['release_year'].value_counts().head())
display(df[['title', 'genres', 'release_year'] + list(genre_dummies.columns[:3])].head(3))

Original genres: Drama
Movies with missing release year: 0

After feature engineering:
Genre columns: ['Action', 'Adventure', 'Animation', "Children's", 'Comedy'] ...
Release years: release_year
1999    86833
1998    68226
1997    65402
1995    60754
1996    59382
Name: count, dtype: int64


Unnamed: 0,title,genres,release_year,Action,Adventure,Animation
0,One Flew Over the Cuckoo's Nest (1975),Drama,1975,0,0,0
1,James and the Giant Peach (1996),Animation|Children's|Musical,1996,0,0,1
2,My Fair Lady (1964),Musical|Romance,1964,0,0,0


In [None]:
# TF-IDF on Movie Titles
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(
    max_features=2000,    # Limit to top 2000 terms
    stop_words='english', # Remove common English words
    ngram_range=(1, 2)    # Include 1-word and 2-word phrases
)

# NOTE(review): the vectorizer is fitted on every row of df (one row per
# rating), not on unique titles, so a title's repetition count feeds into the
# document frequencies. Confirm this weighting is intended.
tfidf_matrix = tfidf.fit_transform(df['title'])

# Persist the fitted vectorizer and the matrix. The context manager closes the
# pickle file deterministically instead of leaking the handle.
# NOTE(review): matrix rows follow df's row order at this point; a later cell
# re-sorts df before saving fe_data.pkl, so align on df.index, not on position.
with open(DATA_PATH + '/tfidf_title.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)
sp.save_npz(DATA_PATH + '/tfidf_title_matrix.npz', tfidf_matrix)

feature_names = tfidf.get_feature_names_out()
print("TF-IDF Matrix shape:", tfidf_matrix.shape)
print("Sample feature names:", feature_names[:10])
print("First movie title:", df['title'].iloc[0])
print("First TF-IDF vector (non-zero elements):")
# Show the terms that actually carry weight for the first row; slicing the
# first five dense entries only displayed zeros.
first_row = tfidf_matrix[0]
for term, weight in zip(feature_names[first_row.indices], first_row.data):
    print(f"  {term}: {weight:.4f}")

TF-IDF Matrix shape: (1000209, 2000)
Sample feature names: ['000' '10' '10 things' '101' '101 dalmatians' '12' '12 angry' '13'
 '13 1995' '13th']
First movie title: One Flew Over the Cuckoo's Nest (1975)
First TF-IDF vector (non-zero elements):
[0. 0. 0. 0. 0.]


In [None]:
# Interaction Features
# Rating counts per user and movie. transform('size') broadcasts each group's
# row count back onto its rows and overwrites the column if it already exists,
# so the cell can be re-run safely (df.join raises on the overlapping column
# names the second time around).
df['user_total_ratings'] = df.groupby('user_id')['user_id'].transform('size')
df['movie_total_ratings'] = df.groupby('movie_id')['movie_id'].transform('size')

# Order each user's ratings chronologically so that shift(1) below returns the
# timestamp of that user's previous rating.
df = df.sort_values(['user_id', 'timestamp'])

# Whole days elapsed since the same user's previous rating.
df['prev_ts'] = df.groupby('user_id')['timestamp'].shift(1)
df['recency_days'] = (df['timestamp'] - df['prev_ts']).dt.days
# NOTE(review): a user's first rating has no predecessor (prev_ts is NaT) and
# is filled with 0, so it is indistinguishable from a same-day rating and is
# flagged as within a 7-day session below. Confirm this is the intended
# treatment.
df['recency_days'] = df['recency_days'].fillna(0).astype(int)

# 1 when the previous rating by the same user was at most 7 days earlier.
df['session_within_7d'] = (df['recency_days'] <= 7).astype(int)

print("\nInteraction features added:")
print("User rating counts:", df['user_total_ratings'].describe())
print("Recency days:", df['recency_days'].value_counts().head())
print("Session flags:", df['session_within_7d'].value_counts())
display(df[['user_id', 'movie_id', 'timestamp', 'prev_ts', 'recency_days', 'session_within_7d']].head(5))


Interaction features added:
User rating counts: count    1.000209e+06
mean     3.899080e+02
std      3.247402e+02
min      2.000000e+01
25%      1.470000e+02
50%      3.020000e+02
75%      5.440000e+02
max      2.314000e+03
Name: user_total_ratings, dtype: float64
Recency days: recency_days
0    985533
1      1705
2      1101
3       913
6       718
Name: count, dtype: int64
Session flags: session_within_7d
1    991909
0      8300
Name: count, dtype: int64


Unnamed: 0,user_id,movie_id,timestamp,prev_ts,recency_days,session_within_7d
31,1,3186,2000-12-31 22:00:19,NaT,0,1
22,1,1270,2000-12-31 22:00:55,2000-12-31 22:00:19,0,1
27,1,1721,2000-12-31 22:00:55,2000-12-31 22:00:55,0,1
37,1,1022,2000-12-31 22:00:55,2000-12-31 22:00:55,0,1
24,1,2340,2000-12-31 22:01:43,2000-12-31 22:00:55,0,1


In [None]:
# Save Processed Data

# Columns written to the CSV file: the two identifiers plus the interaction
# features.
interaction_cols = [
    'user_id',
    'movie_id',
    'user_total_ratings',
    'movie_total_ratings',
    'recency_days',
    'session_within_7d',
]

# Full frame as a pickle; interaction subset as CSV without the index column.
df.to_pickle(DATA_PATH + '/fe_data.pkl')
df.loc[:, interaction_cols].to_csv(DATA_PATH + '/interaction_features.csv', index=False)

# Report where the two files were written.
for saved_line in (
    "Saved files:",
    f"- Full dataset: {DATA_PATH}/fe_data.pkl",
    f"- Interaction features: {DATA_PATH}/interaction_features.csv",
):
    print(saved_line)

Saved files:
- Full dataset: /content/drive/MyDrive/ml-1m/ml-1m/fe_data.pkl
- Interaction features: /content/drive/MyDrive/ml-1m/ml-1m/interaction_features.csv
