<a href="https://colab.research.google.com/github/hira-14/movie_recommender/blob/main/05_Session_Based_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import scipy.sparse as sp
import pickle
from scipy.sparse import csr_matrix, save_npz
from collections import defaultdict

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
DATA_PATH = '/content/drive/MyDrive/ml-1m/ml-1m'

In [4]:
df = pd.read_pickle(DATA_PATH + '/fe_data.pkl')

In [5]:
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zipcode,title,genres,...,Sci-Fi,Thriller,War,Western,release_year,user_total_ratings,movie_total_ratings,prev_ts,recency_days,session_within_7d
31,1,3186,4,2000-12-31 22:00:19,F,1,10,48067,"Girl, Interrupted (1999)",Drama,...,0,0,0,0,1999,53,431,NaT,0,1
22,1,1270,5,2000-12-31 22:00:55,F,1,10,48067,Back to the Future (1985),Comedy|Sci-Fi,...,1,0,0,0,1985,53,2583,2000-12-31 22:00:19,0,1
27,1,1721,4,2000-12-31 22:00:55,F,1,10,48067,Titanic (1997),Drama|Romance,...,0,0,0,0,1997,53,1546,2000-12-31 22:00:55,0,1
37,1,1022,5,2000-12-31 22:00:55,F,1,10,48067,Cinderella (1950),Animation|Children's|Musical,...,0,0,0,0,1950,53,577,2000-12-31 22:00:55,0,1
24,1,2340,3,2000-12-31 22:01:43,F,1,10,48067,Meet Joe Black (1998),Romance,...,0,0,0,0,1998,53,344,2000-12-31 22:00:55,0,1


In [9]:
df = df.sort_values(['user_id', 'timestamp'])


In [10]:
print("Data loaded. Shape:", df.shape)
print("Columns:", df.columns.tolist())

Data loaded. Shape: (1000209, 36)
Columns: ['user_id', 'movie_id', 'rating', 'timestamp', 'gender', 'age', 'occupation', 'zipcode', 'title', 'genres', 'year', 'age_group', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'release_year', 'user_total_ratings', 'movie_total_ratings', 'prev_ts', 'recency_days', 'session_within_7d']


In [11]:
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zipcode,title,genres,...,Sci-Fi,Thriller,War,Western,release_year,user_total_ratings,movie_total_ratings,prev_ts,recency_days,session_within_7d
31,1,3186,4,2000-12-31 22:00:19,F,1,10,48067,"Girl, Interrupted (1999)",Drama,...,0,0,0,0,1999,53,431,NaT,0,1
22,1,1270,5,2000-12-31 22:00:55,F,1,10,48067,Back to the Future (1985),Comedy|Sci-Fi,...,1,0,0,0,1985,53,2583,2000-12-31 22:00:19,0,1
27,1,1721,4,2000-12-31 22:00:55,F,1,10,48067,Titanic (1997),Drama|Romance,...,0,0,0,0,1997,53,1546,2000-12-31 22:00:55,0,1
37,1,1022,5,2000-12-31 22:00:55,F,1,10,48067,Cinderella (1950),Animation|Children's|Musical,...,0,0,0,0,1950,53,577,2000-12-31 22:00:55,0,1
24,1,2340,3,2000-12-31 22:01:43,F,1,10,48067,Meet Joe Black (1998),Romance,...,0,0,0,0,1998,53,344,2000-12-31 22:00:55,0,1


In [18]:
# Cell 2: Session Identification (Revised Version)
# Create session IDs based on the existing session_within_7d flag
df['session_num'] = df.groupby('user_id')['session_within_7d'].transform(
    lambda x: (x == 0).cumsum()
)
df['session_id'] = 'u' + df['user_id'].astype(str) + '_s' + df['session_num'].astype(str)

# Filter out single-item sessions
session_sizes = df.groupby('session_id').size()
valid_sessions = session_sizes[session_sizes > 1].index
df = df[df['session_id'].isin(valid_sessions)]

print(f"Sessions identified: {df['session_id'].nunique()}")
print(f"Average session length: {df.groupby('session_id').size().mean():.2f}")
print("Sample session transitions:")
display(df[['user_id', 'movie_id', 'title', 'timestamp', 'session_id']].head(10))

Sessions identified: 12232
Average session length: 81.60
Sample session transitions:


Unnamed: 0,user_id,movie_id,title,timestamp,session_id
31,1,3186,"Girl, Interrupted (1999)",2000-12-31 22:00:19,u1_s0
22,1,1270,Back to the Future (1985),2000-12-31 22:00:55,u1_s0
27,1,1721,Titanic (1997),2000-12-31 22:00:55,u1_s0
37,1,1022,Cinderella (1950),2000-12-31 22:00:55,u1_s0
24,1,2340,Meet Joe Black (1998),2000-12-31 22:01:43,u1_s0
36,1,1836,"Last Days of Disco, The (1998)",2000-12-31 22:02:52,u1_s0
3,1,3408,Erin Brockovich (2000),2000-12-31 22:04:35,u1_s0
7,1,2804,"Christmas Story, A (1983)",2000-12-31 22:11:59,u1_s0
47,1,1207,To Kill a Mockingbird (1962),2000-12-31 22:11:59,u1_s0
0,1,1193,One Flew Over the Cuckoo's Nest (1975),2000-12-31 22:12:40,u1_s0


In [21]:
# Cell 3: Build Transition Counts
# Create ordered movie sequences per session
sessions = df.groupby('session_id')['movie_id'].apply(list).reset_index(name='sequence')

# Build transition counts
transition_counts = defaultdict(int)

for seq in sessions['sequence']:
    for i in range(len(seq) - 1):
        transition_counts[(seq[i], seq[i+1])] += 1

# Create matrix indices
all_movies = df['movie_id'].unique()
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(all_movies)}

# Initialize sparse matrix
n_movies = len(all_movies)
transition_matrix = np.zeros((n_movies, n_movies), dtype=np.float32)

# Fill matrix with counts
for (from_movie, to_movie), count in transition_counts.items():
    i = movie_to_idx.get(from_movie)
    j = movie_to_idx.get(to_movie)
    if i is not None and j is not None:
        transition_matrix[i, j] = count

print(f"Transition matrix shape: {transition_matrix.shape}")
print(f"Total transitions: {int(transition_matrix.sum())}")
print(f"Sparsity: {100 * (1 - np.count_nonzero(transition_matrix) / np.prod(transition_matrix.shape)):.2f}%")

Transition matrix shape: (3704, 3704)
Total transitions: 985869
Sparsity: 96.30%


In [23]:
# Cell 4: Convert to Probabilities
# Normalize rows to get transition probabilities
row_sums = transition_matrix.sum(axis=1)
transition_probs = np.divide(transition_matrix, row_sums[:, np.newaxis],
                             out=np.zeros_like(transition_matrix),
                             where=row_sums[:, np.newaxis] != 0)

# Handle NaN values
transition_probs = np.nan_to_num(transition_probs, nan=0)

# Convert to sparse format for storage
sparse_transition_probs = csr_matrix(transition_probs)
save_npz(DATA_PATH + '/session_transitions.npz', sparse_transition_probs)

print("Transition probability matrix saved")
print("Sample transitions for movie 1 (Toy Story):")
movie1_idx = movie_to_idx[1]
top5_next = np.argsort(transition_probs[movie1_idx])[::-1][:5]
for idx in top5_next:
    if transition_probs[movie1_idx, idx] > 0:
        movie_id = all_movies[idx]
        title = df[df['movie_id'] == movie_id]['title'].iloc[0]
        print(f"{title}: {transition_probs[movie1_idx, idx]:.4f}")

Transition probability matrix saved
Sample transitions for movie 1 (Toy Story):
Toy Story 2 (1999): 0.0614
Groundhog Day (1993): 0.0507
Beauty and the Beast (1991): 0.0295
Snow White and the Seven Dwarfs (1937): 0.0174
Babe (1995): 0.0164


In [24]:
# Cell 5: Generate Recommendations
def get_session_recommendations(user_id, n=10):
    """Generate session-based recommendations for a user"""
    # Get user's most recent session
    user_sessions = df[df['user_id'] == user_id].sort_values('timestamp', ascending=False)
    if user_sessions.empty:
        return pd.Series([], name='title')

    last_session_id = user_sessions['session_id'].iloc[0]
    session_movies = user_sessions[user_sessions['session_id'] == last_session_id]

    if len(session_movies) == 0:
        return pd.Series([], name='title')

    # Get last movie in session
    last_movie = session_movies.sort_values('timestamp').iloc[-1]['movie_id']
    last_idx = movie_to_idx.get(last_movie)

    if last_idx is None or transition_probs[last_idx].sum() == 0:
        return pd.Series([], name='title')

    # Get top transitions
    seen_movies = set(user_sessions['movie_id'])
    probs = transition_probs[last_idx].copy()

    # Set seen movies probability to zero
    for movie_id in seen_movies:
        if movie_id in movie_to_idx:
            seen_idx = movie_to_idx[movie_id]
            probs[seen_idx] = 0

    # Get top recommendations
    top_indices = np.argsort(probs)[::-1][:n]
    recommendations = []
    for idx in top_indices:
        if probs[idx] > 0:
            movie_id = all_movies[idx]
            title = df[df['movie_id'] == movie_id]['title'].iloc[0]
            recommendations.append(title)

    return pd.Series(recommendations, name='title')

# Test with sample user
sample_user = 1
session_recs = get_session_recommendations(sample_user, n=10)
print(f"\nTop 10 session-based recommendations for user {sample_user}:")
print(session_recs)


Top 10 session-based recommendations for user 1:
0    Aladdin and the King of Thieves (1996)
1                          Space Jam (1996)
2                      Pete's Dragon (1977)
3               Return of Jafar, The (1993)
4                      Dirty Dancing (1987)
5       Transformers: The Movie, The (1986)
6         Great Mouse Detective, The (1986)
7                     Goofy Movie, A (1995)
8                    Pagemaster, The (1994)
9              All Dogs Go to Heaven (1989)
Name: title, dtype: object


In [27]:
# Cell 6: Save Results and Validation
# Save recommendations
session_recs_df = pd.DataFrame({
    'user_id': sample_user,
    'movie_id': [df[df['title'] == title]['movie_id'].iloc[0] for title in session_recs],
    'title': session_recs.values,
    'rank': range(1, len(session_recs)+1)
})
session_recs_df.to_csv(DATA_PATH + '/session_recs_user1.csv', index=False)

# Spot-check recommendations
print("\nSpot-checking recommendations:")
user_last_movie = df[df['user_id'] == sample_user].sort_values('timestamp').iloc[-1]['title']
print(f"User {sample_user}'s last watched movie: {user_last_movie}")

# Check what movies are typically watched after a blockbuster
def common_transitions(movie_title, n=5):
    try:
        movie_id = df[df['title'] == movie_title]['movie_id'].iloc[0]
        idx = movie_to_idx.get(movie_id)
        if idx is None:
            return pd.DataFrame()

        top_indices = np.argsort(transition_probs[idx])[::-1][1:n+1]
        transitions = []
        for j in top_indices:
            if transition_probs[idx, j] > 0:
                movie_id_j = all_movies[j]
                title_j = df[df['movie_id'] == movie_id_j]['title'].iloc[0]
                transitions.append((title_j, transition_probs[idx, j]))
        return pd.DataFrame(transitions, columns=['movie', 'probability'])
    except:
        return pd.DataFrame()

print("\nCommon transitions after 'The Matrix (1999)':")
print(common_transitions('The Matrix (1999)'))

print("\nCommon transitions after 'Toy Story (1995)':")
print(common_transitions('Toy Story (1995)'))

# Compare to other models
content_recs = pd.read_csv(DATA_PATH + '/content_recs_sample.csv')
cf_recs = pd.read_csv(DATA_PATH + '/cf_recs_user1_svd.csv')

print("\nModel comparison for user 1:")
print("- Content-based:", content_recs['title'].tolist()[:3])
print("- Collaborative:", cf_recs['title'].tolist()[:3])
print("- Session-based:", session_recs.values.tolist()[:3])


Spot-checking recommendations:
User 1's last watched movie: Pocahontas (1995)

Common transitions after 'The Matrix (1999)':
Empty DataFrame
Columns: []
Index: []

Common transitions after 'Toy Story (1995)':
                                    movie  probability
0                    Groundhog Day (1993)     0.050749
1             Beauty and the Beast (1991)     0.029483
2  Snow White and the Seven Dwarfs (1937)     0.017400
3                             Babe (1995)     0.016433
4              Shakespeare in Love (1998)     0.015950

Model comparison for user 1:
- Content-based: ['Wide Awake (1998)', 'Babe (1995)', 'Pollyanna (1960)']
- Collaborative: ['Sanjuro (1962)', 'Gladiator (2000)', 'Bridge on the River Kwai, The (1957)']
- Session-based: ['Aladdin and the King of Thieves (1996)', 'Space Jam (1996)', "Pete's Dragon (1977)"]
