In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the full ratings table, then print its column / dtype / memory summary.
ratings = pd.read_csv(filepath_or_buffer='../data/ratings.csv')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB


In [3]:
# Read the small ratings sample, then print its column / dtype / memory summary.
ratings_small = pd.read_csv(filepath_or_buffer='../data/ratings_small.csv')
ratings_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


Loading ratings files...
Small ratings shape: (100004, 4)
Full ratings shape: (26024289, 4)

Analyzing user overlap...
Number of matching records: 99676
Percentage of small ratings found in full ratings: 99.67%

Users in small_ratings: 671
Users from small_ratings found in full_ratings: 670
Users from small_ratings NOT found in full_ratings: 1
Unmatched users: {np.int64(519)}

First 10 unmatched user IDs: [np.int64(519)]

Sample records from unmatched users:
       userId  movieId  rating   timestamp       composite_key
74759     519       50     4.0  1468758676   50_4.0_1468758676
74760     519      104     4.0  1469927080  104_4.0_1469927080
74761     519      111     4.0  1468758621  111_4.0_1468758621
74762     519      216     4.0  1469927082  216_4.0_1469927082
74763     519      223     4.5  1468759080  223_4.5_1468759080
74764     519      235     4.0  1468758651  235_4.0_1468758651
74765     519      260     5.0  1468928013  260_5.0_1468928013
74766     519      288     4.5  1

KeyError: "['movieId', 'rating', 'timestamp'] not in index"

In [5]:
import pandas as pd
from collections import Counter

def _composite_key(frame):
    """Return a '<movieId>_<rating to 1 decimal>_<timestamp>' string key per row of `frame`."""
    return (
        frame['movieId'].astype(str) + '_' +
        # .astype(str) keeps the concatenation valid when the frame is empty
        # (map() on an empty float column would otherwise stay float).
        frame['rating'].map(lambda x: f"{float(x):.1f}").astype(str) + '_' +
        frame['timestamp'].astype(str)
    )


def merge_ratings_files(
    small_path='../data/ratings_small.csv',
    full_path='../data/ratings.csv',
    output_path='../data/merged_ratings.csv',
):
    """
    Re-key the rows of the small ratings file onto the userIds of the full ratings file.

    - Row-level matches are detected by 'composite_key' (movieId, rating, timestamp).
    - Each small user that has at least one row-level match is mapped to ONE full
      userId: the full userId that appears most often among that user's matches.
    - ALL small rows of such a user (matched or not) are written with that full userId.
    - Small users without any row-level match get new userIds starting after the
      maximum userId in the full file.

    The composite key does not include the user, so one small row can match rows of
    several full users. Only the modal mapping is used for the output; coincidental
    matches to other full users never add rows to the result. The output therefore
    has exactly one row per row of the small file, in the small file's row order,
    with movieId / rating / timestamp taken from the small file (for matched rows
    these equal the full file's values, the rating to one decimal place).

    NOTE(review): the saved file contains only the re-keyed small rows; rows of the
    full file are not appended. Confirm this is the intended meaning of "merged".

    Parameters
    ----------
    small_path : str
        CSV with columns userId, movieId, rating, timestamp (the small file).
    full_path : str
        CSV with the same columns (the full file).
    output_path : str
        Where the re-keyed ratings are written as CSV (without the index).

    Returns
    -------
    (final_merged, matches, unmatched_users)
        final_merged    : DataFrame with columns movieId, rating, timestamp, userId
                          (int64 / float32 / int64 / int64).
        matches         : the raw inner join of small and full on 'composite_key',
                          overlapping columns suffixed '_small' / '_full'.
        unmatched_users : set of small userIds that have no row-level match.
    """

    print("Loading ratings files...")

    small_ratings = pd.read_csv(small_path)
    print(f"Small ratings shape: {small_ratings.shape}")

    full_ratings = pd.read_csv(full_path)
    print(f"Full ratings shape: {full_ratings.shape}")

    # Build composite keys (rating formatted to 1 decimal to be safe)
    small_ratings['composite_key'] = _composite_key(small_ratings)
    full_ratings['composite_key'] = _composite_key(full_ratings)

    print("\nAnalyzing user overlap...")

    # Inner join on composite_key; overlapping cols get suffixes.
    # This join can be many-to-many, so it is used for analysis and for deriving
    # the user mapping only, never as the source of output rows.
    matches = small_ratings.merge(
        full_ratings, on='composite_key', how='inner', suffixes=('_small', '_full')
    )

    match_pct = len(matches) / len(small_ratings) * 100 if len(small_ratings) else 0.0
    print(f"Number of matching records: {len(matches)}")
    print(f"Percentage of small ratings found in full ratings: {match_pct:.2f}%")

    small_users = set(small_ratings['userId'].unique())
    matched_users = set(matches['userId_small'].unique())
    unmatched_users = small_users - matched_users

    print(f"\nUsers in small_ratings: {len(small_users)}")
    print(f"Users from small_ratings found in full_ratings: {len(matched_users)}")
    print(f"Users from small_ratings NOT found in full_ratings: {len(unmatched_users)}")
    print(f"Unmatched users: {unmatched_users}")

    if unmatched_users:
        first10 = sorted(unmatched_users)[:10]
        print(f"\nFirst 10 unmatched user IDs: {first10}")
        print("\nSample records from unmatched users:")
        print(small_ratings[small_ratings['userId'].isin(unmatched_users)].head(10))

    # Stable mapping userId_small -> userId_full derived from the matches.
    # If a small user matches rows of several full users, pick the most common one.
    mapping = {}
    for uid_small, group in matches.groupby('userId_small'):
        mapping[int(uid_small)] = int(Counter(group['userId_full']).most_common(1)[0][0])

    # New userIds for small users that have no match at all.
    new_ids = {}
    if unmatched_users:
        # An empty full file has no maximum; start numbering new users at 1 then.
        max_user_id = int(full_ratings['userId'].max()) if len(full_ratings) else 0
        print(f"\nMaximum userId in full_ratings: {max_user_id}")

        new_ids = {
            int(old_uid): max_user_id + offset
            for offset, old_uid in enumerate(sorted(unmatched_users), start=1)
        }

        print(f"Assigning new IDs to {len(new_ids)} unmatched users:")
        for i, (old_id, new_id) in enumerate(new_ids.items()):
            if i < 10:
                print(f"  User {old_id} -> User {new_id}")
            elif i == 10:
                print("  ...")

    # Every small user is in exactly one of `mapping` / `new_ids`, so re-keying the
    # small rows yields one output row per small row and no missing userIds.
    final_merged = small_ratings[['movieId', 'rating', 'timestamp', 'userId']].copy()
    final_merged['userId'] = final_merged['userId'].map({**mapping, **new_ids})

    # Enforce dtypes
    final_merged = final_merged.astype({
        'movieId': 'int64',
        'rating': 'float32',
        'timestamp': 'int64',
        'userId': 'int64',
    }).reset_index(drop=True)

    print(f"\nFinal merged dataset shape: {final_merged.shape}")
    print(f"Unique users in merged dataset: {final_merged['userId'].nunique()}")
    print(f"Maximum userId in merged dataset: {final_merged['userId'].max()}")

    # Duplicate check on (userId, movieId, timestamp)
    dup_mask = final_merged.duplicated(subset=['userId', 'movieId', 'timestamp'], keep=False)
    num_dups = int(dup_mask.sum())
    if num_dups > 0:
        print(f"WARNING: Found {num_dups} duplicate (userId, movieId, timestamp) rows.")
        print(final_merged[dup_mask].head())
    else:
        print("✓ No duplicate (userId, movieId, timestamp) rows.")

    # Save
    final_merged.to_csv(output_path, index=False)
    print(f"\nMerged dataset saved to '{output_path}'")

    # Extra analysis
    print("\n" + "="*50)
    print("DETAILED ANALYSIS")
    print("="*50)

    print("\nRating distribution in small_ratings:")
    print(small_ratings['rating'].value_counts().sort_index())

    print("\nRating distribution in full_ratings:")
    print(full_ratings['rating'].value_counts().sort_index())

    print("\nRating distribution in merged dataset:")
    print(final_merged['rating'].value_counts().sort_index())

    # Movie overlap analysis: a movie counts as "found" when its movieId occurs in
    # both files, independent of whether any individual rating row matched.
    small_movies = set(small_ratings['movieId'].unique())
    full_movies = set(full_ratings['movieId'].unique())
    shared_movies = small_movies & full_movies
    shared_pct = len(shared_movies) / len(small_movies) * 100 if small_movies else 0.0

    print(f"\nMovie analysis:")
    print(f"Movies in small_ratings: {len(small_movies)}")
    print(f"Movies in full_ratings: {len(full_movies)}")
    print(f"Movies from small_ratings found in full_ratings: {len(shared_movies)}")
    print(f"Percentage of small movies found in full: {shared_pct:.2f}%")

    return final_merged, matches, unmatched_users

# Run the merge and unpack its three results for use in later cells.
(merged_data, matches, unmatched_users) = merge_ratings_files()

Loading ratings files...
Small ratings shape: (100004, 4)
Full ratings shape: (26024289, 4)

Analyzing user overlap...
Number of matching records: 99676
Percentage of small ratings found in full ratings: 99.67%

Users in small_ratings: 671
Users from small_ratings found in full_ratings: 670
Users from small_ratings NOT found in full_ratings: 1
Unmatched users: {np.int64(519)}

First 10 unmatched user IDs: [np.int64(519)]

Sample records from unmatched users:
       userId  movieId  rating   timestamp       composite_key
74759     519       50     4.0  1468758676   50_4.0_1468758676
74760     519      104     4.0  1469927080  104_4.0_1469927080
74761     519      111     4.0  1468758621  111_4.0_1468758621
74762     519      216     4.0  1469927082  216_4.0_1469927082
74763     519      223     4.5  1468759080  223_4.5_1468759080
74764     519      235     4.0  1468758651  235_4.0_1468758651
74765     519      260     5.0  1468928013  260_5.0_1468928013
74766     519      288     4.5  1