In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the small links table from ../data/links_small.csv and print a
# summary of the resulting DataFrame via DataFrame.info().
links_small = pd.read_csv('../data/links_small.csv')
links_small.info()

In [None]:
# Load the full links table from ../data/links.csv and print a summary of
# the resulting DataFrame via DataFrame.info().
links = pd.read_csv('../data/links.csv')
links.info()

In [None]:

# Compare the sets of IDs found in the small and the full links tables.
#
# The comparison runs once per ID column inside a loop. Assigning
# small_idx / full_idx for one column and then straight away for another
# (without a loop) leaves only the last assignment in effect, so the
# earlier column would silently never be compared.
ID_COLUMNS = ('movieId', 'imdbId', 'tmdbId')

for id_col in ID_COLUMNS:
    # unique, non-null IDs present in each table for this column
    small_idx = pd.Index(links_small[id_col].dropna().unique())
    full_idx  = pd.Index(links[id_col].dropna().unique())

    # IDs in small but not in full
    only_in_small = small_idx.difference(full_idx)
    # IDs in full but not in small
    only_in_full  = full_idx.difference(small_idx)

    # Label the report so the two lines below can be attributed to a column;
    # only the first 20 IDs are shown to keep the output short.
    print(f"[{id_col}]")
    print(f"Only in small (count {len(only_in_small)}):", only_in_small.to_list()[:20], "...")
    print(f"Only in full  (count {len(only_in_full)}):",  only_in_full.to_list()[:20],  "...")

# If you want NumPy arrays instead. After the loop small_idx / full_idx refer
# to the last entry of ID_COLUMNS (tmdbId), so these arrays describe tmdbId.
missing_from_full  = np.setdiff1d(small_idx.values, full_idx.values)
missing_from_small = np.setdiff1d(full_idx.values, small_idx.values)




In [None]:


def _normalize_links(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # keep only the id columns that exist
    keep = [c for c in ['movieId', 'imdbId', 'tmdbId'] if c in df.columns]
    df = df[keep]

    # to numeric, allow NA; use pandas nullable Int64 (keeps NaN)
    for c in keep:
        df[c] = pd.to_numeric(df[c], errors='coerce').astype('Int64')

    # drop exact duplicate rows
    df = df.drop_duplicates()
    return df

def merge_links(links: pd.DataFrame, links_small: pd.DataFrame):
    """Outer-join the full and the small links tables on ``movieId``.

    Both inputs are first cleaned with ``_normalize_links``. The cleaned
    tables are then merged with an outer join so that every ``movieId`` from
    either side is retained.

    Parameters
    ----------
    links : pd.DataFrame
        The full links table.
    links_small : pd.DataFrame
        The small links table.

    Returns
    -------
    final : pd.DataFrame
        Columns ``movieId``, ``imdbId``, ``tmdbId`` and ``source``, sorted by
        ``movieId`` with a fresh 0..n-1 index. For each ID the value from the
        full table is used when present, otherwise the value from the small
        table. ``source`` is ``'full'``, ``'small'`` or ``'both'`` depending
        on which side(s) contributed the row.
    conflicts : pd.DataFrame
        Copy of the merged rows (including the ``*_full`` / ``*_small``
        working columns) where both tables carry an ``imdbId`` or ``tmdbId``
        and the two values differ.
    diag : dict
        Counts of unique, non-null IDs that appear in only one of the two
        cleaned tables, per ID column.
    """
    full_ids = _normalize_links(links)
    small_ids = _normalize_links(links_small)

    def _unique_ids(frame, column):
        # unique, non-null values of one column as an Index (for set operations)
        return pd.Index(frame[column].dropna().unique())

    # quick uniqueness diagnostics: how many IDs live on one side only
    diag = {}
    for column, small_key, full_key in (
        ('movieId', 'movieId_only_in_small', 'movieId_only_in_full'),
        ('imdbId', 'imdb_only_in_small', 'imdb_only_in_full'),
        ('tmdbId', 'tmdb_only_in_small', 'tmdb_only_in_full'),
    ):
        in_small = _unique_ids(small_ids, column)
        in_full = _unique_ids(full_ids, column)
        diag[small_key] = len(in_small.difference(in_full))
        diag[full_key] = len(in_full.difference(in_small))

    merged = pd.merge(
        full_ids,
        small_ids,
        how='outer',
        on='movieId',
        suffixes=('_full', '_small'),
        indicator=True,
    )

    # a conflict means: both sides have a value and the values differ
    for col in ('imdbId', 'tmdbId'):
        from_full = merged[f'{col}_full']
        from_small = merged[f'{col}_small']
        both_present = from_full.notna() & from_small.notna()
        merged[f'{col}_conflict'] = both_present & (from_full != from_small)

    # resolved columns: prefer FULL, else take SMALL
    for col in ('imdbId', 'tmdbId'):
        merged[f'{col}_resolved'] = merged[f'{col}_full'].combine_first(merged[f'{col}_small'])

    # provenance: translate the merge indicator into a table name
    provenance = {'left_only': 'full', 'right_only': 'small', 'both': 'both'}
    merged['source'] = merged['_merge'].map(provenance)

    has_conflict = merged['imdbId_conflict'] | merged['tmdbId_conflict']
    conflicts = merged[has_conflict].copy()

    # final shape: resolved IDs under their plain names, ordered by movieId
    resolved = merged[['movieId', 'imdbId_resolved', 'tmdbId_resolved', 'source']]
    renamed = resolved.rename(columns={'imdbId_resolved': 'imdbId', 'tmdbId_resolved': 'tmdbId'})
    final = renamed.sort_values('movieId').reset_index(drop=True)

    # optional: create zero-padded IMDb tt-ids for APIs (keep original int columns as-is)
    # final['imdb_tt'] = final['imdbId'].apply(lambda x: f"tt{int(x):07d}" if pd.notna(x) else pd.NA)

    # sanity checks: after merging, the union should be covered
    # assert set(final['movieId'].dropna()) == set(full_ids['movieId'].dropna()).union(set(small_ids['movieId'].dropna()))

    return final, conflicts, diag

# ===== run it =====
# Merge the full and the small links tables. merge_links returns, in order:
# the resolved table, the rows where the two tables disagree, and the
# diagnostics dict.
final_links, id_conflicts, diagnostics = merge_links(links, links_small)

print("Diagnostics:", diagnostics)
print("Final rows:", len(final_links))
print("Conflicts (rows where small vs full disagree on imdb/tmdb):", len(id_conflicts))
# Preview at most ten conflicting rows with both sides' IDs side by side.
print(id_conflicts[['movieId','imdbId_full','imdbId_small','tmdbId_full','tmdbId_small']].head(10))

# Persist the merged table. This write is unconditional and uses a path
# relative to the current working directory.
final_links.to_csv("links_merged.csv", index=False)

# Read the file back and summarise it to check what was written.
# NOTE(review): read_csv re-infers dtypes, so ID columns that contain missing
# values may come back as float64 rather than Int64 — confirm that is acceptable.
links_merged = pd.read_csv("links_merged.csv")
links_merged.info()
# Optional: also export the conflicting rows for manual review.
# id_conflicts.to_csv("links_id_conflicts_for_review.csv", index=False)
