# Handle missing data in Links.csv

In [1]:
import pandas as pd

df = pd.read_csv('../../datasets/input/links.csv')
data = df.copy()


In [5]:
print("Columns with missing data: \n")
missing_data = []

for col in data.columns:
    missing_count = data[col].isnull().sum()
    if missing_count > 0:
        missing_pct = (missing_count / len(data)) * 100
        missing_data.append({
            'Column': col,
            'Missing Count': missing_count,
            'Missing %': f'{missing_pct:.2f}%'
        })

        print(f"{col:30} | {missing_count:6} missing ({missing_pct:5.2f}%)")

print(f"Total columns: {len(data.columns)}")
print(f"Columns with missing data: {len(missing_data)}")
print(f"Columns without missing data: {len(missing_data)-len(missing_data)}")

Columns with missing data: 

Total columns: 3
Columns with missing data: 0
Columns without missing data: 0


# Step 1: Drop rows with missing tmdbId

In [6]:
# Drop rows with missing tmdbId (cannot be used without this field) because there is only 0.48% low percentage and no bias after deletion
rows_before = len(data)

data = data.dropna(subset=['tmdbId'])

rows_dropped = rows_before - len(data)
print(f"Rows dropped due to missing tmdbId: {rows_dropped}")
print(f"Remaining rows: {len(data)}")
print(f"Percentage retained: {(len(data)/rows_before)*100:.2f}%")

Rows dropped due to missing tmdbId: 0
Remaining rows: 45624
Percentage retained: 100.00%


# Step 2: Convert data types

In [9]:
# Convert tmdbId to integer (for proper merging later)
data['tmdbId'] = data['tmdbId'].astype(int)

print(f"✓ Data types converted")
print(f"\nCurrent data types:")
print(data.dtypes)

print(data.head(10))

✓ Data types converted

Current data types:
movieId    int64
imdbId     int64
tmdbId     int64
dtype: object
   movieId  imdbId  tmdbId
0        1  114709     862
1        2  113497    8844
2        3  113228   15602
3        4  114885   31357
4        5  113041   11862
5        6  113277     949
6        7  114319   11860
7        8  112302   45325
8        9  114576    9091
9       10  113189     710


# Step 3: Verify tmdbId

In [10]:
rows_before_validation = len(data)
valid_tmdb_ids = set(data['tmdbId'].unique())

rows_dropped = rows_before_validation - len(data)
print(f"Rows dropped (tmdbId not in movies_metadata): {rows_dropped}")
print(f"Remaining rows: {len(data)}")

Rows dropped (tmdbId not in movies_metadata): 0
Remaining rows: 45624


# Step 4: Save cleaned links dataset

In [12]:
output_path = '../../datasets/output/cleaned_datasets/cleaned_links.csv'
data.to_csv(output_path, index=False)