# Handle missing data for ratings

In [None]:
import pandas as pd
# Load the raw ratings file. `df` keeps the frame exactly as read from disk;
# `data` is an independent copy that the following cells work on.
df = pd.read_csv(
    filepath_or_buffer='../../datasets/input/ratings.csv',
    low_memory=False,
)
data = df.copy(deep=True)

# Rating analysis

In [None]:
# Report missing values per column.
# For every column with at least one null this prints the null count and the
# percentage of rows affected, and records the same facts in `missing_data`
# (a list of dicts keyed 'Column', 'Missing count', 'Missing percentage').
print("Columns with missing data: \n")
missing_data = []

# One vectorised pass over the frame instead of a separate scan per column.
null_counts = data.isnull().sum()
total_rows = len(data)

for column, missing_count in null_counts[null_counts > 0].items():
    # total_rows is never 0 here: a column can only contain a null if the
    # frame has at least one row.
    missing_pct = (missing_count / total_rows) * 100
    missing_data.append({
        'Column': column,
        'Missing count': missing_count,
        'Missing percentage': f"{missing_pct:.2f}%",
    })
    print(f"{column:30} | {missing_count} missing {missing_pct:5.2f}%")

print(f"Total columns: {len(data.columns)}")
# len(missing_data) is the number of affected columns, not the number of
# missing cells, so the label says exactly that.
print(f"Columns with missing data: {len(missing_data)}")
print(f"Columns without missing data: {len(data.columns) - len(missing_data)}")

# Step 1: Check for duplicate ratings (same user-movie pair)

In [None]:
# Look for (userId, movieId) combinations that were rated more than once.
# This cell only reports; it does not modify `data`.
pair_keys = ['userId', 'movieId']

# Number of ratings per distinct pair.
# NOTE: pandas groupby drops rows whose key is NaN by default, so such rows
# are not counted here.
duplicates = data.groupby(pair_keys).size()
duplicate_pairs = duplicates[duplicates > 1]

n_pairs = len(duplicates)
n_duplicate_pairs = len(duplicate_pairs)

print(f"Total ratings: {len(data)}")
print(f"Unique user-movie pairs: {n_pairs}")
print(f"Duplicate user-movie pairs: {n_duplicate_pairs}")

if n_duplicate_pairs == 0:
    print("\nNo duplicate ratings found.")
else:
    # Share of distinct pairs that occur more than once (relative to the
    # number of distinct pairs, not to the number of rating rows).
    duplicate_percentage = (n_duplicate_pairs / n_pairs) * 100
    print(f"Percentage of duplicates: {duplicate_percentage:.2f}%")

    # Show the underlying rows for the first few duplicated pairs.
    print("\nExamples of duplicate ratings:")
    shown_columns = ['userId', 'movieId', 'rating', 'timestamp']
    sample_duplicate = duplicate_pairs.head(3)
    for (user_id, movie_id), times_rated in sample_duplicate.items():
        print(f"\nUser {user_id}, Movie {movie_id} - rated {times_rated} times:")
        same_pair = (data['userId'] == user_id) & (data['movieId'] == movie_id)
        print(data.loc[same_pair, shown_columns])

# Step 2: Save cleaned ratings dataset

In [None]:
# Write the ratings frame to the cleaned-datasets folder, without the index.
# NOTE(review): the cells shown above only inspect `data` (missing values,
# duplicate pairs); none of them modifies it, so the file written here has
# the same rows as the input unless cleaning is added before this cell.
from pathlib import Path

output_path = '../../datasets/output/cleaned_datasets/cleaned_ratings.csv'

# to_csv does not create missing directories; make sure the target folder
# exists so the save does not fail on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)
