# 📄 Prepare Netflix Prize Data
This notebook processes the raw Netflix Prize competition data files and creates cleaned datasets for recommendation modeling.

In [1]:
import pandas as pd
import os
# Relative directory that the cells below join file names onto, both for
# reading the raw combined_data_*.txt files and for writing the output CSVs.
data_folder = "data/"

## Step 1: Process Netflix Prize Raw Files (combined_data_*.txt)

In [5]:
# Raw rating files, looked up inside `data_folder`.
raw_files = ['combined_data_1.txt', 'combined_data_2.txt', 'combined_data_3.txt', 'combined_data_4.txt']


def _iter_ratings(lines):
    """Yield ``(customer_id, movie_id, rating, date)`` tuples from raw lines.

    A line ending in ``:`` (for example ``"1:"``) starts a new movie section;
    each following ``customer_id,rating,date`` line belongs to that movie.

    Args:
        lines: Iterable of text lines (an open file works).

    Yields:
        Tuples ``(int, int, int, str)``; the date is kept as the raw string.

    Raises:
        ValueError: If a rating line appears before any movie header, or a
            line does not have exactly three comma-separated fields.
    """
    movie_id = None
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            # Tolerate blank lines instead of failing on the 3-way unpack.
            continue
        if line.endswith(':'):
            movie_id = int(line[:-1])
            continue
        if movie_id is None:
            raise ValueError(
                "Rating line found before any movie header: {!r}".format(line)
            )
        customer_id, rating, date = line.split(',')
        yield int(customer_id), movie_id, int(rating), date


# Collect the ratings of every raw file into one list of tuples.
all_ratings = []
for file_name in raw_files:
    file_path = os.path.join(data_folder, file_name)
    with open(file_path, 'r') as file:
        # The current movie id is tracked per file: each call starts fresh.
        all_ratings.extend(_iter_ratings(file))

# Create DataFrame
ratings_df = pd.DataFrame(all_ratings, columns=['Cust_Id', 'Movie_Id', 'Rating', 'Date'])
print("Ratings dataset shape:", ratings_df.shape)

# Save as CSV
ratings_df.to_csv(os.path.join(data_folder, 'Netflix-Prize-Dataset.csv'), index=False)
print("Saved Netflix-Prize-Dataset.csv")


Ratings dataset shape: (100480507, 4)
Saved Netflix-Prize-Dataset.csv


## Step 2: Filter Ratings of 4 and Above

In [7]:
# Keep only the rows whose rating is 4 or higher.
above_4_df = ratings_df.loc[ratings_df['Rating'].ge(4)]

# Write the filtered ratings into the data folder.
above_4_path = os.path.join(data_folder, 'Above-4-Rating.csv')
above_4_df.to_csv(above_4_path, index=False)
print("Saved Above-4-Rating.csv")


Saved Above-4-Rating.csv


## Step 3: Merge Ratings with Movie Titles to Create Full Data

In [22]:
import pandas as pd
# Build the path from `data_folder`, like the other cells of this notebook.
file_path = os.path.join(data_folder, 'movie_titles.csv')

# Cleaned movie records, one [Movie_Id, Year_Release, Name] list per movie.
movies = []
# Non-blank lines that could not be parsed; counted so drops are visible.
skipped_lines = 0

# The file is parsed by hand rather than with a CSV reader: only the first
# two commas are treated as separators, so commas inside a title survive.
with open(file_path, encoding='latin1') as f:
    for line in f:
        stripped = line.strip()
        # Split into at most 3 parts: Movie_Id, Year_Release, Name
        parts = [part.strip() for part in stripped.split(',', 2)]
        if len(parts) == 3 and parts[0].isdigit() and parts[1].isdigit():
            movie_id, year, name = parts
            movies.append([int(movie_id), int(year), name])
        elif stripped:
            # NOTE(review): a movie dropped here gets no Year_Release/Name
            # when ratings are left-merged on Movie_Id — confirm dropping
            # records with a non-numeric id or year is intended.
            skipped_lines += 1

movies_df = pd.DataFrame(movies, columns=['Movie_Id', 'Year_Release', 'Name'])

if skipped_lines:
    print("Skipped {} unparseable line(s) in movie_titles.csv".format(skipped_lines))

In [17]:
# Optional: if `ratings_df` is no longer in memory, reload it from the CSV saved in Step 1:
# ratings_df = pd.read_csv('data/Netflix-Prize-Dataset.csv')

In [24]:
# Left join on Movie_Id: every rating row is kept, and the title columns
# are filled in wherever movies_df has a matching movie.
full_data = pd.merge(ratings_df, movies_df, how='left', on='Movie_Id')

full_data_path = os.path.join(data_folder, 'Full_Data.csv')
full_data.to_csv(full_data_path, index=False)
print("Saved Full_Data.csv")

Saved Full_Data.csv
