In [None]:
import os
import pandas as pd
import kagglehub
import zipfile

# Step 1: Fetch the Netflix Prize dataset through kagglehub.
# dataset_download returns the local directory that holds the files.
dataset_handle = "netflix-inc/netflix-prize-data"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

# Step 2: List the directory contents to confirm what was downloaded.
files = os.listdir(path)
print("Files in the dataset:", files)

Path to dataset files: /root/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2
Files in the dataset: ['movie_titles.csv', 'combined_data_3.txt', 'combined_data_2.txt', 'combined_data_1.txt', 'combined_data_4.txt', 'probe.txt', 'qualifying.txt', 'README']


In [None]:
import pandas as pd

# Rating files to combine. Only parts 1 and 2 of the dataset's combined_data
# files are listed here.
# NOTE(review): the download cell reports a kagglehub cache directory rather
# than /content -- confirm the files have been copied to /content before running.
file_paths = [f"/content/combined_data_{part}.txt" for part in (1, 2)]

def process_single_file(file_path):
    """
    Parses a single combined_data file and returns its ratings as a DataFrame.

    The file is read as a sequence of blocks: a line ending in ':' (for
    example ``1:``) sets the current movie id, and every following
    ``CustomerID,Rating,Date`` line is recorded against that movie.
    Blank or whitespace-only lines are skipped.

    Args:
        file_path (str): Path to the combined_data file.

    Returns:
        pd.DataFrame: One row per rating with columns
            ['MovieID', 'CustomerID', 'Rating', 'Date']. MovieID, CustomerID
            and Rating are parsed as integers; Date is kept as the raw string.

    Raises:
        ValueError: If a rating line appears before any movie header, if a
            rating line does not have exactly three comma-separated fields,
            or if an id/rating field is not an integer.
    """
    columns = ['MovieID', 'CustomerID', 'Rating', 'Date']
    # One flat list per column is far lighter than one small list per rating.
    movie_ids, customer_ids, ratings, dates = [], [], [], []
    current_movie_id = None

    # The encoding is pinned so parsing does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()  # Remove the newline and surrounding whitespace
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue

            if line.endswith(':'):
                # Movie ID line: drop the ':' and remember the id for later rows
                current_movie_id = int(line[:-1])
                continue

            if current_movie_id is None:
                # Without a header the row would silently get a missing MovieID
                raise ValueError(
                    f"{file_path}:{line_number}: rating line found before any movie header: {line!r}"
                )

            fields = line.split(',')
            if len(fields) != 3:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'CustomerID,Rating,Date', got {line!r}"
                )
            customer_id, rating, date = fields

            movie_ids.append(current_movie_id)
            customer_ids.append(int(customer_id))
            ratings.append(int(rating))
            dates.append(date)

    # Build the DataFrame once, after the whole file has been read
    df = pd.DataFrame(
        {
            'MovieID': movie_ids,
            'CustomerID': customer_ids,
            'Rating': ratings,
            'Date': dates,
        },
        columns=columns,
    )
    return df

def combine_files(file_paths):
    """
    Loads several combined_data files and stacks them into one DataFrame.

    Each path is announced on stdout and parsed with process_single_file,
    in the order given. The per-file frames are then concatenated and the
    row index is renumbered from zero.

    Args:
        file_paths (list): List of file paths to combine.

    Returns:
        pd.DataFrame: A single combined DataFrame.
    """
    def _load(single_path):
        # Report progress before the (potentially slow) parse of each file
        print(f"Processing file: {single_path}")
        return process_single_file(single_path)

    frames = [_load(single_path) for single_path in file_paths]

    # ignore_index=True discards the per-file indexes in favour of a fresh one
    return pd.concat(frames, ignore_index=True)

# Combine every rating file listed in file_paths
combined_data = combine_files(file_paths)

# Display basic information about the combined data.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Combined Data Overview:")
combined_data.info()
print(combined_data.head())

# Save the combined data to a CSV file (index=False keeps the row index out)
output_file = "/content/combined_data_all.csv"  # Adjust path as needed
combined_data.to_csv(output_file, index=False)
print(f"Combined data saved to {output_file}.")


Processing file: /content/combined_data_1.txt
Processing file: /content/combined_data_2.txt
Combined Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51031355 entries, 0 to 51031354
Data columns (total 4 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   MovieID     int64 
 1   CustomerID  int64 
 2   Rating      int64 
 3   Date        object
dtypes: int64(3), object(1)
memory usage: 1.5+ GB
None
   MovieID  CustomerID  Rating        Date
0        1     1488844       3  2005-09-06
1        1      822109       5  2005-05-13
2        1      885013       4  2005-10-19
3        1       30878       4  2005-12-26
4        1      823519       3  2004-05-03
Combined data saved to /content/combined_data_all.csv.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Reload the combined ratings from the CSV saved earlier in the notebook
# Combined data has columns: ['MovieID', 'CustomerID', 'Rating', 'Date']

import pandas as pd

# Path to the combined ratings CSV.
# NOTE(review): the save step writes to "/content/combined_data_all.csv"; this
# relative path only finds that file when the working directory is /content.
file_path = "combined_data_all.csv"
# header=0 discards the header row stored in the file; `names` supplies the labels.
combined_data = pd.read_csv(file_path, names=["MovieID", "CustomerID", "Rating", "Date"], header=0)

# Step 1: Preprocessing
# 1.1 Normalize Ratings
# NOTE(review): the mean is taken over every rating, before filtering and
# splitting, so validation/test rows contribute to it.
global_mean = combined_data['Rating'].mean()
combined_data['NormalizedRating'] = combined_data['Rating'] - global_mean

# 1.2 Filter Users and Movies
# Keep users with at least 10 ratings and movies with at least 5 ratings.
user_counts = combined_data['CustomerID'].value_counts()
movie_counts = combined_data['MovieID'].value_counts()
keep_rows = (
    combined_data['CustomerID'].isin(user_counts[user_counts >= 10].index) &
    combined_data['MovieID'].isin(movie_counts[movie_counts >= 5].index)
)

# 1.3 Transform Dates
# Build a new frame with drop/assign instead of assigning columns onto the
# filtered slice; that is what triggered SettingWithCopyWarning before.
rating_dates = pd.to_datetime(combined_data.loc[keep_rows, 'Date'])
filtered_data = (
    combined_data.loc[keep_rows]
    .drop(columns=['Date'])
    .assign(DaysSinceFirstRating=(rating_dates - rating_dates.min()).dt.days)
)

# Step 2: Splitting the Data
# Training: 80%, Validation: 10%, Testing: 10%
train_data, temp_data = train_test_split(filtered_data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Step 3: Matrix Conversion
# User and Movie Encoding
# The id -> code mapping is defined once, from the training split, and reused
# for every split. Encoding each split separately would give the same
# customer/movie a different code in train, validation and test.
user_dtype = pd.CategoricalDtype(np.sort(train_data['CustomerID'].unique()))
movie_dtype = pd.CategoricalDtype(np.sort(train_data['MovieID'].unique()))


def encode_ids(split):
    """Return a copy of `split` with UserEncoded/MovieEncoded columns added.

    Codes come from the training-split categories; ids that do not occur in
    the training split are encoded as -1.
    """
    return split.assign(
        UserEncoded=split['CustomerID'].astype(user_dtype).cat.codes,
        MovieEncoded=split['MovieID'].astype(movie_dtype).cat.codes,
    )


train_data = encode_ids(train_data)
val_data = encode_ids(val_data)
test_data = encode_ids(test_data)

# Step 4: Define Features and Targets
feature_columns = ['UserEncoded', 'MovieEncoded', 'DaysSinceFirstRating']
X_train = train_data[feature_columns]
y_train = train_data['NormalizedRating']
X_val = val_data[feature_columns]
y_val = val_data['NormalizedRating']
X_test = test_data[feature_columns]
y_test = test_data['NormalizedRating']

# Step 5: Regression Model Training
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Step 6: Validation
y_val_pred = regressor.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = np.sqrt(val_mse)
print("Validation RMSE:", val_rmse)

# Step 7: Final Evaluation
y_test_pred = regressor.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
print("Test RMSE:", test_rmse)

# Step 8: Predicting Ratings
example_data = pd.DataFrame({
    'UserEncoded': [0, 1],  # Replace with actual user encoding
    'MovieEncoded': [0, 2],  # Replace with actual movie encoding
    'DaysSinceFirstRating': [5000, 6000]  # Replace with actual days
})
predicted_ratings = regressor.predict(example_data) + global_mean  # Add global mean to revert normalization
print("Predicted Ratings for Example Data:")
print(predicted_ratings)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Date'] = pd.to_datetime(filtered_data['Date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['DaysSinceFirstRating'] = (filtered_data['Date'] - filtered_data['Date'].min()).dt.days
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.drop(columns=['Date'], inplace=True)


Validation RMSE: 1.0797547607087052
Test RMSE: 1.080299223916577
Predicted Ratings for Example Data:
[4.47356234 4.74571588]


# New Section