In [1]:
import pandas as pd

# Load the ratings data

ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

# Load the movies metadata
movies = pd.read_csv(
    'ml-100k/u.item', sep='|', encoding='latin-1',
    names=['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL'] + [f'genre_{i}' for i in range(19)],
    usecols=[0, 1, 2]  # Load only necessary columns
)

# Load user demographics
users = pd.read_csv('ml-100k/u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

# Display the first few rows of each dataset
print("Ratings Data:")
print(ratings.head())

print("\nMovies Metadata:")
print(movies.head())

print("\nUser Demographics:")
print(users.head())


Ratings Data:
   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596

Movies Metadata:
   movie_id              title release_date
0         1   Toy Story (1995)  01-Jan-1995
1         2   GoldenEye (1995)  01-Jan-1995
2         3  Four Rooms (1995)  01-Jan-1995
3         4  Get Shorty (1995)  01-Jan-1995
4         5     Copycat (1995)  01-Jan-1995

User Demographics:
   user_id  age gender  occupation zip_code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213


In [2]:
# Merge ratings with movies
df = pd.merge(ratings, movies, left_on='item_id', right_on='movie_id')

# Merge with user demographics
df = pd.merge(df, users, on='user_id')

# Display the first few rows
df.head()


Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,release_date,age,gender,occupation,zip_code
0,196,242,3,881250949,242,Kolya (1996),24-Jan-1997,49,M,writer,55105
1,186,302,3,891717742,302,L.A. Confidential (1997),01-Jan-1997,39,F,executive,0
2,22,377,1,878887116,377,Heavyweights (1994),01-Jan-1994,25,M,writer,40206
3,244,51,2,880606923,51,Legends of the Fall (1994),01-Jan-1994,28,M,technician,80525
4,166,346,1,886397596,346,Jackie Brown (1997),01-Jan-1997,47,M,educator,55113


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       100000 non-null  int64 
 1   item_id       100000 non-null  int64 
 2   rating        100000 non-null  int64 
 3   timestamp     100000 non-null  int64 
 4   movie_id      100000 non-null  int64 
 5   title         100000 non-null  object
 6   release_date  99991 non-null   object
 7   age           100000 non-null  int64 
 8   gender        100000 non-null  object
 9   occupation    100000 non-null  object
 10  zip_code      100000 non-null  object
dtypes: int64(6), object(5)
memory usage: 8.4+ MB


In [4]:
df.columns

Index(['user_id', 'item_id', 'rating', 'timestamp', 'movie_id', 'title',
       'release_date', 'age', 'gender', 'occupation', 'zip_code'],
      dtype='object')

In [5]:
df.dropna(subset=['release_date'], inplace=True)


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99991 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   user_id       99991 non-null  int64 
 1   item_id       99991 non-null  int64 
 2   rating        99991 non-null  int64 
 3   timestamp     99991 non-null  int64 
 4   movie_id      99991 non-null  int64 
 5   title         99991 non-null  object
 6   release_date  99991 non-null  object
 7   age           99991 non-null  int64 
 8   gender        99991 non-null  object
 9   occupation    99991 non-null  object
 10  zip_code      99991 non-null  object
dtypes: int64(6), object(5)
memory usage: 9.2+ MB


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode categorical variables
le_gender = LabelEncoder()
df['gender'] = le_gender.fit_transform(df['gender'])

le_occupation = LabelEncoder()
df['occupation'] = le_occupation.fit_transform(df['occupation'])

# Optional: Convert release_date to datetime and extract features like year, month
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year
df['release_month'] = df['release_date'].dt.month

# Features: user_id, movie_id, gender, occupation, age, release_year, release_month
X = df[['user_id', 'movie_id', 'gender', 'occupation', 'age', 'release_year', 'release_month']]

# Target: rating
y = df['rating']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Train a simple regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 1.1826583604158254


In [9]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# KNN Regressor with the number of neighbors set to 5
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict and evaluate
y_pred = knn.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error with KNN: {mse}")


Mean Squared Error with KNN: 1.2216750837541879


In [8]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set parameters for XGBoost
params = {
    'objective': 'reg:squarederror',  # regression with squared error loss
    'max_depth': 5,
    'eta': 0.1,
    'eval_metric': 'rmse',
}

# Train the model
model = xgb.train(params, dtrain, num_boost_round=100)

# Predict and evaluate
y_pred = model.predict(dtest)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error with XGBoost: {mse}")


Mean Squared Error with XGBoost: 1.0168109092361401


In [11]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# Initialize the MLP regressor with 1 hidden layer of 50 units
mlp = MLPRegressor(hidden_layer_sizes=(50,), max_iter=1000)

# Train the model
mlp.fit(X_train, y_train)

# Predict and evaluate
y_pred = mlp.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error with MLPRegressor: {mse}")


Mean Squared Error with MLPRegressor: 1.8811736017746454


In [12]:
import implicit
import scipy.sparse as sp
import numpy as np
from sklearn.metrics import mean_squared_error

# Create a sparse matrix from the data
interaction_matrix = sp.coo_matrix((df['rating'], (df['user_id'], df['movie_id'])))

# Initialize ALS model
model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.1, iterations=30)

# Train the model
model.fit(interaction_matrix)

# Prepare predicted ratings
user_item_pred = []

# Predict ratings for each user-item pair
for user, item in zip(df['user_id'], df['movie_id']):
    # Get user and item latent vectors
    user_vector = model.user_factors[user]
    item_vector = model.item_factors[item]
    
    # Calculate predicted rating (dot product of user and item vectors)
    predicted_rating = np.dot(user_vector, item_vector)
    user_item_pred.append(predicted_rating)

# Convert predictions to numpy array for evaluation
user_item_pred = np.array(user_item_pred)

# Calculate MSE
mse = mean_squared_error(df['rating'], user_item_pred)
print(f"Mean Squared Error with ALS: {mse}")


  check_blas_config()


  0%|          | 0/30 [00:00<?, ?it/s]

Mean Squared Error with ALS: 8.833043994760914


Collecting implicit
  Obtaining dependency information for implicit from https://files.pythonhosted.org/packages/7c/25/48964efed207b60b2d5b2855161638e4f368f5db332b57f62b6cd16fb591/implicit-0.7.2-cp311-cp311-win_amd64.whl.metadata
  Downloading implicit-0.7.2-cp311-cp311-win_amd64.whl.metadata (6.3 kB)
Downloading implicit-0.7.2-cp311-cp311-win_amd64.whl (750 kB)
   ---------------------------------------- 0.0/750.8 kB ? eta -:--:--
    --------------------------------------- 10.2/750.8 kB ? eta -:--:--
   - ------------------------------------- 30.7/750.8 kB 330.3 kB/s eta 0:00:03
   ---- ---------------------------------- 92.2/750.8 kB 751.6 kB/s eta 0:00:01
   ------------ --------------------------- 225.3/750.8 kB 1.3 MB/s eta 0:00:01
   ------------------------- -------------- 471.0/750.8 kB 2.1 MB/s eta 0:00:01
   ---------------------------------------  747.5/750.8 kB 2.9 MB/s eta 0:00:01
   ---------------------------------------- 750.8/750.8 kB 2.4 MB/s eta 0:00:00
Installing c

In [10]:
import pickle
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Train your XGBoost model as usual
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'reg:squarederror',  # regression with squared error loss
    'max_depth': 5,
    'eta': 0.1,
    'eval_metric': 'rmse',
}

model = xgb.train(params, dtrain, num_boost_round=100)

# Save the trained model using Pickle
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Model saved successfully.")


Model saved successfully.


In [12]:
import pickle

# Load the model from the saved file
with open('xgb_model.pkl', 'rb') as f:
    model = pickle.load(f)

print("Model loaded successfully.")

# Make predictions with the loaded model
y_pred = model.predict(dtest)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error with loaded XGBoost model: {mse}")


Model loaded successfully.
Mean Squared Error with loaded XGBoost model: 1.0168109092361401


In [15]:
# Install the dependencies
!pip install --no-cache-dir -r requirements.txt


ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'
