In [1]:
# Optional one-time environment setup: every line below is commented out.
# Uncomment the lines you need to install the listed packages.
# NOTE(review): `implicit` is not imported by any cell visible here — confirm
# it is still required before keeping it in this list.
#!pip install pandas
#!pip install scikit-learn
#!pip install xgboost
#!pip install implicit



In [2]:
import pandas as pd

# --- Ratings -----------------------------------------------------------
# Tab-separated; column names are supplied here rather than read from the file.
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# --- Movies ------------------------------------------------------------
# Pipe-separated and latin-1 encoded. Names are declared for every column
# (five descriptive fields followed by genre_0 .. genre_18), but `usecols`
# restricts the read to the first three: movie_id, title, release_date.
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    names=[
        'movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
        *(f'genre_{i}' for i in range(19)),
    ],
    usecols=[0, 1, 2],
)

# --- Users -------------------------------------------------------------
# Pipe-separated demographics, one row per user.
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

# Preview the first rows of each table under its own heading.
for heading, frame in (
    ("Ratings Data:", ratings),
    ("\nMovies Metadata:", movies),
    ("\nUser Demographics:", users),
):
    print(heading)
    print(frame.head())


Ratings Data:
   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596

Movies Metadata:
   movie_id              title release_date
0         1   Toy Story (1995)  01-Jan-1995
1         2   GoldenEye (1995)  01-Jan-1995
2         3  Four Rooms (1995)  01-Jan-1995
3         4  Get Shorty (1995)  01-Jan-1995
4         5     Copycat (1995)  01-Jan-1995

User Demographics:
   user_id  age gender  occupation zip_code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213


In [3]:
# Build one flat table in a single chain:
#   1. attach movie metadata to each rating (ratings.item_id == movies.movie_id)
#   2. attach the rater's demographics through the shared user_id column
# Both joins use pandas' default join type.
df = (
    ratings
    .merge(movies, left_on='item_id', right_on='movie_id')
    .merge(users, on='user_id')
)

# Preview the combined table
df.head()


Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,title,release_date,age,gender,occupation,zip_code
0,196,242,3,881250949,242,Kolya (1996),24-Jan-1997,49,M,writer,55105
1,196,257,2,881251577,257,Men in Black (1997),04-Jul-1997,49,M,writer,55105
2,196,111,4,881251793,111,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,49,M,writer,55105
3,196,25,4,881251955,25,"Birdcage, The (1996)",08-Mar-1996,49,M,writer,55105
4,196,382,4,881251843,382,"Adventures of Priscilla, Queen of the Desert, ...",01-Jan-1994,49,M,writer,55105


In [4]:
# Summarize the merged frame: per-column dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       100000 non-null  int64 
 1   item_id       100000 non-null  int64 
 2   rating        100000 non-null  int64 
 3   timestamp     100000 non-null  int64 
 4   movie_id      100000 non-null  int64 
 5   title         100000 non-null  object
 6   release_date  99991 non-null   object
 7   age           100000 non-null  int64 
 8   gender        100000 non-null  object
 9   occupation    100000 non-null  object
 10  zip_code      100000 non-null  object
dtypes: int64(6), object(5)
memory usage: 8.4+ MB


In [5]:
# Show the column labels of the merged frame.
df.columns

Index(['user_id', 'item_id', 'rating', 'timestamp', 'movie_id', 'title',
       'release_date', 'age', 'gender', 'occupation', 'zip_code'],
      dtype='object')

In [6]:
# Remove, in place, every row of df whose release_date is missing.
# The surviving rows keep their original index labels (no reset is done).
df.dropna(subset=['release_date'], inplace=True)


In [7]:
# Re-check dtypes and non-null counts now that rows lacking a release_date
# have been dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99991 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   user_id       99991 non-null  int64 
 1   item_id       99991 non-null  int64 
 2   rating        99991 non-null  int64 
 3   timestamp     99991 non-null  int64 
 4   movie_id      99991 non-null  int64 
 5   title         99991 non-null  object
 6   release_date  99991 non-null  object
 7   age           99991 non-null  int64 
 8   gender        99991 non-null  object
 9   occupation    99991 non-null  object
 10  zip_code      99991 non-null  object
dtypes: int64(6), object(5)
memory usage: 9.2+ MB


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Replace the string-valued gender and occupation columns of df with integer
# codes. Each fitted encoder is kept in its own variable.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits the encoders on the already-encoded integers.
le_gender = LabelEncoder()
df['gender'] = le_gender.fit_transform(df['gender'])

le_occupation = LabelEncoder()
df['occupation'] = le_occupation.fit_transform(df['occupation'])

# Parse release_date and derive the year/month columns that the feature matrix
# below requires. The layout (day-abbreviated month-year, e.g. "01-Jan-1995")
# is passed explicitly so parsing does not depend on pandas inferring a format
# from the data; errors='coerce' still maps any non-matching value to NaT.
df['release_date'] = pd.to_datetime(df['release_date'], format='%d-%b-%Y', errors='coerce')
df['release_year'] = df['release_date'].dt.year
df['release_month'] = df['release_date'].dt.month

# Features: user_id, movie_id, gender, occupation, age, release_year, release_month
X = df[['user_id', 'movie_id', 'gender', 'occupation', 'age', 'release_year', 'release_month']]

# Target: rating
y = df['rating']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Wrap the train and test splits in XGBoost DMatrix objects, with the ratings
# attached as labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# Booster configuration
params = dict(
    objective='reg:squarederror',  # regression with squared error loss
    max_depth=5,
    eta=0.1,
    eval_metric='rmse',
)

# Fit the booster for 100 boosting rounds
model = xgb.train(params, dtrain, num_boost_round=100)

# Predict on the held-out split and report the mean squared error
y_pred = model.predict(dtest)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error with XGBoost: {mse}")


Mean Squared Error with XGBoost: 1.009805004088964


In [10]:
import pickle
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Build the DMatrix inputs and fit the booster, then persist it with pickle.
# NOTE(review): the DMatrix construction, parameters and training call here
# repeat the previous cell exactly — consider pickling that cell's `model`
# instead of retraining.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

params = dict(
    objective='reg:squarederror',  # regression with squared error loss
    max_depth=5,
    eta=0.1,
    eval_metric='rmse',
)

model = xgb.train(params, dtrain, num_boost_round=100)

# Serialize the fitted booster to xgb_model.pkl in the working directory.
# NOTE(review): XGBoost also offers Booster.save_model for persistence —
# check its model-IO guidance before relying on pickle across versions.
with open('xgb_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

print("Model saved successfully.")


Model saved successfully.


In [11]:
import pickle

# Restore the booster from the pickle file on disk.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('xgb_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

print("Model loaded successfully.")

# Score the restored booster on the existing held-out DMatrix and report the
# mean squared error against the held-out ratings.
y_pred = model.predict(dtest)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error with loaded XGBoost model: {mse}")


Model loaded successfully.
Mean Squared Error with loaded XGBoost model: 1.009805004088964


In [12]:
# Display the feature matrix X as the cell's output.
X

Unnamed: 0,user_id,movie_id,gender,occupation,age,release_year,release_month
0,196,242,1,20,49,1997,1
1,196,257,1,20,49,1997,7
2,196,111,1,20,49,1996,4
3,196,25,1,20,49,1996,3
4,196,382,1,20,49,1994,1
...,...,...,...,...,...,...,...
99995,873,313,0,0,48,1997,1
99996,873,326,0,0,48,1997,1
99997,873,348,0,0,48,1998,1
99998,873,358,0,0,48,1997,8


In [13]:
import pickle
import pandas as pd
import xgboost as xgb

# Restore the pickled booster from disk.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('xgb_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# One-row feature frame for a single (user, movie) pair. Columns are listed in
# the same order as the training feature matrix X; gender and occupation are
# the integer codes produced by the label encoders, not the raw strings.
# NOTE(review): the displayed X shows release_month 7 for movie 257, while this
# example passes 1 — confirm which value is intended.
input_data = pd.DataFrame({
    'user_id': [196],
    'movie_id': [257],
    'gender': [1],
    'occupation': [20],
    'age': [49],
    'release_year': [1997],
    'release_month': [1],
})

# Booster.predict takes a DMatrix, so wrap the frame first.
dmatrix = xgb.DMatrix(input_data)

# predict returns one value per input row; this frame has a single row.
predicted_rating = model.predict(dmatrix)

print(f"Predicted rating: {predicted_rating[0]}")


Predicted rating: 3.5500707626342773
