In [1]:

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from torch.amp import autocast, GradScaler



In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
import pandas as pd

# Load ratings data
columns = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(r"E:\AISD\Applied_Project\Dataset\ml-100k\u.data", sep="\t", names=columns)
ratings.drop(columns=["timestamp"], inplace=True)  # Drop timestamp if not needed

In [4]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [5]:
# Define movie columns
movie_columns = ["movie_id", "title", "release_date", "video_release_date", "IMDb_URL"] + \
                ["unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
                 "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
                 "Romance", "Sci-Fi", "Thriller", "War", "Western"]

# Load movies
movies = pd.read_csv(r"E:\AISD\Applied_Project\Dataset\ml-100k\u.item", sep="|", names=movie_columns, encoding="latin-1")
movies.drop(columns=["video_release_date", "IMDb_URL"], inplace=True)  # Drop unused columns


In [6]:
movies.head()

Unnamed: 0,movie_id,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [7]:
# Define user columns
user_columns = ["user_id", "age", "gender", "occupation", "zip_code"]

# Load users
users = pd.read_csv(r"E:\AISD\Applied_Project\Dataset\ml-100k\u.user", sep="|", names=user_columns)
users.drop(columns=["zip_code","occupation"], inplace=True)  # Drop zip code if not needed


In [8]:
users.head()

Unnamed: 0,user_id,age,gender
0,1,24,M
1,2,53,F
2,3,23,M
3,4,24,M
4,5,33,F


In [9]:
# Merge ratings with users and movies
data = ratings.merge(users, on="user_id").merge(movies, on="movie_id")


In [10]:
data.head()

Unnamed: 0,user_id,movie_id,rating,age,gender,title,release_date,unknown,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,49,M,Kolya (1996),24-Jan-1997,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,39,F,L.A. Confidential (1997),01-Jan-1997,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,25,M,Heavyweights (1994),01-Jan-1994,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,28,M,Legends of the Fall (1994),01-Jan-1994,0,0,0,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,47,M,Jackie Brown (1997),01-Jan-1997,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
data

Unnamed: 0,user_id,movie_id,rating,age,gender,title,release_date,unknown,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,49,M,Kolya (1996),24-Jan-1997,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,39,F,L.A. Confidential (1997),01-Jan-1997,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,25,M,Heavyweights (1994),01-Jan-1994,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,28,M,Legends of the Fall (1994),01-Jan-1994,0,0,0,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,47,M,Jackie Brown (1997),01-Jan-1997,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,880,476,3,13,M,"First Wives Club, The (1996)",14-Sep-1996,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,716,204,5,36,F,Back to the Future (1985),01-Jan-1985,0,0,0,...,0,0,0,0,0,0,1,0,0,0
99997,276,1090,1,21,M,Sliver (1993),01-Jan-1993,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99998,13,225,2,47,M,101 Dalmatians (1996),27-Nov-1996,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
data['release_date'] = pd.to_datetime(data['release_date'], format='%d-%b-%Y')

# Extract date components
data['year'] = data['release_date'].dt.year
data['month'] = data['release_date'].dt.month
data['day'] = data['release_date'].dt.day
data['weekday'] = data['release_date'].dt.weekday  # 0=Monday, 6=Sunday



In [13]:
encoder = OneHotEncoder(sparse_output = False)
encoded_feature = encoder.fit_transform(data[['gender']])

encoded_df = pd.DataFrame(encoded_feature, columns=encoder.get_feature_names_out(['gender']))

In [14]:
data.columns

Index(['user_id', 'movie_id', 'rating', 'age', 'gender', 'title',
       'release_date', 'unknown', 'Action', 'Adventure', 'Animation',
       'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Thriller', 'War', 'Western', 'year', 'month', 'day', 'weekday'],
      dtype='object')

In [15]:
data = pd.concat([data, encoded_df], axis=1)

In [16]:
data.head()

Unnamed: 0,user_id,movie_id,rating,age,gender,title,release_date,unknown,Action,Adventure,...,Sci-Fi,Thriller,War,Western,year,month,day,weekday,gender_F,gender_M
0,196,242,3,49,M,Kolya (1996),1997-01-24,0,0,0,...,0,0,0,0,1997.0,1.0,24.0,4.0,0.0,1.0
1,186,302,3,39,F,L.A. Confidential (1997),1997-01-01,0,0,0,...,0,1,0,0,1997.0,1.0,1.0,2.0,1.0,0.0
2,22,377,1,25,M,Heavyweights (1994),1994-01-01,0,0,0,...,0,0,0,0,1994.0,1.0,1.0,5.0,0.0,1.0
3,244,51,2,28,M,Legends of the Fall (1994),1994-01-01,0,0,0,...,0,0,1,1,1994.0,1.0,1.0,5.0,0.0,1.0
4,166,346,1,47,M,Jackie Brown (1997),1997-01-01,0,0,0,...,0,0,0,0,1997.0,1.0,1.0,2.0,0.0,1.0


In [17]:
data.columns

Index(['user_id', 'movie_id', 'rating', 'age', 'gender', 'title',
       'release_date', 'unknown', 'Action', 'Adventure', 'Animation',
       'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Thriller', 'War', 'Western', 'year', 'month', 'day', 'weekday',
       'gender_F', 'gender_M'],
      dtype='object')

In [18]:
len(data['movie_id'].unique())

1682

In [19]:
len(data['rating'].unique())

5

In [20]:
X = data.drop(columns=["rating","release_date","title","gender"])
y = data["rating"]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [22]:
print(X_train.dtypes)
print(X_test.dtypes)
print(y_train.dtypes)
print(y_test.dtypes)

user_id          int64
movie_id         int64
age              int64
unknown          int64
Action           int64
Adventure        int64
Animation        int64
Children's       int64
Comedy           int64
Crime            int64
Documentary      int64
Drama            int64
Fantasy          int64
Film-Noir        int64
Horror           int64
Musical          int64
Mystery          int64
Romance          int64
Sci-Fi           int64
Thriller         int64
War              int64
Western          int64
year           float64
month          float64
day            float64
weekday        float64
gender_F       float64
gender_M       float64
dtype: object
user_id          int64
movie_id         int64
age              int64
unknown          int64
Action           int64
Adventure        int64
Animation        int64
Children's       int64
Comedy           int64
Crime            int64
Documentary      int64
Drama            int64
Fantasy          int64
Film-Noir        int64
Horror           int

In [23]:
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Convert to PyTorch tensors explicitly if needed
X_train_tensor = torch.tensor(np.array(X_train), dtype=torch.float32)
y_train_tensor = torch.tensor(np.array(y_train), dtype=torch.float32)

X_test_tensor = torch.tensor(np.array(X_test), dtype=torch.float32)
y_test_tensor = torch.tensor(np.array(y_test), dtype=torch.float32)



In [24]:
# Step 3: Define Neural Network Model
class PointWiseRankNet(nn.Module):
    def __init__(self, input_dim):
        super(PointWiseRankNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # Output single predicted rating
        )

    def forward(self, x):
        return self.model(x)

In [25]:
# Initialize model
model = PointWiseRankNet(input_dim=X_train.shape[1])

In [26]:
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.isnan(X_train_tensor).any())  # Check if any values are NaN
print(torch.isinf(X_train_tensor).any())  # Check if any values are Inf

tensor(True)
tensor(False)


In [27]:
# Step 4: Train the Model
criterion = nn.MSELoss()  # Regression loss
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [28]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# import torch
# torch.cuda.empty_cache()  # If you're using a GPU

In [29]:
# Check if there are NaN or Inf values in the dataset
print("Checking X_train_tensor:")
print(torch.isnan(X_train_tensor).sum(), "NaNs found")
print(torch.isinf(X_train_tensor).sum(), "Infs found")

print("\nChecking y_train_tensor:")
print(torch.isnan(y_train_tensor).sum(), "NaNs found")
print(torch.isinf(y_train_tensor).sum(), "Infs found")


Checking X_train_tensor:
tensor(24) NaNs found
tensor(0) Infs found

Checking y_train_tensor:
tensor(0) NaNs found
tensor(0) Infs found


In [30]:
X_train_tensor = torch.nan_to_num(X_train_tensor, nan=0.0, posinf=1.0, neginf=-1.0)
y_train_tensor = torch.nan_to_num(y_train_tensor, nan=0.0, posinf=1.0, neginf=-1.0)


In [31]:
# if torch.cuda.is_available():
#     print(f"CUDA is available! GPU is working.")
#     print(f"Using GPU: {torch.cuda.get_device_name(0)}")
# else:
#     print("CUDA is not available. GPU is not working.")

In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import autocast, GradScaler

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Ensure data is on GPU and reshaped correctly
X_train_tensor = X_train_tensor.to(device, dtype=torch.float32)
y_train_tensor = y_train_tensor.to(device, dtype=torch.float32).view(-1, 1)  # Match model output shape

model = model.to(device)
criterion = criterion.to(device)

scaler = GradScaler(device='cuda')  # Updated syntax for GradScaler

epochs = 500
for epoch in range(epochs):
    optimizer.zero_grad()

    with autocast(device_type='cuda', dtype=torch.float16):  # Updated syntax for autocast
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)

    # Handle NaN loss issue
    if torch.isnan(loss) or torch.isinf(loss):
        print(f"NaN detected at epoch {epoch+1}, stopping training!")
        break

    scaler.scale(loss).backward()

    # Apply gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    scaler.step(optimizer)
    scaler.update()

    if (epoch + 1) % 100 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")



Epoch [100/500], Loss: 1.2450
Epoch [200/500], Loss: 1.2358
Epoch [300/500], Loss: 1.3916
Epoch [400/500], Loss: 1.4463
Epoch [500/500], Loss: 1.2045


In [33]:
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Step 5: Predict Ratings and Rank Movies
with torch.no_grad():
    # Convert DataFrame to a PyTorch tensor and move to GPU
    X_tensor = torch.tensor(X.astype(np.float32).values, dtype=torch.float32).to(device)

    # Get the predictions
    predicted_ratings = model(X_tensor).cpu().numpy()  # Move back to CPU before converting to NumPy

    # Add predictions to the DataFrame
    data["predicted_rating"] = predicted_ratings

In [34]:
# Step 6: Rank Movies by Predicted Rating
data = data.sort_values(by="predicted_rating", ascending=True)

In [35]:
data["predicted_rating"].isnull().sum()

9

In [36]:
for index, (actual, predicted) in enumerate(zip(y_test, data["predicted_rating"])):
    print(f"Index: {index}, Actual: {actual}, Predicted: {predicted}")

Index: 0, Actual: 3, Predicted: 2.9894003868103027
Index: 1, Actual: 5, Predicted: 2.9906578063964844
Index: 2, Actual: 4, Predicted: 2.993868827819824
Index: 3, Actual: 3, Predicted: 2.9951393604278564
Index: 4, Actual: 1, Predicted: 2.9981603622436523
Index: 5, Actual: 3, Predicted: 2.999405860900879
Index: 6, Actual: 5, Predicted: 3.0089237689971924
Index: 7, Actual: 3, Predicted: 3.021566152572632
Index: 8, Actual: 4, Predicted: 3.03897762298584
Index: 9, Actual: 3, Predicted: 3.04057240486145
Index: 10, Actual: 5, Predicted: 3.043269395828247
Index: 11, Actual: 2, Predicted: 3.054091215133667
Index: 12, Actual: 4, Predicted: 3.057206630706787
Index: 13, Actual: 5, Predicted: 3.0609474182128906
Index: 14, Actual: 4, Predicted: 3.0649783611297607
Index: 15, Actual: 3, Predicted: 3.065486192703247
Index: 16, Actual: 5, Predicted: 3.0660147666931152
Index: 17, Actual: 3, Predicted: 3.0661284923553467
Index: 18, Actual: 2, Predicted: 3.0708563327789307
Index: 19, Actual: 4, Predicted: 

In [37]:
predicted_scores = model(X_test_tensor.to(device)).cpu().detach().numpy()
ranked_indices = np.argsort(-predicted_scores)  # Sort in descending order


In [38]:
print(predicted_scores)

[[3.735426 ]
 [3.979815 ]
 [3.8486176]
 ...
 [3.5113845]
 [3.7779572]
 [4.072229 ]]


In [39]:
ranked_indices = np.argsort(-predicted_scores, axis=0)  # Sort along the correct axis
print(ranked_indices)


[[13858]
 [ 2894]
 [14834]
 ...
 [14864]
 [13035]
 [ 5592]]


In [40]:
print(predicted_scores)

[[3.735426 ]
 [3.979815 ]
 [3.8486176]
 ...
 [3.5113845]
 [3.7779572]
 [4.072229 ]]


In [41]:
import numpy as np

# Check for NaN values
print(np.any(np.isnan(y_test)), np.any(np.isnan(predicted_scores)))


False True


In [42]:
predicted_scores = np.nan_to_num(predicted_scores, nan=0)
from sklearn.metrics import r2_score

r2 = r2_score(y_test, predicted_scores)
print(f"R-squared (R²) Score: {r2}")




R-squared (R²) Score: -0.006673932075500488


In [None]:
# from sklearn.metrics import accuracy_score

# # Convert predicted continuous ratings to discrete categories (e.g., rounding)
# predicted_labels = (model(X_test_tensor.to(device)).cpu().detach().numpy() > 3.0).astype(int)
# true_labels = (y_test_tensor.cpu().numpy() > 3.0).astype(int)  # Assuming 2.0 as a threshold

# # Compute accuracy
# accuracy = accuracy_score(true_labels, predicted_labels)
# print(f"Accuracy: {accuracy:.2%}")

Accuracy: 55.45%
