In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from surprise import Dataset, Reader, SVD
from train_valid_test_loader import load_train_valid_test_datasets

# Load user and movie information
user_info = pd.read_csv("../data_movie_lens_100k/user_info.csv")
movie_info = pd.read_csv("../data_movie_lens_100k/movie_info.csv")

# Drop unnecessary columns
user_info = user_info.drop(columns=['orig_user_id'])
movie_info = movie_info.drop(columns=['orig_item_id'])

# Ensure proper column naming
user_info.columns = ['user_id', 'age', 'is_male']
movie_info.columns = ['item_id', 'title', 'release_year']

# Load datasets
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()

# Convert tuples to a `surprise` dataset
def tuple_to_surprise_dataset(tupl):
    ratings_dict = {
        "userID": tupl[0],
        "itemID": tupl[1],
        "rating": tupl[2],
    }
    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(1, 5))
    dataset = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)
    return dataset

# Train SVD model using the training set
trainset = tuple_to_surprise_dataset(train_tuple).build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Extract user and item factors
user_factors = {trainset.to_raw_uid(uid): algo.pu[uid] for uid in range(trainset.n_users)}
item_factors = {trainset.to_raw_iid(iid): algo.qi[iid] for iid in range(trainset.n_items)}

# Define a zero vector to handle missing factors
zero_vector = np.zeros(algo.n_factors)

# Movie Title Feature Extraction using TF-IDF
titles = movie_info['title'].fillna('')
tfidf = TfidfVectorizer(max_features=100)
title_vectors = tfidf.fit_transform(titles).toarray()
title_dict = {item: title_vectors[i] for i, item in enumerate(movie_info['item_id'])}

# Normalize or bucket movie years
movie_info['year_normalized'] = (movie_info['release_year'] - movie_info['release_year'].min()) / (movie_info['release_year'].max() - movie_info['release_year'].min())
year_dict = {item: movie_info.loc[movie_info['item_id'] == item, 'year_normalized'].values[0] for item in movie_info['item_id']}

# Function to create feature vectors
def create_features(user_id, item_id):
    user_emb = user_factors.get(user_id, zero_vector)
    item_emb = item_factors.get(item_id, zero_vector)

    # Add demographic features
    user_info_row = user_info[user_info['user_id'] == user_id]
    if not user_info_row.empty:
        is_male = user_info_row.iloc[0]['is_male']
        age = user_info_row.iloc[0]['age']
    else:
        is_male = 0
        age = 0

    # Add movie title and year features
    title_emb = title_dict.get(item_id, np.zeros(100))
    year_emb = year_dict.get(item_id, 0)

    return np.concatenate((user_emb, item_emb, [is_male, age], title_emb, [year_emb]))

# Prepare the final DataFrame for training
def prepare_features_and_labels(tupl, threshold=4.5):
    ratings_df = pd.DataFrame({
        'userID': tupl[0],
        'itemID': tupl[1],
        'rating': tupl[2]
    })
    ratings_df['features'] = ratings_df.apply(lambda row: create_features(row['userID'], row['itemID']), axis=1)
    ratings_df['label'] = (ratings_df['rating'] >= threshold).astype(int)
    X = np.stack(ratings_df['features'])
    y = ratings_df['label']
    return X, y

# Create features and labels
X_train, y_train = prepare_features_and_labels(train_tuple)
X_valid, y_valid = prepare_features_and_labels(valid_tuple)
X_test, y_test = prepare_features_and_labels(test_tuple)

# Combine training and validation for model selection
X_combined = np.vstack((X_train, X_valid))
y_combined = np.hstack((y_train, y_valid))

# Model selection using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
clf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
clf.fit(X_combined, y_combined)

# Best model evaluation
best_model = clf.best_estimator_
y_pred = best_model.predict(X_test)

print(f"Best Parameters: {clf.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


ValueError: Length mismatch: Expected axis has 4 elements, new values have 3 elements