In [12]:
import pandas as pd
import numpy as np

usb_df_short=pd.read_csv('usb_df_short.csv')

In [13]:
usb_df_short.head()

#  take 1% of the data
usb_df_short=usb_df_short.sample(frac=0.11, random_state=99)

In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler


df = pd.DataFrame(usb_df_short)
# Create user-item matrix
user_item_matrix = df.pivot_table(index='reviewerID', columns='asin', values='overall').fillna(0)

# Normalize ratings
scaler = MinMaxScaler()
user_item_matrix_scaled = pd.DataFrame(scaler.fit_transform(user_item_matrix), 
                                       index=user_item_matrix.index, 
                                       columns=user_item_matrix.columns)

# Compute cosine similarity
cosine_sim = cosine_similarity(user_item_matrix_scaled)
cosine_sim_df = pd.DataFrame(cosine_sim, index=user_item_matrix_scaled.index, columns=user_item_matrix_scaled.index)

def predict_rating(user_id, item_id, top_n, sim_matrix, ratings_matrix):
    similar_users = sim_matrix[user_id].sort_values(ascending=False)[1:top_n+1].index
    user_sim_scores = sim_matrix[user_id].sort_values(ascending=False)[1:top_n+1]
    user_ratings = ratings_matrix.loc[similar_users, item_id]
    weighted_scores = user_ratings.multiply(user_sim_scores).sum() / user_sim_scores.sum()
    return weighted_scores

# K-Folds Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_list = {n: [] for n in [10, 20, 30, 40, 50]}

for train_index, test_index in kf.split(user_item_matrix_scaled):
    # Splitting the dataset
    train, test = user_item_matrix_scaled.iloc[train_index], user_item_matrix_scaled.iloc[test_index]
    
    for N in [10, 20, 30, 40, 50]:
        absolute_errors = []
        
        # Iterate through each user,item pair in the test set
        for index, row in test.iterrows():
            user_id = index
            for item_id in user_item_matrix_scaled.columns:
                # Predict only for items not rated by the user
                if pd.isna(user_item_matrix.loc[user_id, item_id]):
                    continue
                predicted = predict_rating(user_id, item_id, N, cosine_sim_df, user_item_matrix)
                actual = user_item_matrix.loc[user_id, item_id]
                absolute_errors.append(abs(actual - predicted))
                
        # Calculate MAE for this fold and N
        mae = np.mean(absolute_errors)
        mae_list[N].append(mae)

# Average the MAE across all folds for each N
average_mae = {N: np.mean(maes) for N, maes in mae_list.items()}
print(average_mae)


  weighted_scores = user_ratings.multiply(user_sim_scores).sum() / user_sim_scores.sum()


KeyboardInterrupt: 

In [None]:
# Step 1: Transpose the user-item matrix to get the item-user matrix
item_user_matrix = user_item_matrix.T

# Normalize ratings
item_user_matrix_scaled = pd.DataFrame(scaler.fit_transform(item_user_matrix), 
                                       index=item_user_matrix.index, 
                                       columns=item_user_matrix.columns)

# Step 2: Compute cosine similarity between items
item_cosine_sim = cosine_similarity(item_user_matrix_scaled)
item_cosine_sim_df = pd.DataFrame(item_cosine_sim, index=item_user_matrix_scaled.index, columns=item_user_matrix_scaled.index)

# Adjust the predict_rating function for item-item recommendations
def predict_rating_item(item_id, user_id, top_n, sim_matrix, ratings_matrix):
    similar_items = sim_matrix[item_id].sort_values(ascending=False)[1:top_n+1].index
    item_sim_scores = sim_matrix[item_id].sort_values(ascending=False)[1:top_n+1]
    user_ratings = ratings_matrix.loc[similar_items, user_id]
    weighted_scores = user_ratings.multiply(item_sim_scores).sum() / item_sim_scores.sum()
    return weighted_scores

# K-Folds Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_list = {n: [] for n in [10, 20, 30, 40, 50]}

for train_index, test_index in kf.split(item_user_matrix_scaled):
    # Splitting the dataset
    train, test = item_user_matrix_scaled.iloc[train_index], item_user_matrix_scaled.iloc[test_index]
    
    for N in [10, 20, 30, 40, 50]:
        absolute_errors = []
        
        # Iterate through each item,user pair in the test set
        for index, row in test.iterrows():
            item_id = index
            for user_id in item_user_matrix_scaled.columns:
                # Predict only for items not rated by the user
                if pd.isna(user_item_matrix.loc[user_id, item_id]):
                    continue
                predicted = predict_rating_item(item_id, user_id, N, item_cosine_sim_df, item_user_matrix)
                actual = user_item_matrix.loc[user_id, item_id]
                absolute_errors.append(abs(actual - predicted))
                
        # Calculate MAE for this fold and N
        mae = np.mean(absolute_errors)
        mae_list[N].append(mae)

# Average the MAE across all folds for each N
average_mae_item = {N: np.mean(maes) for N, maes in mae_list.items()}
print(average_mae_item)


  weighted_scores = user_ratings.multiply(item_sim_scores).sum() / item_sim_scores.sum()


{10: nan, 20: nan, 30: nan, 40: nan, 50: nan}


In [None]:
import matplotlib.pyplot as plt

# Plotting
plt.figure(figsize=(10, 5))
plt.plot(list(average_mae_user.keys()), list(average_mae.values()), label='User-User MAE', marker='o')
plt.plot(list(average_mae_item.keys()), list(average_mae_item.values()), label='Item-Item MAE', marker='x')
plt.xlabel('Number of Similar Users/Items (K)')
plt.ylabel('Mean Absolute Error (MAE)')
plt.title('MAE of Recommender Systems')
plt.legend()
plt.show()
