### Team

Shiref Khaled Elhalawany -  221100944

Karim Ashraf Elsayed - 221100391

Bassant Kamal Mesilam - 221100244 

# Section 2: Neighbhorhood CF Filters
## 3.2.2. Part 2: Item-Based CF

### Data Loading and Preparation

This section imports the required libraries, defines the file paths, and loads the Electronics ratings dataset into a DataFrame.  
It then builds two lookup structures:
- `item_user_ratings`: maps each item to the users who rated it and their rating values  
- `user_ratings_list`: stores a list of all ratings given by each user  

Finally, it loads the item co-rating pairs from `3_1_13_co_rating_items.csv` into `df_co_items` for later item-based similarity calculations.


In [2]:
import pandas as pd
import math
import csv
import sys
import os

ratings_file = '../../dataset/Electronics.csv'
co_rating_items_file = '../../results/3_1_13_co_rating_items.csv'

In [3]:
print("Loading ratings data...")
df_ratings = pd.read_csv(ratings_file, header=None, names=["ItemID", "UserID", "Rating", "Timestamp"])
print(f"Loaded {len(df_ratings)} ratings.")

item_user_ratings = {}

user_ratings_list = {}

for index, row in df_ratings.iterrows():
    user = row['UserID']
    item = row['ItemID']
    rating = float(row['Rating'])
    
    if item not in item_user_ratings:
        item_user_ratings[item] = {}
    item_user_ratings[item][user] = rating
    
    if user not in user_ratings_list:
        user_ratings_list[user] = []
    user_ratings_list[user].append(rating)

print("Ratings dictionaries created.")

Loading ratings data...
Loaded 20994353 ratings.
Ratings dictionaries created.


In [4]:
print("Loading co-rating items...")
df_co_items = pd.read_csv(co_rating_items_file)
print(f"Loaded {len(df_co_items)} pairs to calculate similarity for.")
print(df_co_items.head())

Loading co-rating items...
Loaded 73 pairs to calculate similarity for.
   TargetItem   OtherItem  CommonUsers
0  B000JE4594  B00000J060            1
1  B000JE4594  B00009MVK8            1
2  B000JE4594  B0001656FW            1
3  B000JE4594  B00029U0W2            1
4  B000JE4594  B0002H7F3G            1


## Case Study 1

### 3.2.2.1.1

This section first computes the average rating for each user using `compute_user_averages`.  

It then calculates Adjusted Cosine Similarity between item pairs in `df_co_items` by subtracting each user’s average rating (user mean-centering) before computing similarity.  

The results, including `TargetItem`, `OtherItem`, `Similarity`, and `CommonUsers`, are saved to:  
`3_2_2_1_1_item_similarities.csv`

In [5]:
def compute_user_averages(user_ratings_list):
    user_avgs = {}
    for user, ratings in user_ratings_list.items():
        if len(ratings) > 0:
            user_avgs[user] = sum(ratings) / len(ratings)
        else:
            user_avgs[user] = 0.0
    return user_avgs

In [6]:
def calculate_adjusted_cosine_similarity(item1, item2, item_user_ratings, user_avgs):
    u1_users = item_user_ratings.get(item1, {})
    u2_users = item_user_ratings.get(item2, {})
    
    common_users = set(u1_users.keys()) & set(u2_users.keys())
    
    if not common_users:
        return 0.0, 0
        
    numerator = 0.0
    sum_sq_1 = 0.0
    sum_sq_2 = 0.0
    
    for user in common_users:
        r1 = u1_users[user]
        r2 = u2_users[user]
        user_avg = user_avgs[user]
        
        r1_centered = r1 - user_avg
        r2_centered = r2 - user_avg
        
        numerator += r1_centered * r2_centered
        sum_sq_1 += r1_centered ** 2
        sum_sq_2 += r2_centered ** 2
        
    norm1 = sum_sq_1 ** 0.5
    norm2 = sum_sq_2 ** 0.5
    
    if norm1 == 0 or norm2 == 0:
        return 0.0, len(common_users)
        
    similarity = numerator / (norm1 * norm2)
    return similarity, len(common_users)

In [7]:
print("Calculating user averages...")
user_avgs = compute_user_averages(user_ratings_list)
print(f"Calculated averages for {len(user_avgs)} users.")

print("Calculating item similarities...")
similarities = []

for index, row in df_co_items.iterrows():
    target_item = row['TargetItem']
    other_item = row['OtherItem']
    
    sim, num_common = calculate_adjusted_cosine_similarity(target_item, other_item, item_user_ratings, user_avgs)
    
    similarities.append({
        'TargetItem': target_item,
        'OtherItem': other_item,
        'Similarity': round(sim, 2),
        'CommonUsers': num_common
    })

print("Similarity calculation complete.")

Calculating user averages...
Calculated averages for 9838676 users.
Calculating item similarities...
Similarity calculation complete.


In [8]:
print("Saving item-based adjusted cosine similarity results...")
df_results = pd.DataFrame(similarities)
output_file = '../../results/3_2_2_1_1_item_similarities.csv'
df_results.to_csv(output_file, index=False)
print("Results saved successfully.")
print(df_results.head())

Saving item-based adjusted cosine similarity results...
Results saved successfully.
   TargetItem   OtherItem  Similarity  CommonUsers
0  B000JE4594  B00000J060        -1.0            1
1  B000JE4594  B00009MVK8         1.0            1
2  B000JE4594  B0001656FW        -1.0            1
3  B000JE4594  B00029U0W2        -1.0            1
4  B000JE4594  B0002H7F3G        -1.0            1


### 3.2.2.1.2

This section groups similarity results by each target item, sorts them in descending order of Adjusted Cosine similarity, and selects the top 20% most similar items for each target item.  

The selected item pairs are saved to:  
`3_2_2_1_2_top_similar_items.csv`

In [9]:
def get_top_n_similar_items(df_similarities, n_percentage=0.20, similarity_col='Similarity'):
    top_similar_items = []

    for target_item, group in df_similarities.groupby('TargetItem'):
        sorted_group = group.sort_values(by=similarity_col, ascending=False)

        n_top = math.ceil(len(sorted_group) * n_percentage)

        top_items = sorted_group.head(n_top)

        top_similar_items.append(top_items)

    if top_similar_items:
        return pd.concat(top_similar_items)
    else:
        return pd.DataFrame()


In [10]:
print("Identifying top 20% most similar items...")

df_top_similar_items = get_top_n_similar_items(df_results,n_percentage=0.20,similarity_col='Similarity')

print(f"Identified {len(df_top_similar_items)} top similar item pairs.")

top_items_output = '../../results/3_2_2_1_2_top_similar_items.csv'
print(f"Saving top 20% similar items to {top_items_output}...")
df_top_similar_items.to_csv(top_items_output, index=False)
print("Top similar items saved successfully.")
print(df_top_similar_items.head())

Identifying top 20% most similar items...
Identified 15 top similar item pairs.
Saving top 20% similar items to ../../results/3_2_2_1_2_top_similar_items.csv...
Top similar items saved successfully.
    TargetItem   OtherItem  Similarity  CommonUsers
37  B000JE4594  B00EZJZFDE         1.0            1
31  B000JE4594  B009GUL1VM         1.0            1
47  B000JE4594  B013HNYVCE         1.0            1
21  B000JE4594  B0043T7FXE         1.0            1
20  B000JE4594  B003S4ZJW4         1.0            1


### 3.2.2.1.3

This section defines an item-based prediction function that estimates unknown user ratings for target items using ratings on similar items weighted by their similarity scores. 
 
It then uses the top similar items (`df_top_similar_items`) and adjusted cosine similarity to generate predictions and saves them to:  
`3_2_2_1_3_item_based_predictions.csv`

In [11]:
def predict_ratings_item_based(df_top_items, item_user_ratings, sim_col='Similarity'):
    predictions = []

    for target_item, group in df_top_items.groupby('TargetItem'):
        target_item_users = set(item_user_ratings.get(target_item, {}).keys())

        candidate_users = set()
        for _, row in group.iterrows():
            other_item = row['OtherItem']
            other_item_users = item_user_ratings.get(other_item, {}).keys()
            candidate_users.update(other_item_users)

        unknown_users = candidate_users - target_item_users

        for user in unknown_users:
            numerator = 0.0
            denominator = 0.0

            for _, row in group.iterrows():
                other_item = row['OtherItem']

                similarity = row[sim_col]

                rating = item_user_ratings.get(other_item, {}).get(user)

                if rating is not None:
                    numerator += similarity * rating
                    denominator += abs(similarity)

            if denominator > 0:
                predicted_rating = numerator / denominator
                predictions.append({
                    'UserID': user,
                    'Item': target_item,
                    'PredictedRating': round(predicted_rating, 2),
                    'SimilarityType': sim_col
                })

    return pd.DataFrame(predictions)

In [12]:
print("Predicting unknown ratings using item-based adjusted cosine similarity...")

df_predictions_items = predict_ratings_item_based(
    df_top_similar_items, 
    item_user_ratings,
    sim_col='Similarity'    
)

print(f"Generated {len(df_predictions_items)} item-based predictions.")

predictions_items_file = '../../results/3_2_2_1_3_item_based_predictions.csv'
print(f"Saving item-based predictions to {predictions_items_file}...")
df_predictions_items.to_csv(predictions_items_file, index=False)
print("Item-based predictions saved successfully.")
print(df_predictions_items.head())

Predicting unknown ratings using item-based adjusted cosine similarity...
Generated 27558 item-based predictions.
Saving item-based predictions to ../../results/3_2_2_1_3_item_based_predictions.csv...
Item-based predictions saved successfully.
           UserID        Item  PredictedRating SimilarityType
0  A1D8WAI5CU9GLM  B000JE4594              4.0     Similarity
1  A2Z2H3G61MKN14  B000JE4594              3.0     Similarity
2  A1TKDREDLKIS4N  B000JE4594              5.0     Similarity
3  A3MRUWTJ8S3NM1  B000JE4594              5.0     Similarity
4  A2D0R3W70XF80I  B000JE4594              5.0     Similarity


### 3.2.2.1.4

This section computes Discounted Similarity (DS) for item-based collaborative filtering by scaling the adjusted cosine similarity with a discount factor based on how many users co-rated the item pair relative to a threshold β (30% of users who rated the target item).  

The resulting discounted similarities are saved to:  
`3_2_2_1_4_discounted_similarity_items.csv`

In [13]:
def calculate_discounted_similarity_items(df_similarities, item_user_ratings, beta_pct=0.3, sim_col='Similarity'):
    ds_list = []

    for target_item, group in df_similarities.groupby('TargetItem'):
        num_users_for_target = len(item_user_ratings.get(target_item, {}))

        beta = math.ceil(num_users_for_target * beta_pct)

        for _, row in group.iterrows():
            other_item = row['OtherItem']
            similarity = row[sim_col]
            common_users = row['CommonUsers']

            if beta > 0:
                df = min(common_users / beta, 1.0)
            else:
                df = 1.0

            ds = similarity * df

            ds_entry = row.to_dict()
            ds_entry['DiscountFactor'] = round(df, 2)
            ds_entry['DiscountedSimilarity'] = round(ds, 2)

            ds_list.append(ds_entry)

    return pd.DataFrame(ds_list)

In [14]:
print("Calculating Discounted Similarity for item-based adjusted cosine...")

df_ds_items = calculate_discounted_similarity_items(
    df_results,    
    item_user_ratings,
    beta_pct=0.3,
    sim_col='Similarity'    
)

print("Item-based DS calculation complete.")
print(df_ds_items.head())

ds_items_output_file = '../../results/3_2_2_1_4_discounted_similarity_items.csv'
print(f"Saving item-based Discounted Similarity to {ds_items_output_file}...")
df_ds_items.to_csv(ds_items_output_file, index=False)
print("Item-based Discounted Similarity saved successfully.")

Calculating Discounted Similarity for item-based adjusted cosine...
Item-based DS calculation complete.
   TargetItem   OtherItem  Similarity  CommonUsers  DiscountFactor  \
0  B000JE4594  B00000J060        -1.0            1            0.11   
1  B000JE4594  B00009MVK8         1.0            1            0.11   
2  B000JE4594  B0001656FW        -1.0            1            0.11   
3  B000JE4594  B00029U0W2        -1.0            1            0.11   
4  B000JE4594  B0002H7F3G        -1.0            1            0.11   

   DiscountedSimilarity  
0                 -0.11  
1                  0.11  
2                 -0.11  
3                 -0.11  
4                 -0.11  
Saving item-based Discounted Similarity to ../../results/3_2_2_1_4_discounted_similarity_items.csv...
Item-based Discounted Similarity saved successfully.


### 3.2.2.1.5

This step selects the top 20% most similar items per target item using **Discounted Similarity** values computed earlier.  
The output is saved to:  
`3_2_2_1_5_top_similar_items_ds.csv`

In [15]:
print("Identifying top 20% similar items based on Discounted Similarity...")

df_top_ds_items = get_top_n_similar_items(
    df_ds_items,                  
    n_percentage=0.20,
    similarity_col='DiscountedSimilarity'
)

print(f"Identified {len(df_top_ds_items)} top similar item pairs (DS).")

ds_items_output_file = '../../results/3_2_2_1_5_top_similar_items_ds.csv'
print(f"Saving top 20% DS items to {ds_items_output_file}...")
df_top_ds_items.to_csv(ds_items_output_file, index=False)

print("Top DS items saved successfully.")
print(df_top_ds_items.head())

Identifying top 20% similar items based on Discounted Similarity...
Identified 15 top similar item pairs (DS).
Saving top 20% DS items to ../../results/3_2_2_1_5_top_similar_items_ds.csv...
Top DS items saved successfully.
    TargetItem   OtherItem  Similarity  CommonUsers  DiscountFactor  \
37  B000JE4594  B00EZJZFDE         1.0            1            0.11   
31  B000JE4594  B009GUL1VM         1.0            1            0.11   
47  B000JE4594  B013HNYVCE         1.0            1            0.11   
21  B000JE4594  B0043T7FXE         1.0            1            0.11   
20  B000JE4594  B003S4ZJW4         1.0            1            0.11   

    DiscountedSimilarity  
37                  0.11  
31                  0.11  
47                  0.11  
21                  0.11  
20                  0.11  


### 3.2.2.1.6

This step predicts unknown user ratings for target items using **Discounted Similarity** from item-based collaborative filtering.  
The prediction method considers only top 20% similar items per target item.  

The output is saved to:  
`3_2_2_1_6_item_based_predictions_ds.csv`

In [16]:
print("Predicting unknown ratings using Discounted Similarity (item-based)...")

df_predictions_ds_items = predict_ratings_item_based(
    df_top_ds_items,     
    item_user_ratings,
    sim_col='DiscountedSimilarity'
)

print(f"Generated {len(df_predictions_ds_items)} item-based predictions (DS).")

predictions_ds_items_file = '../../results/3_2_2_1_6_item_based_predictions_ds.csv'
print(f"Saving item-based DS-based predictions to {predictions_ds_items_file}...")
df_predictions_ds_items.to_csv(predictions_ds_items_file, index=False)

print("Item-based DS predictions saved successfully.")
print(df_predictions_ds_items.head())

Predicting unknown ratings using Discounted Similarity (item-based)...
Generated 27558 item-based predictions (DS).
Saving item-based DS-based predictions to ../../results/3_2_2_1_6_item_based_predictions_ds.csv...
Item-based DS predictions saved successfully.
           UserID        Item  PredictedRating        SimilarityType
0  A1D8WAI5CU9GLM  B000JE4594              4.0  DiscountedSimilarity
1  A2Z2H3G61MKN14  B000JE4594              3.0  DiscountedSimilarity
2  A1TKDREDLKIS4N  B000JE4594              5.0  DiscountedSimilarity
3  A3MRUWTJ8S3NM1  B000JE4594              5.0  DiscountedSimilarity
4  A2D0R3W70XF80I  B000JE4594              5.0  DiscountedSimilarity


### Compare Results

### 3.2.2.1.7

This step compares the top similar item lists obtained using standard adjusted cosine similarity and Discounted Similarity (DS) for each target item. 
 
It measures the overlap between both lists, saves detailed overlapping item pairs with similarity information to:  
`3_2_2_1_7_overlap_items.csv`  
and saves the summary comparison statistics per target item to:  
`3_2_2_1_7_comparison_results_items.csv`

In [17]:
print("Comparing top items lists...")

comparison_results_items = []
overlap_item_pairs = []

all_target_items = set(df_top_similar_items['TargetItem']).union(
    set(df_top_ds_items['TargetItem'])
)

for target_item in all_target_items:
    top_items_std = set(
        df_top_similar_items[df_top_similar_items['TargetItem'] == target_item]['OtherItem']
    )
    top_items_ds = set(
        df_top_ds_items[df_top_ds_items['TargetItem'] == target_item]['OtherItem']
    )
    
    overlap = top_items_std.intersection(top_items_ds)
    overlap_count = len(overlap)

    for other_item in overlap:
        overlap_item_pairs.append({
            'TargetItem': target_item,
            'OtherItem': other_item
        })

    comparison_results_items.append({
        'TargetItem': target_item,
        'StandardCount': len(top_items_std),
        'DSCount': len(top_items_ds),
        'OverlapCount': overlap_count,
        'OverlapPercentage': round(overlap_count / len(top_items_std) * 100, 2)
                            if len(top_items_std) > 0 else 0
    })

df_items_comparison = pd.DataFrame(comparison_results_items)
print(f"Average Overlap Percentage (items): {df_items_comparison['OverlapPercentage'].mean():.2f}%")
print(df_items_comparison.head())

df_items_overlap_keys = pd.DataFrame(overlap_item_pairs)

print(f"Total overlapping (TargetItem, OtherItem) pairs: {len(df_items_overlap_keys)}")
print(df_items_overlap_keys.head())

df_items_overlap_details = pd.merge(
    df_items_overlap_keys,
    df_ds_items,  
    on=['TargetItem', 'OtherItem'],
    how='left'
)

overlap_items_output_file = '../../results/3_2_2_1_7_overlap_items.csv'
df_items_overlap_details.to_csv(overlap_items_output_file, index=False)
print(f"Intersection items with details saved successfully to {overlap_items_output_file}.")
print(df_items_overlap_details.head())

comparison_items_output_file = '../../results/3_2_2_1_7_comparison_results_items.csv'
df_items_comparison.to_csv(comparison_items_output_file, index=False)
print("Item-based comparison results saved successfully.")

Comparing top items lists...
Average Overlap Percentage (items): 100.00%
   TargetItem  StandardCount  DSCount  OverlapCount  OverlapPercentage
0  B00L38GD2W              2        2             2              100.0
1  B000JE4594             13       13            13              100.0
Total overlapping (TargetItem, OtherItem) pairs: 15
   TargetItem   OtherItem
0  B00L38GD2W  B013XUS8WK
1  B00L38GD2W  B009657UWQ
2  B000JE4594  B000FJD5IA
3  B000JE4594  B00009V2XJ
4  B000JE4594  B013HNYVCE
Intersection items with details saved successfully to ../../results/3_2_2_1_7_overlap_items.csv.
   TargetItem   OtherItem  Similarity  CommonUsers  DiscountFactor  \
0  B00L38GD2W  B013XUS8WK         1.0            1            0.25   
1  B00L38GD2W  B009657UWQ         1.0            1            0.25   
2  B000JE4594  B000FJD5IA         1.0            1            0.11   
3  B000JE4594  B00009V2XJ         1.0            1            0.11   
4  B000JE4594  B013HNYVCE         1.0            1         

### 3.2.2.1.8

This step compares item-based rating predictions generated using standard adjusted cosine similarity and Discounted Similarity (DS).  

It performs safety checks, merges common `(UserID, Item)` predictions, computes the difference between both methods, and saves the comparison results to:  
`3_2_2_1_8_item_pred_comparison.csv`

In [18]:
print("Comparing item-based rating predictions...")

required_cols = {'UserID', 'Item', 'PredictedRating'}

if df_predictions_items.empty or df_predictions_ds_items.empty:
    print("One of the prediction dataframes is EMPTY. Skipping comparison.")
    print(f"df_predictions_items empty? {df_predictions_items.empty}")
    print(f"df_predictions_ds_items empty? {df_predictions_ds_items.empty}")
else:

    missing_std = required_cols - set(df_predictions_items.columns)
    missing_ds  = required_cols - set(df_predictions_ds_items.columns)

    if missing_std:
        print(f"Missing columns in df_predictions_items: {missing_std}")
    elif missing_ds:
        print(f"Missing columns in df_predictions_ds_items: {missing_ds}")
    else:

        df_item_pred_comparison = pd.merge(
            df_predictions_items[['UserID', 'Item', 'PredictedRating']],
            df_predictions_ds_items[['UserID', 'Item', 'PredictedRating']],
            on=['UserID', 'Item'],
            suffixes=('_Std', '_DS'),
            how='inner'
        )

        if df_item_pred_comparison.empty:
            print("No overlapping (UserID, Item) predictions found.")
        else:
            df_item_pred_comparison['Difference'] = (
                df_item_pred_comparison['PredictedRating_Std']
                - df_item_pred_comparison['PredictedRating_DS']
            )

            print(f"Compared {len(df_item_pred_comparison)} common item-based predictions.")
            print(f"Average Difference: {df_item_pred_comparison['Difference'].abs().mean():.4f}")
            print(df_item_pred_comparison.head())

            pred_comp_items_file = '../../results/3_2_2_1_8_item_pred_comparison.csv'
            df_item_pred_comparison.to_csv(pred_comp_items_file, index=False)

            print("Item-based predictions comparison saved successfully.")

Comparing item-based rating predictions...
Compared 27558 common item-based predictions.
Average Difference: 0.0000
           UserID        Item  PredictedRating_Std  PredictedRating_DS  \
0  A1D8WAI5CU9GLM  B000JE4594                  4.0                 4.0   
1  A2Z2H3G61MKN14  B000JE4594                  3.0                 3.0   
2  A1TKDREDLKIS4N  B000JE4594                  5.0                 5.0   
3  A3MRUWTJ8S3NM1  B000JE4594                  5.0                 5.0   
4  A2D0R3W70XF80I  B000JE4594                  5.0                 5.0   

   Difference  
0         0.0  
1         0.0  
2         0.0  
3         0.0  
4         0.0  
Item-based predictions comparison saved successfully.


## Case Study 2

### 3.2.2.2.1
This section defines a function to compute Pearson Correlation Coefficient (PCC) between item pairs based on common user ratings, then applies it to all co-rated item pairs in `df_co_items`.  

The resulting item-based Pearson similarities are saved to:  
`3_2_2_2_1_item_similarities_pearson.csv`

In [19]:
def calculate_pearson_similarity_items(item1, item2, item_user_ratings):
    i1_users = item_user_ratings.get(item1, {})
    i2_users = item_user_ratings.get(item2, {})

    common_users = set(i1_users.keys()) & set(i2_users.keys())

    if not common_users:
        return 0.0, 0

    i1_ratings = [i1_users[u] for u in common_users]
    i2_ratings = [i2_users[u] for u in common_users]

    mean1 = sum(i1_ratings) / len(i1_ratings)
    mean2 = sum(i2_ratings) / len(i2_ratings)

    numerator = 0.0
    denom1 = 0.0
    denom2 = 0.0

    for user in common_users:
        d1 = i1_users[user] - mean1
        d2 = i2_users[user] - mean2

        numerator += d1 * d2
        denom1 += d1 ** 2
        denom2 += d2 ** 2

    if denom1 == 0 or denom2 == 0:
        return 0.0, len(common_users)

    similarity = numerator / ((denom1 ** 0.5) * (denom2 ** 0.5))

    return similarity, len(common_users)

In [20]:
print("Calculating item-based Pearson (PCC) similarities...")
pcc_item_similarities = []

for index, row in df_co_items.iterrows():
    target_item = row['TargetItem']
    other_item = row['OtherItem']

    sim, num_common = calculate_pearson_similarity_items(
        target_item,
        other_item,
        item_user_ratings
    )

    pcc_item_similarities.append({
        'TargetItem': target_item,
        'OtherItem': other_item,
        'PearsonSimilarity': round(sim, 2),
        'CommonUsers': num_common
    })

print("Item-based Pearson similarity calculation complete.")

Calculating item-based Pearson (PCC) similarities...
Item-based Pearson similarity calculation complete.


In [21]:
print("Saving item-based Pearson similarity results...")
df_pcc_item_results = pd.DataFrame(pcc_item_similarities)
output_pcc_items = '../../results/3_2_2_2_1_item_similarities_pearson.csv'
df_pcc_item_results.to_csv(output_pcc_items, index=False)
print("Item-based Pearson similarities saved successfully.")
print(df_pcc_item_results.head())

Saving item-based Pearson similarity results...
Item-based Pearson similarities saved successfully.
   TargetItem   OtherItem  PearsonSimilarity  CommonUsers
0  B000JE4594  B00000J060                0.0            1
1  B000JE4594  B00009MVK8                0.0            1
2  B000JE4594  B0001656FW                0.0            1
3  B000JE4594  B00029U0W2                0.0            1
4  B000JE4594  B0002H7F3G                0.0            1


### 3.2.2.2.2

This step selects the top 20% most similar items per target item using **Pearson Correlation** values. This helps identify items that users rate in a strongly related manner — even if their rating scales differ.

The output is saved to:  
`3_2_2_2_2_top_similar_items_pearson.csv`

In [22]:
print("Identifying top 20% similar items based on Pearson similarity...")

df_top_pcc_items = get_top_n_similar_items(
    df_pcc_item_results,    
    n_percentage=0.20,
    similarity_col='PearsonSimilarity'
)

print(f"Identified {len(df_top_pcc_items)} top similar item pairs (Pearson).")

top_pcc_items_output_file = '../../results/3_2_2_2_2_top_similar_items_pearson.csv'
print(f"Saving top 20% Pearson similar items to {top_pcc_items_output_file}...")

df_top_pcc_items.to_csv(top_pcc_items_output_file, index=False)

print("Top 20% Pearson items saved successfully.")
print(df_top_pcc_items.head())

Identifying top 20% similar items based on Pearson similarity...
Identified 15 top similar item pairs (Pearson).
Saving top 20% Pearson similar items to ../../results/3_2_2_2_2_top_similar_items_pearson.csv...
Top 20% Pearson items saved successfully.
    TargetItem   OtherItem  PearsonSimilarity  CommonUsers
0   B000JE4594  B00000J060                0.0            1
33  B000JE4594  B00B86IKXO                0.0            1
35  B000JE4594  B00BVUNZUU                0.0            1
36  B000JE4594  B00C30HUQ2                0.0            1
37  B000JE4594  B00EZJZFDE                0.0            1


### 3.2.2.2.3

Using the top 20% of most similar items (based on **Pearson correlation**), we predict how users would rate items they have not rated before.

Predicted results are saved to:  
`3_2_2_2_3_item_based_predictions_pearson.csv`


In [23]:
print("Predicting unknown ratings using item-based Pearson similarity...")

df_predictions_pcc_items = predict_ratings_item_based(
    df_top_pcc_items,      
    item_user_ratings,
    sim_col='PearsonSimilarity'
)

print(f"Generated {len(df_predictions_pcc_items)} item-based PCC predictions.")

predictions_pcc_items_file = '../../results/3_2_2_2_3_item_based_predictions_pearson.csv'
print(f"Saving item-based PCC predictions to {predictions_pcc_items_file}...")

df_predictions_pcc_items.to_csv(predictions_pcc_items_file, index=False)

print("Item-based PCC predictions saved successfully.")
print(df_predictions_pcc_items.head())

Predicting unknown ratings using item-based Pearson similarity...
Generated 0 item-based PCC predictions.
Saving item-based PCC predictions to ../../results/3_2_2_2_3_item_based_predictions_pearson.csv...
Item-based PCC predictions saved successfully.
Empty DataFrame
Columns: []
Index: []


### 3.2.2.2.4

In this step, we apply a **Discount Factor (DF)** to the Pearson similarity scores.  
The discount depends on how many users rated both items compared to the total number of users who rated the target item.

This reduces the impact of similarities computed over very few co-ratings  
— improving reliability.

The output is saved to:  
`3_2_2_2_4_discounted_similarity_pearson_items.csv`

In [29]:
print("Calculating item-based Discounted Pearson Similarity...")

df_ds_pcc_items = calculate_discounted_similarity_items(
    df_pcc_item_results,   
    item_user_ratings,
    beta_pct=0.3,
    sim_col='PearsonSimilarity'
)

print("Item-based PCC DS calculation complete.")
print(df_ds_pcc_items.head())

ds_pcc_items_output_file = '../../results/3_2_2_2_4_discounted_similarity_pearson_items.csv'
print(f"Saving item-based Discounted Pearson Similarity to {ds_pcc_items_output_file}...")

df_ds_pcc_items.to_csv(ds_pcc_items_output_file, index=False)

print("Item-based Discounted Pearson Similarity saved successfully.")

Calculating item-based Discounted Pearson Similarity...
Item-based PCC DS calculation complete.
   TargetItem   OtherItem  PearsonSimilarity  CommonUsers  DiscountFactor  \
0  B000JE4594  B00000J060                0.0            1            0.11   
1  B000JE4594  B00009MVK8                0.0            1            0.11   
2  B000JE4594  B0001656FW                0.0            1            0.11   
3  B000JE4594  B00029U0W2                0.0            1            0.11   
4  B000JE4594  B0002H7F3G                0.0            1            0.11   

   DiscountedSimilarity  
0                   0.0  
1                   0.0  
2                   0.0  
3                   0.0  
4                   0.0  
Saving item-based Discounted Pearson Similarity to ../../results/3_2_2_2_4_discounted_similarity_pearson_items.csv...
Item-based Discounted Pearson Similarity saved successfully.


### 3.2.2.2.5

This step selects the **top 20% most similar items** for each target item using **Discounted Pearson Similarity** scores.  

The results are saved to:  
`3_2_2_2_5_top_similar_items_pearson_ds.csv`

In [25]:
print("Identifying top 20% similar items based on Discounted Pearson Similarity...")

df_top_ds_pcc_items = get_top_n_similar_items(
    df_ds_pcc_items,           
    similarity_col='DiscountedSimilarity'
)

print(f"Identified {len(df_top_ds_pcc_items)} top similar item pairs (PCC DS).")

top_ds_pcc_items_output_file = '../../results/3_2_2_2_5_top_similar_items_pearson_ds.csv'
print(f"Saving top 20% PCC DS items to {top_ds_pcc_items_output_file}...")

df_top_ds_pcc_items.to_csv(top_ds_pcc_items_output_file, index=False)

print("Top PCC DS items saved successfully.")
print(df_top_ds_pcc_items.head())

Identifying top 20% similar items based on Discounted Pearson Similarity...
Identified 15 top similar item pairs (PCC DS).
Saving top 20% PCC DS items to ../../results/3_2_2_2_5_top_similar_items_pearson_ds.csv...
Top PCC DS items saved successfully.
    TargetItem   OtherItem  PearsonSimilarity  CommonUsers  DiscountFactor  \
0   B000JE4594  B00000J060                0.0            1            0.11   
33  B000JE4594  B00B86IKXO                0.0            1            0.11   
35  B000JE4594  B00BVUNZUU                0.0            1            0.11   
36  B000JE4594  B00C30HUQ2                0.0            1            0.11   
37  B000JE4594  B00EZJZFDE                0.0            1            0.11   

    DiscountedSimilarity  
0                    0.0  
33                   0.0  
35                   0.0  
36                   0.0  
37                   0.0  


### 3.2.2.2.6

In this step, we use **Discounted Pearson Similarity** to estimate how a user would rate items they have not rated yet.  

The algorithm looks at similar items (top 20%) that the user has already rated and applies a weighted prediction formula.

The results are saved to:  
`3_2_2_2_6_item_based_predictions_pearson_ds.csv`

In [26]:
print("Predicting unknown ratings using Discounted Pearson Similarity (item-based)...")

df_predictions_ds_pcc_items = predict_ratings_item_based(
    df_top_ds_pcc_items,     
    item_user_ratings,
    sim_col='DiscountedSimilarity'
)

print(f"Generated {len(df_predictions_ds_pcc_items)} item-based PCC-DS predictions.")

predictions_ds_pcc_items_file = '../../results/3_2_2_2_6_item_based_predictions_pearson_ds.csv'
print(f"Saving item-based PCC-DS predictions to {predictions_ds_pcc_items_file}...")

df_predictions_ds_pcc_items.to_csv(predictions_ds_pcc_items_file, index=False)

print("Item-based PCC-DS predictions saved successfully.")
print(df_predictions_ds_pcc_items.head())

Predicting unknown ratings using Discounted Pearson Similarity (item-based)...
Generated 0 item-based PCC-DS predictions.
Saving item-based PCC-DS predictions to ../../results/3_2_2_2_6_item_based_predictions_pearson_ds.csv...
Item-based PCC-DS predictions saved successfully.
Empty DataFrame
Columns: []
Index: []


### 3.2.2.2.7

This step compares the **top 20% similar items** selected using:
- **Pearson Similarity**
- **Discounted Pearson Similarity**

We calculate overlap between both item sets per target item and measure how much the ranking changed after discounting similarity.

The results are saved to:

1️-Overlap item pairs with full DS details:  
`3_2_2_2_7_overlap_items_pearson.csv`

2️- Summary comparison table (counts + percentages):  
`3_2_2_2_7_comparison_results_items_pearson.csv`

In [27]:
print("Comparing top items lists (PCC)...")

pcc_item_comparison_results = []
pcc_item_overlap_pairs = []

all_target_items_pcc = set(df_top_pcc_items['TargetItem']).union(
    set(df_top_ds_pcc_items['TargetItem'])
)

for target_item in all_target_items_pcc:
    top_items_pcc = set(
        df_top_pcc_items[df_top_pcc_items['TargetItem'] == target_item]['OtherItem']
    )
    top_items_ds_pcc = set(
        df_top_ds_pcc_items[df_top_ds_pcc_items['TargetItem'] == target_item]['OtherItem']
    )
    
    overlap = top_items_pcc.intersection(top_items_ds_pcc)
    overlap_count = len(overlap)

    for other_item in overlap:
        pcc_item_overlap_pairs.append({
            'TargetItem': target_item,
            'OtherItem': other_item
        })

    pcc_item_comparison_results.append({
        'TargetItem': target_item,
        'PCCCount': len(top_items_pcc),
        'PCC_DSCount': len(top_items_ds_pcc),
        'OverlapCount': overlap_count,
        'OverlapPercentage': round(overlap_count / len(top_items_pcc) * 100, 2)
                            if len(top_items_pcc) > 0 else 0
    })

df_pcc_items_comparison = pd.DataFrame(pcc_item_comparison_results)
print(f"Average Overlap Percentage (Item-based PCC): {df_pcc_items_comparison['OverlapPercentage'].mean():.2f}%")
print(df_pcc_items_comparison.head())

df_pcc_items_overlap_keys = pd.DataFrame(pcc_item_overlap_pairs)

print(f"Total overlapping (TargetItem, OtherItem) pairs (PCC): {len(df_pcc_items_overlap_keys)}")
print(df_pcc_items_overlap_keys.head())

df_pcc_items_overlap_details = pd.merge(
    df_pcc_items_overlap_keys,
    df_ds_pcc_items,   
    on=['TargetItem', 'OtherItem'],
    how='left'
)

overlap_pcc_items_output_file = '../../results/3_2_2_2_7_overlap_items_pearson.csv'
df_pcc_items_overlap_details.to_csv(overlap_pcc_items_output_file, index=False)
print(f"Intersection items (PCC) with details saved successfully to {overlap_pcc_items_output_file}.")
print(df_pcc_items_overlap_details.head())

comparison_pcc_items_output_file = '../../results/3_2_2_2_7_comparison_results_items_pearson.csv'
df_pcc_items_comparison.to_csv(comparison_pcc_items_output_file, index=False)
print("Item-based PCC comparison results saved successfully.")

Comparing top items lists (PCC)...
Average Overlap Percentage (Item-based PCC): 100.00%
   TargetItem  PCCCount  PCC_DSCount  OverlapCount  OverlapPercentage
0  B00L38GD2W         2            2             2              100.0
1  B000JE4594        13           13            13              100.0
Total overlapping (TargetItem, OtherItem) pairs (PCC): 15
   TargetItem   OtherItem
0  B00L38GD2W  B000CSSHG4
1  B00L38GD2W  B000R9BMVU
2  B000JE4594  B00XIHAHKA
3  B000JE4594  B00I8Y6V9E
4  B000JE4594  B00FSB799Q
Intersection items (PCC) with details saved successfully to ../../results/3_2_2_2_7_overlap_items_pearson.csv.
   TargetItem   OtherItem  PearsonSimilarity  CommonUsers  DiscountFactor  \
0  B00L38GD2W  B000CSSHG4                0.0            1            0.25   
1  B00L38GD2W  B000R9BMVU                0.0            1            0.25   
2  B000JE4594  B00XIHAHKA                0.0            1            0.11   
3  B000JE4594  B00I8Y6V9E                0.0            1            

### 3.2.2.2.8

This step compares the **predicted ratings** generated using:
- **Pearson Similarity** (standard)
- **Discounted Pearson Similarity (PCC-DS)**

Only predictions that exist in **both** matrices (same User–Item pairs) are compared.  
The comparison reveals change in prediction accuracy after applying the discount factor.

Output file contains:
- UserID, Item
- Predicted Rating (PCC)
- Predicted Rating (PCC-DS)
- Absolute Difference

Saved to:  
`3_2_2_2_8_item_pred_comparison_pearson.csv`

In [28]:
print("Comparing item-based rating predictions (PCC)...")

if df_predictions_pcc_items.empty or df_predictions_ds_pcc_items.empty:
    print("One of the prediction dataframes is EMPTY. Skipping comparison.")
    print(f"df_predictions_pcc_items empty? {df_predictions_pcc_items.empty}")
    print(f"df_predictions_ds_pcc_items empty? {df_predictions_ds_pcc_items.empty}")
else:
    required_cols = {'Item', 'UserID', 'PredictedRating'}
    
    missing_pcc = required_cols - set(df_predictions_pcc_items.columns)
    missing_ds  = required_cols - set(df_predictions_ds_pcc_items.columns)

    if missing_pcc:
        print(f"Missing columns in df_predictions_pcc_items: {missing_pcc}")
    elif missing_ds:
        print(f"Missing columns in df_predictions_ds_pcc_items: {missing_ds}")
    else:
        df_item_pred_comparison_pcc = pd.merge(
            df_predictions_pcc_items[['UserID', 'Item', 'PredictedRating']],
            df_predictions_ds_pcc_items[['UserID', 'Item', 'PredictedRating']],
            on=['UserID', 'Item'],
            suffixes=('_PCC', '_PCC_DS'),
            how='inner'
        )

        if df_item_pred_comparison_pcc.empty:
            print("No common (UserID, Item) predictions to compare.")
        else:
            df_item_pred_comparison_pcc['Difference'] = (
                df_item_pred_comparison_pcc['PredictedRating_PCC']
                - df_item_pred_comparison_pcc['PredictedRating_PCC_DS']
            )

            print(f"Compared {len(df_item_pred_comparison_pcc)} item-based predictions (PCC).")
            print(f"Average Difference: {df_item_pred_comparison_pcc['Difference'].abs().mean():.4f}")
            print(df_item_pred_comparison_pcc.head())

            output_file = '../../results/3_2_2_2_8_item_pred_comparison_pearson.csv'
            df_item_pred_comparison_pcc.to_csv(output_file, index=False)
            print(f"PCC item-based prediction comparison saved to {output_file}")

Comparing item-based rating predictions (PCC)...
One of the prediction dataframes is EMPTY. Skipping comparison.
df_predictions_pcc_items empty? True
df_predictions_ds_pcc_items empty? True
