### Team

Shiref Khaled Elhalawany -  221100944

Karim Ashraf Elsayed - 221100391

Bassant Kamal Mesilam - 221100244 

# Section 2: Neighbhorhood CF Filters
## 3.2.1. Part 1: User-Based CF

### Data Loading and Preparation

This section imports the required libraries, defines the input file paths, and loads the Electronics ratings dataset into a DataFrame.  

It then builds a nested dictionary `user_item_ratings` that maps each user to the items they rated and their corresponding ratings.  

Finally, it loads the co-rating user pairs from `3_1_13_co_rating_users.csv` into a DataFrame for later similarity calculations and prints a brief summary.


In [1]:
import pandas as pd
import math
import csv
import sys
import os

ratings_file = '../../dataset/Electronics.csv'
co_rating_users_file = '../../results/3_1_13_co_rating_users.csv'

In [2]:
print("Loading ratings data...")
df_ratings = pd.read_csv(ratings_file, header=None, names=["ItemID", "UserID", "Rating", "Timestamp"])
print(f"Loaded {len(df_ratings)} ratings.")

user_item_ratings = {}
for index, row in df_ratings.iterrows():
    user = row['UserID']
    item = row['ItemID']
    rating = float(row['Rating'])
    
    if user not in user_item_ratings:
        user_item_ratings[user] = {}
    user_item_ratings[user][item] = rating

print("Ratings dictionary created.")

Loading ratings data...
Loaded 20994353 ratings.
Ratings dictionary created.


In [3]:
print("Loading co-rating users...")
df_co_users = pd.read_csv(co_rating_users_file)
print(f"Loaded {len(df_co_users)} pairs to calculate similarity for.")
print(df_co_users.head())

Loading co-rating users...
Loaded 16305 pairs to calculate similarity for.
       TargetUser       OtherUser  CommonItems
0  A1ER5AYS3FQ9O3   AAP7PPBU72QFM            1
1  A1ER5AYS3FQ9O3   AIMPBO9K5SQ5X            1
2  A1ER5AYS3FQ9O3  A2QCVDCCZ3ABAC            1
3  A1ER5AYS3FQ9O3  A1C0Y8AFKTIRWY            1
4  A1ER5AYS3FQ9O3  A3M96C2MSACALP            1


## Case Study 1

### 3.2.1.1.1

This section defines a function `calculate_raw_cosine_similarity` to compute the cosine similarity between two users based on their common rated items.  

It then iterates over all user pairs in `df_co_users`, calculates their similarity and number of common items, and stores the results in a DataFrame.  

The full results are saved to:  
`3_2_1_1_1_user_similarities.csv`  

A filtered version, keeping only pairs with more than one common item, is saved to:  
`3_2_1_1_1_user_similarities_filtered.csv`

In [4]:
def calculate_raw_cosine_similarity(user1, user2, user_ratings):

    u1_items = user_ratings.get(user1, {})
    u2_items = user_ratings.get(user2, {})

    common_items = set(u1_items.keys()) & set(u2_items.keys())

    if not common_items:
        return 0.0, 0

    dot_product = 0.0
    sum_sq_1 = 0.0
    sum_sq_2 = 0.0

    for item in common_items:
        r1 = u1_items[item]
        r2 = u2_items[item]

        dot_product += r1 * r2
        sum_sq_1 += r1 ** 2
        sum_sq_2 += r2 ** 2

    norm1 = sum_sq_1** 0.5
    norm2 = sum_sq_2** 0.5

    if norm1 == 0 or norm2 == 0:
        return 0.0, len(common_items)

    similarity = dot_product / (norm1 * norm2)
    return similarity, len(common_items)

In [5]:
print("Calculating similarities...")
similarities = []

for index, row in df_co_users.iterrows():
    target_user = row['TargetUser']
    other_user = row['OtherUser']
    
    sim, num_common = calculate_raw_cosine_similarity(target_user, other_user, user_item_ratings)
    
    similarities.append({
        'TargetUser': target_user,
        'OtherUser': other_user,
        'Similarity': round(sim, 2),
        'CommonItems': num_common
    })

print("Similarity calculation complete.")

Calculating similarities...
Similarity calculation complete.


In [44]:
print(f"Saving results")
df_results = pd.DataFrame(similarities)
df_results.to_csv('../../results/3_2_1_1_1_user_similarities.csv', index=False)
print("Results saved successfully.")
print(df_results.head())

df_results_filtered = df_results[df_results['CommonItems'] > 1]
print(df_results_filtered.head())
df_results_filtered.to_csv('../../results/3_2_1_1_1_user_similarities_filtered.csv', index=False)
print("Results saved successfully.")

Saving results
Results saved successfully.
       TargetUser       OtherUser  Similarity  CommonItems
0  A1ER5AYS3FQ9O3   AAP7PPBU72QFM         1.0            1
1  A1ER5AYS3FQ9O3   AIMPBO9K5SQ5X         1.0            1
2  A1ER5AYS3FQ9O3  A2QCVDCCZ3ABAC         1.0            1
3  A1ER5AYS3FQ9O3  A1C0Y8AFKTIRWY         1.0            1
4  A1ER5AYS3FQ9O3  A3M96C2MSACALP         1.0            1
         TargetUser       OtherUser  Similarity  CommonItems
52   A1ER5AYS3FQ9O3  A2JCJJNY43QQIV        1.00            3
55   A1ER5AYS3FQ9O3   AM9APPMIE1BHZ        1.00            3
87   A1ER5AYS3FQ9O3  A37PV5GMP2ILJC        1.00            2
93   A1ER5AYS3FQ9O3  A259HHYBP6ZNJ3        0.99            2
124  A1ER5AYS3FQ9O3   A680RUE1FDO8B        0.99            2
Results saved successfully.


### 3.2.1.1.2

This section defines a function that groups similarity results by each target user, sorts them by cosine similarity in descending order, and selects the top 20% most similar users for each target user.  

The selected top pairs are combined into a single DataFrame and saved to:  
`3_2_1_1_2_top_similar_users.csv`

In [None]:
def get_top_n_similar_users(df_similarities, n_percentage=0.20, similarity_col='Similarity'):
    top_similar_users = []

    for target_user, group in df_similarities.groupby('TargetUser'):
        sorted_group = group.sort_values(by=similarity_col, ascending=False)
        
        n_top = math.ceil(len(sorted_group) * n_percentage)
        
        top_users = sorted_group.head(n_top)
        
        top_similar_users.append(top_users)

    if top_similar_users:
        return pd.concat(top_similar_users)
    else:
        return pd.DataFrame()

In [8]:
print("Identifying top 20% similar users...")

df_top_similar_users = get_top_n_similar_users(df_results_filtered, n_percentage=0.20, similarity_col='Similarity')

print(f"Identified {len(df_top_similar_users)} top similar users pairs.")

top_output_file = '../../results/3_2_1_1_2_top_similar_users.csv'
print(f"Saving top 20% similar users to {top_output_file}...")
df_top_similar_users.to_csv(top_output_file, index=False)
print("Top similar users saved successfully.")
print(df_top_similar_users.head())

Identifying top 20% similar users...
Identified 21 top similar users pairs.
Saving top 20% similar users to ../../results/3_2_1_1_2_top_similar_users.csv...
Top similar users saved successfully.
          TargetUser       OtherUser  Similarity  CommonItems
52    A1ER5AYS3FQ9O3  A2JCJJNY43QQIV         1.0            3
2688  A1ER5AYS3FQ9O3  A1KEK09ZA6J9P8         1.0            2
3589  A1ER5AYS3FQ9O3  A36AIK1DQPSRNT         1.0            2
3560  A1ER5AYS3FQ9O3  A2CVXUY1EYQGGA         1.0            3
3526  A1ER5AYS3FQ9O3   A5K5DIDKAML5C         1.0            2


### 3.2.1.1.3

This section defines a function that predicts ratings for items not yet rated by each target user, using a weighted average of ratings from their top similar users (based on the chosen similarity column).  

It generates a predictions DataFrame and saves the results to:  
`3_2_1_1_3_predictions.csv`

In [9]:
def predict_ratings(df_top_users, user_item_ratings, sim_col='Similarity'):
    predictions = []

    for target_user, group in df_top_users.groupby('TargetUser'):
        target_user_items = set(user_item_ratings.get(target_user, {}).keys())
        
        candidate_items = set()
        for _, row in group.iterrows():
            other_user = row['OtherUser']
            other_user_items = user_item_ratings.get(other_user, {}).keys()
            candidate_items.update(other_user_items)
        
        unknown_items = candidate_items - target_user_items
        
        for item in unknown_items:
            numerator = 0.0
            denominator = 0.0
            
            for _, row in group.iterrows():
                other_user = row['OtherUser']
                
                similarity = row[sim_col]
                
                rating = user_item_ratings.get(other_user, {}).get(item)
                
                if rating is not None:
                    numerator += similarity * rating
                    denominator += abs(similarity)
            
            if denominator > 0:
                predicted_rating = numerator / denominator
                predictions.append({
                    'TargetUser': target_user,
                    'Item': item,
                    'PredictedRating': round(predicted_rating, 2),
                    'SimilarityType': sim_col
                })
                
    return pd.DataFrame(predictions)

In [10]:
print("Predicting unknown ratings...")

df_predictions = predict_ratings(df_top_similar_users, user_item_ratings, sim_col='Similarity')

print(f"Generated {len(df_predictions)} predictions.")

predictions_file = '../../results/3_2_1_1_3_predictions.csv'
print(f"Saving predictions to {predictions_file}...")
df_predictions.to_csv(predictions_file, index=False)
print("Predictions saved successfully.")
print(df_predictions.head())

Predicting unknown ratings...
Generated 647 predictions.
Saving predictions to ../../results/3_2_1_1_3_predictions.csv...
Predictions saved successfully.
       TargetUser        Item  PredictedRating SimilarityType
0  A1ER5AYS3FQ9O3  B004Z74P0K              2.0     Similarity
1  A1ER5AYS3FQ9O3  B00BXJZ3KY              4.5     Similarity
2  A1ER5AYS3FQ9O3  B0072ZQGIG              4.0     Similarity
3  A1ER5AYS3FQ9O3  B01HCVG51M              4.0     Similarity
4  A1ER5AYS3FQ9O3  B000CDWNSW              5.0     Similarity


### 3.2.1.1.4

This part introduces a Discounted Similarity (DS) metric by adjusting cosine similarity using a discount factor based on the number of common items between users.  

For each target user, a threshold β is computed as 30% of the number of items they rated. 

DS is then calculated as:   **DS = Similarity × DiscountFactor**,  
and the results are saved to:  
`3_2_1_1_4_discounted_similarity.csv`

In [11]:
def calculate_discounted_similarity(df_similarities, user_item_ratings, beta_pct=0.3, sim_col='Similarity'):
    ds_list = []

    for target_user, group in df_similarities.groupby('TargetUser'):
        num_rated_by_target = len(user_item_ratings.get(target_user, {}))
        
        beta = math.ceil(num_rated_by_target * beta_pct)
        
        for _, row in group.iterrows():
            other_user = row['OtherUser']
            similarity = row[sim_col]
            common_items = row['CommonItems']
            
            if beta > 0:
                df = min(common_items / beta, 1.0)
            else:
                df = 1.0 
                
            ds = similarity * df
            
            ds_entry = row.to_dict()
            ds_entry['DiscountFactor'] = round(df, 2)
            ds_entry['DiscountedSimilarity'] = round(ds, 2)
            
            ds_list.append(ds_entry)

    return pd.DataFrame(ds_list)

In [45]:
print("Calculating Discounted Similarity...")

df_ds = calculate_discounted_similarity(df_results_filtered, user_item_ratings, beta_pct=0.3, sim_col='Similarity')

print("DS calculation complete.")
print(df_ds.head())

ds_output_file = '../../results/3_2_1_1_4_discounted_similarity.csv'
print(f"Saving Discounted Similarity to {ds_output_file}...")
df_ds.to_csv(ds_output_file, index=False)
print("Discounted Similarity saved successfully.")

Calculating Discounted Similarity...
DS calculation complete.
       TargetUser       OtherUser  Similarity  CommonItems  DiscountFactor  \
0  A1ER5AYS3FQ9O3  A2JCJJNY43QQIV        1.00            3            0.21   
1  A1ER5AYS3FQ9O3   AM9APPMIE1BHZ        1.00            3            0.21   
2  A1ER5AYS3FQ9O3  A37PV5GMP2ILJC        1.00            2            0.14   
3  A1ER5AYS3FQ9O3  A259HHYBP6ZNJ3        0.99            2            0.14   
4  A1ER5AYS3FQ9O3   A680RUE1FDO8B        0.99            2            0.14   

   DiscountedSimilarity  
0                  0.21  
1                  0.21  
2                  0.14  
3                  0.14  
4                  0.14  
Saving Discounted Similarity to ../../results/3_2_1_1_4_discounted_similarity.csv...
Discounted Similarity saved successfully.


### 3.2.1.1.5

This section selects the top 20% most similar users for each target user using the Discounted Similarity (DS) values.  

The filtered top-similar pairs are saved to:  
`3_2_1_1_5_top_similar_users_ds.csv`

In [13]:
print("Identifying top 20% similar users based on DS...")

df_top_ds_users = get_top_n_similar_users(df_ds, n_percentage=0.20, similarity_col='DiscountedSimilarity')

print(f"Identified {len(df_top_ds_users)} top similar users pairs (DS).")

ds_output_file = '../../results/3_2_1_1_5_top_similar_users_ds.csv'
print(f"Saving top 20% DS users to {ds_output_file}...")
df_top_ds_users.to_csv(ds_output_file, index=False)
print("Top DS users saved successfully.")
print(df_top_ds_users.head())

Identifying top 20% similar users based on DS...
Identified 21 top similar users pairs (DS).
Saving top 20% DS users to ../../results/3_2_1_1_5_top_similar_users_ds.csv...
Top DS users saved successfully.
        TargetUser       OtherUser  Similarity  CommonItems  DiscountFactor  \
58  A1ER5AYS3FQ9O3  A3R19YKNL641X3        0.98            4            0.29   
0   A1ER5AYS3FQ9O3  A2JCJJNY43QQIV        1.00            3            0.21   
14  A1ER5AYS3FQ9O3  A3OXHLG6DIBRW8        0.99            3            0.21   
1   A1ER5AYS3FQ9O3   AM9APPMIE1BHZ        1.00            3            0.21   
48  A1ER5AYS3FQ9O3  A240FRPD4MEXND        0.99            3            0.21   

    DiscountedSimilarity  
58                  0.28  
0                   0.21  
14                  0.21  
1                   0.21  
48                  0.21  


### 3.2.1.1.6

This section predicts unrated item scores using the top similar users identified via Discounted Similarity (DS).  

The generated prediction results are saved into:  
`3_2_1_1_6_predictions_ds.csv`

In [14]:
print("Predicting unknown ratings using Discounted Similarity...")

df_predictions_ds = predict_ratings(df_top_ds_users, user_item_ratings, sim_col='DiscountedSimilarity')

print(f"Generated {len(df_predictions_ds)} predictions (DS).")

predictions_ds_file = '../../results/3_2_1_1_6_predictions_ds.csv'
print(f"Saving DS-based predictions to {predictions_ds_file}...")
df_predictions_ds.to_csv(predictions_ds_file, index=False)
print("DS-based predictions saved successfully.")
print(df_predictions_ds.head())

Predicting unknown ratings using Discounted Similarity...
Generated 1846 predictions (DS).
Saving DS-based predictions to ../../results/3_2_1_1_6_predictions_ds.csv...
DS-based predictions saved successfully.
       TargetUser        Item  PredictedRating        SimilarityType
0  A1ER5AYS3FQ9O3  B00MCVPIJI              3.0  DiscountedSimilarity
1  A1ER5AYS3FQ9O3  B000ND75BG              5.0  DiscountedSimilarity
2  A1ER5AYS3FQ9O3  B005ZG0IME              2.0  DiscountedSimilarity
3  A1ER5AYS3FQ9O3  B01FFRD1NU              5.0  DiscountedSimilarity
4  A1ER5AYS3FQ9O3  B00213QXFA              4.0  DiscountedSimilarity


### Compare Results

### 3.2.1.1.7

This section compares the top similar users selected using standard cosine similarity and Discounted Similarity (DS) for each target user.  

It computes the overlap between both top-user lists, summarizes the overlap statistics in a DataFrame, and saves:

- Detailed overlapping (TargetUser, OtherUser) pairs with similarity information to:  
  `3_2_1_1_7_overlap_users.csv`  
  
- Overall comparison metrics per target user to:  
  `3_2_1_1_7_comparison_results.csv`

In [15]:
print("Comparing top users lists...")
comparison_results = []
overlap_pairs = []

all_target_users = set(df_top_similar_users['TargetUser']).union(set(df_top_ds_users['TargetUser']))

for target_user in all_target_users:
    top_users_std = set(df_top_similar_users[df_top_similar_users['TargetUser'] == target_user]['OtherUser'])
    top_users_ds = set(df_top_ds_users[df_top_ds_users['TargetUser'] == target_user]['OtherUser'])
    
    overlap = top_users_std.intersection(top_users_ds)
    overlap_count = len(overlap)

    for other_user in overlap:
        overlap_pairs.append({
            'TargetUser': target_user,
            'OtherUser': other_user
        })

    comparison_results.append({
        'TargetUser': target_user,
        'StandardCount': len(top_users_std),
        'DSCount': len(top_users_ds),
        'OverlapCount': overlap_count,
        'OverlapPercentage': round(overlap_count / len(top_users_std) * 100, 2) if len(top_users_std) > 0 else 0
    })

df_comparison = pd.DataFrame(comparison_results)
print(f"Average Overlap Percentage: {df_comparison['OverlapPercentage'].mean():.2f}%")
print(df_comparison.head())

df_overlap_keys = pd.DataFrame(overlap_pairs)

print(f"Total overlapping (TargetUser, OtherUser) pairs: {len(df_overlap_keys)}")
print(df_overlap_keys.head())

df_overlap_details = pd.merge(
    df_overlap_keys,
    df_ds,  
    on=['TargetUser', 'OtherUser'],
    how='left'
)

overlap_output_file = '../../results/3_2_1_1_7_overlap_users.csv'
df_overlap_details.to_csv(overlap_output_file, index=False)
print(f"Intersection users with details saved successfully to {overlap_output_file}.")
print(df_overlap_details.head())

df_comparison.to_csv('../../results/3_2_1_1_7_comparison_results.csv', index=False)
print("Comparison results saved successfully.") 

Comparing top users lists...
Average Overlap Percentage: 38.10%
       TargetUser  StandardCount  DSCount  OverlapCount  OverlapPercentage
0  A1ER5AYS3FQ9O3             21       21             8               38.1
Total overlapping (TargetUser, OtherUser) pairs: 8
       TargetUser       OtherUser
0  A1ER5AYS3FQ9O3  A2CVXUY1EYQGGA
1  A1ER5AYS3FQ9O3  A1KEK09ZA6J9P8
2  A1ER5AYS3FQ9O3   AJKWF4W7QD4NS
3  A1ER5AYS3FQ9O3   A5K5DIDKAML5C
4  A1ER5AYS3FQ9O3  A316XO4RWX21YN
Intersection users with details saved successfully to ../../results/3_2_1_1_7_overlap_users.csv.
       TargetUser       OtherUser  Similarity  CommonItems  DiscountFactor  \
0  A1ER5AYS3FQ9O3  A2CVXUY1EYQGGA         1.0            3            0.21   
1  A1ER5AYS3FQ9O3  A1KEK09ZA6J9P8         1.0            2            0.14   
2  A1ER5AYS3FQ9O3   AJKWF4W7QD4NS         1.0            2            0.14   
3  A1ER5AYS3FQ9O3   A5K5DIDKAML5C         1.0            2            0.14   
4  A1ER5AYS3FQ9O3  A316XO4RWX21YN         1.

### 3.2.1.1.8

This section merges the rating predictions generated using standard cosine similarity and Discounted Similarity (DS) for the same (TargetUser, Item) pairs. 
 
It computes the difference between the two prediction values and reports basic statistics.  

The comparison results are saved to:  
`3_2_1_1_8_pred_comparison.csv`

In [16]:
print("Comparing rating predictions...")

df_pred_comparison = pd.merge(
    df_predictions[['TargetUser', 'Item', 'PredictedRating']],
    df_predictions_ds[['TargetUser', 'Item', 'PredictedRating']],
    on=['TargetUser', 'Item'],
    suffixes=('_Std', '_DS'),
    how='inner'
)

df_pred_comparison['Difference'] = df_pred_comparison['PredictedRating_Std'] - df_pred_comparison['PredictedRating_DS']

print(f"Compared {len(df_pred_comparison)} common predictions.")
print(f"Average Difference: {df_pred_comparison['Difference'].abs().mean():.4f}")
print(df_pred_comparison.head())

df_pred_comparison.to_csv('../../results/3_2_1_1_8_pred_comparison.csv', index=False)
print("Predictions comparison saved successfully.") 

Comparing rating predictions...
Compared 361 common predictions.
Average Difference: 0.1080
       TargetUser        Item  PredictedRating_Std  PredictedRating_DS  \
0  A1ER5AYS3FQ9O3  B004Z74P0K                  2.0                 2.0   
1  A1ER5AYS3FQ9O3  B00BXJZ3KY                  4.5                 5.0   
2  A1ER5AYS3FQ9O3  B0072ZQGIG                  4.0                 4.0   
3  A1ER5AYS3FQ9O3  B000CDWNSW                  5.0                 5.0   
4  A1ER5AYS3FQ9O3  B001QFZMD8                  5.0                 5.0   

   Difference  
0         0.0  
1        -0.5  
2         0.0  
3         0.0  
4         0.0  
Predictions comparison saved successfully.


### 3.2.1.1.9

This section counts how many items each user has rated and stores the result in `df_user_counts`.  

It then filters all user pairs with cosine similarity equal to 1.0 from `df_results_filtered`, merges their total rated item counts for both target and other users, and saves the final table to:  
`3_2_1_1_9_perfect_similarity_pairs.csv`

In [17]:
user_rating_counts = {
    user: len(items) for user, items in user_item_ratings.items()
}

df_user_counts = pd.DataFrame(
    [{'UserID': u, 'NumRatedItems': c} for u, c in user_rating_counts.items()]
)

print("User rating counts (first 5):")
print(df_user_counts.head())

print("Finding pairs with perfect cosine similarity (1.0)...")

df_perfect = df_results_filtered[df_results_filtered['Similarity'] == 1.0].copy()

print(f"Found {len(df_perfect)} user pairs with Similarity = 1.0.")
print(df_perfect.head())

df_perfect = df_perfect.merge(
    df_user_counts.rename(columns={
        'UserID': 'TargetUser',
        'NumRatedItems': 'TargetUserTotalItems'
    }),
    on='TargetUser',
    how='left'
)

df_perfect = df_perfect.merge(
    df_user_counts.rename(columns={
        'UserID': 'OtherUser',
        'NumRatedItems': 'OtherUserTotalItems'
    }),
    on='OtherUser',
    how='left'
)

print("Perfect similarity pairs with total rated items:")
print(df_perfect.head())

perfect_output_file = '../../results/3_2_1_1_9_perfect_similarity_pairs.csv'
df_perfect.to_csv(perfect_output_file, index=False)
print(f"Perfect similarity pairs saved successfully to {perfect_output_file}.")

User rating counts (first 5):
           UserID  NumRatedItems
0  A1N070NS9CJQ2I              2
1  A3P0KRKOBQK1KN              1
2  A192HO2ICJ75VU              1
3  A2T278FKFL3BLT              1
4  A2ZUXVTW8RXBXW              1
Finding pairs with perfect cosine similarity (1.0)...
Found 53 user pairs with Similarity = 1.0.
         TargetUser       OtherUser  Similarity  CommonItems
52   A1ER5AYS3FQ9O3  A2JCJJNY43QQIV         1.0            3
55   A1ER5AYS3FQ9O3   AM9APPMIE1BHZ         1.0            3
87   A1ER5AYS3FQ9O3  A37PV5GMP2ILJC         1.0            2
154  A1ER5AYS3FQ9O3  A1S3FOP19D8W1X         1.0            2
212  A1ER5AYS3FQ9O3  A3CW0ZLUO5X2B1         1.0            2
Perfect similarity pairs with total rated items:
       TargetUser       OtherUser  Similarity  CommonItems  \
0  A1ER5AYS3FQ9O3  A2JCJJNY43QQIV         1.0            3   
1  A1ER5AYS3FQ9O3   AM9APPMIE1BHZ         1.0            3   
2  A1ER5AYS3FQ9O3  A37PV5GMP2ILJC         1.0            2   
3  A1ER5AYS3

## Case Study 2

### 3.2.1.2.1

This section defines helper functions to:
- Compute the mean rating for each user (`compute_user_means`)
- Compute rating deviations from the user mean (`compute_rating_deviations`)
- Calculate mean-centered cosine similarity between two users using these deviations (`calculate_mean_centered_cosine_similarity`)

It then uses the co-rated user pairs in `df_co_users` to compute mean-centered cosine similarities, stores the results in a DataFrame, and saves:
- All results to: `3_2_1_2_1_user_similarities_mean_centered.csv`
- A filtered version with `CommonItems > 1` to: `3_2_1_2_1_user_similarities_mean_centered_filtered.csv`

In [18]:
def compute_user_means(user_ratings):
    user_means = {}

    for user, items in user_ratings.items():
        if len(items) > 0:
            user_means[user] = sum(items.values()) / len(items)
        else:
            user_means[user] = 0.0

    return user_means

In [19]:
def compute_rating_deviations(user_ratings, user_means):
    deviations = {}

    for user, items in user_ratings.items():
        mean = user_means[user]
        deviations[user] = {
            item: (rating - mean) for item, rating in items.items()
        }

    return deviations

In [20]:
def calculate_mean_centered_cosine_similarity(user1, user2, deviations):
    u1_items = deviations.get(user1, {})
    u2_items = deviations.get(user2, {})

    # Common rated items
    common_items = set(u1_items.keys()) & set(u2_items.keys())

    if not common_items:
        return 0.0, 0

    dot = 0.0
    norm1 = 0.0
    norm2 = 0.0

    for item in common_items:
        d1 = u1_items[item]
        d2 = u2_items[item]

        dot += d1 * d2
        norm1 += d1 ** 2
        norm2 += d2 ** 2

    norm1 = norm1 ** 0.5
    norm2 = norm2 ** 0.5

    if norm1 == 0 or norm2 == 0:
        return 0.0, len(common_items)

    similarity = dot / (norm1 * norm2)

    return similarity, len(common_items)

In [21]:
print("Calculating mean-centered cosine similarities...")
mc_similarities = []

user_means = compute_user_means(user_item_ratings)
deviations = compute_rating_deviations(user_item_ratings, user_means)


for index, row in df_co_users.iterrows():
    target_user = row['TargetUser']
    other_user = row['OtherUser']
    
    sim, num_common = calculate_mean_centered_cosine_similarity(
        target_user, 
        other_user, 
        deviations
    )
    
    mc_similarities.append({
        'TargetUser': target_user,
        'OtherUser': other_user,
        'MeanCenteredSimilarity': round(sim, 2),
        'CommonItems': num_common
    })

print("Mean-centered similarity calculation complete.")

Calculating mean-centered cosine similarities...


Mean-centered similarity calculation complete.


In [46]:
print("Saving mean-centered similarity results...")
df_mc_results = pd.DataFrame(mc_similarities)
df_mc_results.to_csv('../../results/3_2_1_2_1_user_similarities_mean_centered.csv', index=False)
print("Mean-centered similarities saved successfully.")
print(df_mc_results.head())

df_mc_results_filtered = df_mc_results[df_mc_results['CommonItems'] > 1]
print(df_mc_results_filtered.head())

df_mc_results_filtered.to_csv('../../results/3_2_1_2_1_user_similarities_mean_centered_filtered.csv', index=False)
print("Filtered mean-centered similarities saved successfully.")

Saving mean-centered similarity results...
Mean-centered similarities saved successfully.
       TargetUser       OtherUser  MeanCenteredSimilarity  CommonItems
0  A1ER5AYS3FQ9O3   AAP7PPBU72QFM                     0.0            1
1  A1ER5AYS3FQ9O3   AIMPBO9K5SQ5X                    -1.0            1
2  A1ER5AYS3FQ9O3  A2QCVDCCZ3ABAC                     0.0            1
3  A1ER5AYS3FQ9O3  A1C0Y8AFKTIRWY                     0.0            1
4  A1ER5AYS3FQ9O3  A3M96C2MSACALP                     0.0            1
         TargetUser       OtherUser  MeanCenteredSimilarity  CommonItems
52   A1ER5AYS3FQ9O3  A2JCJJNY43QQIV                    1.00            3
55   A1ER5AYS3FQ9O3   AM9APPMIE1BHZ                    1.00            3
87   A1ER5AYS3FQ9O3  A37PV5GMP2ILJC                   -1.00            2
93   A1ER5AYS3FQ9O3  A259HHYBP6ZNJ3                    0.04            2
124  A1ER5AYS3FQ9O3   A680RUE1FDO8B                    0.38            2
Filtered mean-centered similarities saved succ

### 3.2.1.2.2

This section selects the top 20% most similar users per target user using mean-centered cosine similarity values.  

The selected user pairs are saved to:  
`3_2_1_2_2_top_similar_users_mean_centered.csv`

In [23]:
print("Identifying top 20% similar users based on mean-centered similarity...")

df_top_mc_users = get_top_n_similar_users(df_mc_results_filtered, n_percentage=0.20, similarity_col='MeanCenteredSimilarity')

print(f"Identified {len(df_top_mc_users)} top similar user pairs (mean-centered).")

top_mc_output_file = '../../results/3_2_1_2_2_top_similar_users_mean_centered.csv'
print(f"Saving top 20% mean-centered similar users to {top_mc_output_file}...")

df_top_mc_users.to_csv(top_mc_output_file, index=False)

print("Top 20% mean-centered users saved successfully.")
print(df_top_mc_users.head())


Identifying top 20% similar users based on mean-centered similarity...
Identified 21 top similar user pairs (mean-centered).
Saving top 20% mean-centered similar users to ../../results/3_2_1_2_2_top_similar_users_mean_centered.csv...
Top 20% mean-centered users saved successfully.
          TargetUser       OtherUser  MeanCenteredSimilarity  CommonItems
52    A1ER5AYS3FQ9O3  A2JCJJNY43QQIV                     1.0            3
6511  A1ER5AYS3FQ9O3  A3LY8LCT1QE8I1                     1.0            2
378   A1ER5AYS3FQ9O3  A19W47CXJJP1MI                     1.0            2
1063  A1ER5AYS3FQ9O3  A2DPSV4CTHV3YA                     1.0            2
691   A1ER5AYS3FQ9O3  A35X2JJI49OBZP                     1.0            2


### 3.2.1.2.3

This section predicts ratings for unrated items by applying the weighted mean-centered cosine similarity from the selected top similar users.
  
The resulting predictions are saved to:  
`3_2_1_2_3_predictions_mean_centered.csv`

In [24]:
print("Predicting unknown ratings using mean-centered similarity...")

df_predictions_mc = predict_ratings(df_top_mc_users, user_item_ratings, sim_col='MeanCenteredSimilarity')

print(f"Generated {len(df_predictions_mc)} predictions (mean-centered).")

predictions_mc_file = '../../results/3_2_1_2_3_predictions_mean_centered.csv'
print(f"Saving mean-centered predictions to {predictions_mc_file}...")

df_predictions_mc.to_csv(predictions_mc_file, index=False)

print("Mean-centered predictions saved successfully.")
print(df_predictions_mc.head())

Predicting unknown ratings using mean-centered similarity...
Generated 1803 predictions (mean-centered).
Saving mean-centered predictions to ../../results/3_2_1_2_3_predictions_mean_centered.csv...
Mean-centered predictions saved successfully.
       TargetUser        Item  PredictedRating          SimilarityType
0  A1ER5AYS3FQ9O3  B0019FHM9M              1.0  MeanCenteredSimilarity
1  A1ER5AYS3FQ9O3  B00Q543KL6              5.0  MeanCenteredSimilarity
2  A1ER5AYS3FQ9O3  B001UQ6F4S              5.0  MeanCenteredSimilarity
3  A1ER5AYS3FQ9O3  B00DQIST6A              4.0  MeanCenteredSimilarity
4  A1ER5AYS3FQ9O3  B004W7PHVO              4.0  MeanCenteredSimilarity


### 3.2.1.2.4

This section applies the Discounted Similarity (DS) method to mean-centered cosine similarity results by incorporating a discount factor based on common rated items. 
 
The results are saved to:  
`3_2_1_2_4_discounted_similarity_mean_centered.csv`

In [49]:
print("Calculating Discounted Mean-Centered Similarity...")

df_ds_mc = calculate_discounted_similarity(df_mc_results_filtered,user_item_ratings,beta_pct=0.3, sim_col='MeanCenteredSimilarity')

print("Mean-centered DS calculation complete.")
print(df_ds_mc.head())

ds_mc_output_file = '../../results/3_2_1_2_4_discounted_similarity_mean_centered.csv'
print(f"Saving Mean-Centered Discounted Similarity to {ds_mc_output_file}...")

df_ds_mc.to_csv(ds_mc_output_file, index=False)

print("Mean-Centered Discounted Similarity saved successfully.")

Calculating Discounted Mean-Centered Similarity...
Mean-centered DS calculation complete.
       TargetUser       OtherUser  MeanCenteredSimilarity  CommonItems  \
0  A1ER5AYS3FQ9O3  A2JCJJNY43QQIV                    1.00            3   
1  A1ER5AYS3FQ9O3   AM9APPMIE1BHZ                    1.00            3   
2  A1ER5AYS3FQ9O3  A37PV5GMP2ILJC                   -1.00            2   
3  A1ER5AYS3FQ9O3  A259HHYBP6ZNJ3                    0.04            2   
4  A1ER5AYS3FQ9O3   A680RUE1FDO8B                    0.38            2   

   DiscountFactor  DiscountedSimilarity  
0            0.21                  0.21  
1            0.21                  0.21  
2            0.14                 -0.14  
3            0.14                  0.01  
4            0.14                  0.05  
Saving Mean-Centered Discounted Similarity to ../../results/3_2_1_2_4_discounted_similarity_mean_centered.csv...
Mean-Centered Discounted Similarity saved successfully.


### 3.2.1.2.5

This section selects the top 20% most similar users per target user using Discounted Similarity values derived from mean-centered cosine similarity. 
 
The resulting user pairs are saved to:  
`3_2_1_2_5_top_similar_users_mean_centered_ds.csv`

In [26]:
print("Identifying top 20% similar users based on Mean-Centered DS...")

df_top_ds_mc_users = get_top_n_similar_users(df_ds_mc,n_percentage=0.20,similarity_col='DiscountedSimilarity')

print(f"Identified {len(df_top_ds_mc_users)} top similar user pairs (Mean-Centered DS).")

top_ds_mc_output_file = '../../results/3_2_1_2_5_top_similar_users_mean_centered_ds.csv'
print(f"Saving top 20% Mean-Centered DS users to {top_ds_mc_output_file}...")

df_top_ds_mc_users.to_csv(top_ds_mc_output_file, index=False)

print("Top Mean-Centered DS users saved successfully.")
print(df_top_ds_mc_users.head())

Identifying top 20% similar users based on Mean-Centered DS...
Identified 21 top similar user pairs (Mean-Centered DS).
Saving top 20% Mean-Centered DS users to ../../results/3_2_1_2_5_top_similar_users_mean_centered_ds.csv...
Top Mean-Centered DS users saved successfully.
        TargetUser       OtherUser  MeanCenteredSimilarity  CommonItems  \
0   A1ER5AYS3FQ9O3  A2JCJJNY43QQIV                    1.00            3   
1   A1ER5AYS3FQ9O3   AM9APPMIE1BHZ                    1.00            3   
71  A1ER5AYS3FQ9O3  A2CVXUY1EYQGGA                    0.98            3   
54  A1ER5AYS3FQ9O3   AWPN47SSWK1JV                    0.96            3   
15  A1ER5AYS3FQ9O3   ASY25YMTIC2A9                    0.74            3   

    DiscountFactor  DiscountedSimilarity  
0             0.21                  0.21  
1             0.21                  0.21  
71            0.21                  0.21  
54            0.21                  0.21  
15            0.21                  0.16  


### 3.2.1.2.6

This section predicts ratings for unrated items using the top similar users identified by Mean-Centered Discounted Similarity. 
 
The generated predictions are saved to:  
`3_2_1_2_6_predictions_mean_centered_ds.csv`

In [27]:
print("Predicting unknown ratings using Discounted Mean-Centered Similarity...")

df_predictions_ds_mc = predict_ratings(df_top_ds_mc_users,user_item_ratings,sim_col='DiscountedSimilarity')

print(f"Generated {len(df_predictions_ds_mc)} predictions (Mean-Centered DS).")

predictions_ds_mc_file = '../../results/3_2_1_2_6_predictions_mean_centered_ds.csv'
print(f"Saving Mean-Centered DS-based predictions to {predictions_ds_mc_file}...")

df_predictions_ds_mc.to_csv(predictions_ds_mc_file, index=False)

print("Mean-Centered DS predictions saved successfully.")
print(df_predictions_ds_mc.head())

Predicting unknown ratings using Discounted Mean-Centered Similarity...
Generated 1766 predictions (Mean-Centered DS).
Saving Mean-Centered DS-based predictions to ../../results/3_2_1_2_6_predictions_mean_centered_ds.csv...
Mean-Centered DS predictions saved successfully.
       TargetUser        Item  PredictedRating        SimilarityType
0  A1ER5AYS3FQ9O3  B00MCVPIJI              3.0  DiscountedSimilarity
1  A1ER5AYS3FQ9O3  B005ZG0IME              2.0  DiscountedSimilarity
2  A1ER5AYS3FQ9O3  B01FFRD1NU              5.0  DiscountedSimilarity
3  A1ER5AYS3FQ9O3  B0019FHM9M              1.0  DiscountedSimilarity
4  A1ER5AYS3FQ9O3  B001UQ6F4S              5.0  DiscountedSimilarity


### 3.2.1.2.7

This section compares the top similar users selected using Mean-Centered Cosine Similarity versus Mean-Centered Discounted Similarity (DS).  

It calculates overlap statistics for each target user and saves:
- Detailed overlapping user pairs to:  
  `3_2_1_2_7_overlap_users_mean_centered.csv`  

- Summary comparison statistics per target user to:  
  `3_2_1_2_7_comparison_results_mean_centered.csv`

In [28]:
print("Comparing top users lists (Mean-Centered)...")

mc_comparison_results = []
mc_overlap_pairs = []

all_target_users_mc = set(df_top_mc_users['TargetUser']).union(
    set(df_top_ds_mc_users['TargetUser'])
)

for target_user in all_target_users_mc:
    top_users_mc = set(
        df_top_mc_users[df_top_mc_users['TargetUser'] == target_user]['OtherUser']
    )
    top_users_ds_mc = set(
        df_top_ds_mc_users[df_top_ds_mc_users['TargetUser'] == target_user]['OtherUser']
    )
    
    overlap = top_users_mc.intersection(top_users_ds_mc)
    overlap_count = len(overlap)

    for other_user in overlap:
        mc_overlap_pairs.append({
            'TargetUser': target_user,
            'OtherUser': other_user
        })

    mc_comparison_results.append({
        'TargetUser': target_user,
        'MCCount': len(top_users_mc),
        'MC_DSCount': len(top_users_ds_mc),
        'OverlapCount': overlap_count,
        'OverlapPercentage': round(overlap_count / len(top_users_mc) * 100, 2) 
                            if len(top_users_mc) > 0 else 0
    })

df_mc_comparison = pd.DataFrame(mc_comparison_results)
print(f"Average Overlap Percentage (Mean-Centered): {df_mc_comparison['OverlapPercentage'].mean():.2f}%")
print(df_mc_comparison.head())

df_mc_overlap_keys = pd.DataFrame(mc_overlap_pairs)

print(f"Total overlapping (TargetUser, OtherUser) pairs (Mean-Centered): {len(df_mc_overlap_keys)}")
print(df_mc_overlap_keys.head())

df_mc_overlap_details = pd.merge(
    df_mc_overlap_keys,
    df_ds_mc,   
    on=['TargetUser', 'OtherUser'],
    how='left'
)

overlap_mc_output_file = '../../results/3_2_1_2_7_overlap_users_mean_centered.csv'
df_mc_overlap_details.to_csv(overlap_mc_output_file, index=False)
print(f"Intersection users (Mean-Centered) with details saved successfully to {overlap_mc_output_file}.")
print(df_mc_overlap_details.head())

comparison_mc_output_file = '../../results/3_2_1_2_7_comparison_results_mean_centered.csv'
df_mc_comparison.to_csv(comparison_mc_output_file, index=False)
print("Mean-Centered comparison results saved successfully.")

Comparing top users lists (Mean-Centered)...
Average Overlap Percentage (Mean-Centered): 66.67%
       TargetUser  MCCount  MC_DSCount  OverlapCount  OverlapPercentage
0  A1ER5AYS3FQ9O3       21          21            14              66.67
Total overlapping (TargetUser, OtherUser) pairs (Mean-Centered): 14
       TargetUser       OtherUser
0  A1ER5AYS3FQ9O3  A18DQ9ZJOPUWCO
1  A1ER5AYS3FQ9O3  A11B61QBGHLQDN
2  A1ER5AYS3FQ9O3  A35X2JJI49OBZP
3  A1ER5AYS3FQ9O3  A2JCJJNY43QQIV
4  A1ER5AYS3FQ9O3  A3AUL23GMCOP2A
Intersection users (Mean-Centered) with details saved successfully to ../../results/3_2_1_2_7_overlap_users_mean_centered.csv.
       TargetUser       OtherUser  MeanCenteredSimilarity  CommonItems  \
0  A1ER5AYS3FQ9O3  A18DQ9ZJOPUWCO                     1.0            2   
1  A1ER5AYS3FQ9O3  A11B61QBGHLQDN                     1.0            2   
2  A1ER5AYS3FQ9O3  A35X2JJI49OBZP                     1.0            2   
3  A1ER5AYS3FQ9O3  A2JCJJNY43QQIV                     1.0        

### 3.2.1.2.8

This section compares the predicted ratings generated using Mean-Centered Cosine Similarity and Mean-Centered Discounted Similarity (DS) for identical (TargetUser, Item) pairs.  

It computes the difference between the two predictions and saves the comparison results to:  
`3_2_1_2_8_pred_comparison_mean_centered.csv`

In [29]:
print("Comparing rating predictions (Mean-Centered)...")

df_pred_comparison_mc = pd.merge(
    df_predictions_mc[['TargetUser', 'Item', 'PredictedRating']],
    df_predictions_ds_mc[['TargetUser', 'Item', 'PredictedRating']],
    on=['TargetUser', 'Item'],
    suffixes=('_MC', '_MC_DS'),
    how='inner'
)

df_pred_comparison_mc['Difference'] = \
    df_pred_comparison_mc['PredictedRating_MC'] - df_pred_comparison_mc['PredictedRating_MC_DS']

print(f"Compared {len(df_pred_comparison_mc)} common predictions (Mean-Centered).")
print(f"Average Difference: {df_pred_comparison_mc['Difference'].abs().mean():.4f}")
print(df_pred_comparison_mc.head())

pred_comp_mc_output = '../../results/3_2_1_2_8_pred_comparison_mean_centered.csv'
df_pred_comparison_mc.to_csv(pred_comp_mc_output, index=False)

print("Mean-Centered predictions comparison saved successfully.")

Comparing rating predictions (Mean-Centered)...
Compared 1242 common predictions (Mean-Centered).
Average Difference: 0.0532
       TargetUser        Item  PredictedRating_MC  PredictedRating_MC_DS  \
0  A1ER5AYS3FQ9O3  B0019FHM9M                 1.0                    1.0   
1  A1ER5AYS3FQ9O3  B001UQ6F4S                 5.0                    5.0   
2  A1ER5AYS3FQ9O3  B00DQIST6A                 4.0                    4.0   
3  A1ER5AYS3FQ9O3  B004W7PHVO                 4.0                    4.0   
4  A1ER5AYS3FQ9O3  B004XIOJ7A                 5.0                    5.0   

   Difference  
0         0.0  
1         0.0  
2         0.0  
3         0.0  
4         0.0  
Mean-Centered predictions comparison saved successfully.


### 3.2.1.2.9

This section finds user pairs where the raw cosine similarity is highly positive (≥ 1.0) while the mean-centered similarity equals -1.0. 
 
It merges raw and mean-centered similarity results, filters such “flipped” pairs, and saves them to:  
`3_2_1_2_9_raw_high_mc_minus1_pairs.csv`  

It then builds item-level details for these pairs, including ratings, user means, and deviations for each common item, and saves them to:  
`3_2_1_2_9_raw_high_mc_minus1_pairs_items.csv`

In [30]:
print("Finding user pairs where raw cosine is highly +ve but mean-centered similarity is -1.0...")

HIGH_SIM_THRESHOLD = 1  

df_merged_sims = pd.merge(
    df_results_filtered[['TargetUser', 'OtherUser', 'Similarity', 'CommonItems']],
    df_mc_results_filtered[['TargetUser', 'OtherUser', 'MeanCenteredSimilarity']],
    on=['TargetUser', 'OtherUser'],
    how='inner'
)

df_flip_pairs = df_merged_sims[
    (df_merged_sims['Similarity'] >= HIGH_SIM_THRESHOLD) &
    (df_merged_sims['MeanCenteredSimilarity'] == -1.0)
].copy()

print(f"Found {len(df_flip_pairs)} user pairs with raw cosine ≥ {HIGH_SIM_THRESHOLD} and mean-centered = -1.0.")
print(df_flip_pairs.head())

flip_output_file = '../../results/3_2_1_2_9_raw_high_mc_minus1_pairs.csv'
df_flip_pairs.to_csv(flip_output_file, index=False)
print(f"Flipped similarity pairs saved successfully to {flip_output_file}.")

Finding user pairs where raw cosine is highly +ve but mean-centered similarity is -1.0...
Found 15 user pairs with raw cosine ≥ 1 and mean-centered = -1.0.
        TargetUser       OtherUser  Similarity  CommonItems  \
2   A1ER5AYS3FQ9O3  A37PV5GMP2ILJC         1.0            2   
22  A1ER5AYS3FQ9O3  A10ZBR6O8S8OCY         1.0            2   
23  A1ER5AYS3FQ9O3  A20DDH4NT6Q1E8         1.0            2   
27  A1ER5AYS3FQ9O3  A1VLVWTLV3LVHR         1.0            2   
37  A1ER5AYS3FQ9O3  A2NICGGIGIFU22         1.0            2   

    MeanCenteredSimilarity  
2                     -1.0  
22                    -1.0  
23                    -1.0  
27                    -1.0  
37                    -1.0  
Flipped similarity pairs saved successfully to ../../results/3_2_1_2_9_raw_high_mc_minus1_pairs.csv.


In [31]:
print("Building item-level details for flipped pairs (means and deviations)...")

flip_details = []

for _, row in df_flip_pairs.iterrows():
    target_user = row['TargetUser']
    other_user = row['OtherUser']
    
    t_items = user_item_ratings.get(target_user, {})
    o_items = user_item_ratings.get(other_user, {})
    
    common_items = set(t_items.keys()) & set(o_items.keys())
    
    t_mean = user_means.get(target_user, 0.0)
    o_mean = user_means.get(other_user, 0.0)
    
    for item in common_items:
        t_rating = t_items[item]
        o_rating = o_items[item]
        
        t_dev = t_rating - t_mean
        o_dev = o_rating - o_mean
        
        flip_details.append({
            'TargetUser': target_user,
            'OtherUser': other_user,
            'ItemID': item,
            
            'RawCosineSimilarity': row['Similarity'],
            'MeanCenteredSimilarity': row['MeanCenteredSimilarity'],
            'CommonItemsCount': row['CommonItems'],
            
            'TargetRating': t_rating,
            'OtherRating': o_rating,
            
            'TargetUserMean': t_mean,
            'OtherUserMean': o_mean,
            
            'TargetDeviation': t_dev,
            'OtherDeviation': o_dev
        })

df_flip_details = pd.DataFrame(flip_details)

print(f"Built item-level details for {len(df_flip_details)} (user, item) rows.")
print(df_flip_details.head())

flip_details_output_file = '../../results/3_2_1_2_9_raw_high_mc_minus1_pairs_items.csv'
df_flip_details.to_csv(flip_details_output_file, index=False)
print(f"Item-level flipped similarity details saved successfully to {flip_details_output_file}.")

Building item-level details for flipped pairs (means and deviations)...
Built item-level details for 30 (user, item) rows.
       TargetUser       OtherUser      ItemID  RawCosineSimilarity  \
0  A1ER5AYS3FQ9O3  A37PV5GMP2ILJC  B000RHZJN4                  1.0   
1  A1ER5AYS3FQ9O3  A37PV5GMP2ILJC  B000V1MLBE                  1.0   
2  A1ER5AYS3FQ9O3  A10ZBR6O8S8OCY  B00A6HYP1W                  1.0   
3  A1ER5AYS3FQ9O3  A10ZBR6O8S8OCY  B004HKJTT2                  1.0   
4  A1ER5AYS3FQ9O3  A20DDH4NT6Q1E8  B00E5JUEF8                  1.0   

   MeanCenteredSimilarity  CommonItemsCount  TargetRating  OtherRating  \
0                    -1.0                 2           5.0          4.0   
1                    -1.0                 2           5.0          4.0   
2                    -1.0                 2           4.0          5.0   
3                    -1.0                 2           4.0          5.0   
4                    -1.0                 2           5.0          4.0   

   TargetUs

## Case Study 3

### 3.2.1.3.1

This section defines a function to compute Pearson Correlation Coefficient (PCC) similarity between two users based on their common rated items.  

It then calculates PCC similarities for all user pairs in `df_co_users`, saves all results to:  
`3_2_1_3_1_user_similarities_pearson.csv`  
and a filtered version with `CommonItems > 1` to:  
`3_2_1_3_1_user_similarities_pearson_filtered.csv`

In [32]:
def calculate_pearson_similarity(user1, user2, user_ratings):
    u1_items = user_ratings.get(user1, {})
    u2_items = user_ratings.get(user2, {})

    common_items = set(u1_items.keys()) & set(u2_items.keys())

    if not common_items:
        return 0.0, 0

    u1_ratings = []
    for i in common_items:
        u1_ratings.append(u1_items[i])

    u2_ratings = []
    for i in common_items:
        u2_ratings.append(u2_items[i])


    mean1 = sum(u1_ratings) / len(u1_ratings)
    mean2 = sum(u2_ratings) / len(u2_ratings)

    numerator = 0.0
    denom1 = 0.0
    denom2 = 0.0

    for item in common_items:
        d1 = u1_items[item] - mean1
        d2 = u2_items[item] - mean2

        numerator += d1 * d2
        denom1 += d1 ** 2
        denom2 += d2 ** 2

    if denom1 == 0 or denom2 == 0:
        return 0.0, len(common_items)

    similarity = numerator / ((denom1 ** 0.5) * (denom2 ** 0.5))

    return similarity, len(common_items)

In [33]:
print("Calculating Pearson (PCC) similarities...")
pcc_similarities = []

for index, row in df_co_users.iterrows():
    target_user = row['TargetUser']
    other_user = row['OtherUser']
    
    sim, num_common = calculate_pearson_similarity(
        target_user,
        other_user,
        user_item_ratings
    )
    
    pcc_similarities.append({
        'TargetUser': target_user,
        'OtherUser': other_user,
        'PearsonSimilarity': round(sim, 2), 
        'CommonItems': num_common
    })

print("Pearson similarity calculation complete.")

Calculating Pearson (PCC) similarities...
Pearson similarity calculation complete.


In [34]:
print("Saving Pearson similarity results...")
df_pcc_results = pd.DataFrame(pcc_similarities)
df_pcc_results.to_csv('../../results/3_2_1_3_1_user_similarities_pearson.csv', index=False)
print("Pearson similarities saved successfully.")
print(df_pcc_results.head())

df_pcc_results_filtered = df_pcc_results[df_pcc_results['CommonItems'] > 1]
print(df_pcc_results_filtered.head())

df_pcc_results_filtered.to_csv('../../results/3_2_1_3_1_user_similarities_pearson_filtered.csv', index=False)
print("Filtered Pearson similarities saved successfully.")
print(df_pcc_results_filtered.head())

Saving Pearson similarity results...
Pearson similarities saved successfully.
       TargetUser       OtherUser  PearsonSimilarity  CommonItems
0  A1ER5AYS3FQ9O3   AAP7PPBU72QFM                0.0            1
1  A1ER5AYS3FQ9O3   AIMPBO9K5SQ5X                0.0            1
2  A1ER5AYS3FQ9O3  A2QCVDCCZ3ABAC                0.0            1
3  A1ER5AYS3FQ9O3  A1C0Y8AFKTIRWY                0.0            1
4  A1ER5AYS3FQ9O3  A3M96C2MSACALP                0.0            1
         TargetUser       OtherUser  PearsonSimilarity  CommonItems
52   A1ER5AYS3FQ9O3  A2JCJJNY43QQIV                0.0            3
55   A1ER5AYS3FQ9O3   AM9APPMIE1BHZ                1.0            3
87   A1ER5AYS3FQ9O3  A37PV5GMP2ILJC                0.0            2
93   A1ER5AYS3FQ9O3  A259HHYBP6ZNJ3                0.0            2
124  A1ER5AYS3FQ9O3   A680RUE1FDO8B                0.0            2
Filtered Pearson similarities saved successfully.
         TargetUser       OtherUser  PearsonSimilarity  CommonItems


### 3.2.1.3.2

This section selects the top 20% most similar user pairs for each target user using Pearson similarity scores.  

The selected pairs are saved to:  
`3_2_1_3_2_top_similar_users_pearson.csv`

In [35]:
print("Identifying top 20% similar users based on Pearson similarity...")

df_top_pcc_users = get_top_n_similar_users(df_pcc_results_filtered,n_percentage=0.20,similarity_col='PearsonSimilarity')

print(f"Identified {len(df_top_pcc_users)} top similar user pairs (Pearson).")

top_pcc_output_file = '../../results/3_2_1_3_2_top_similar_users_pearson.csv'
print(f"Saving top 20% Pearson similar users to {top_pcc_output_file}...")

df_top_pcc_users.to_csv(top_pcc_output_file, index=False)

print("Top 20% Pearson users saved successfully.")
print(df_top_pcc_users.head())

Identifying top 20% similar users based on Pearson similarity...
Identified 21 top similar user pairs (Pearson).
Saving top 20% Pearson similar users to ../../results/3_2_1_3_2_top_similar_users_pearson.csv...
Top 20% Pearson users saved successfully.
          TargetUser       OtherUser  PearsonSimilarity  CommonItems
1956  A1ER5AYS3FQ9O3   AWPN47SSWK1JV                1.0            3
1216  A1ER5AYS3FQ9O3  A3SEBFKE82AFF0                1.0            2
4905  A1ER5AYS3FQ9O3  A19YOYY7FLQMA6                1.0            2
55    A1ER5AYS3FQ9O3   AM9APPMIE1BHZ                1.0            3
902   A1ER5AYS3FQ9O3  A2QDOJFFLFGF18                1.0            2


### 3.2.1.3.3

This section predicts ratings for unrated items using the weighted Pearson similarity values from the top similar users. 
 
The prediction results are saved to:  
`3_2_1_3_3_predictions_pearson.csv`

In [36]:
print("Predicting unknown ratings using Pearson similarity...")

df_predictions_pcc = predict_ratings(df_top_pcc_users,user_item_ratings,sim_col='PearsonSimilarity')

print(f"Generated {len(df_predictions_pcc)} predictions (Pearson).")

predictions_pcc_file = '../../results/3_2_1_3_3_predictions_pearson.csv'
print(f"Saving Pearson-based predictions to {predictions_pcc_file}...")

df_predictions_pcc.to_csv(predictions_pcc_file, index=False)

print("Pearson-based predictions saved successfully.")
print(df_predictions_pcc.head())


Predicting unknown ratings using Pearson similarity...
Generated 1125 predictions (Pearson).
Saving Pearson-based predictions to ../../results/3_2_1_3_3_predictions_pearson.csv...
Pearson-based predictions saved successfully.
       TargetUser        Item  PredictedRating     SimilarityType
0  A1ER5AYS3FQ9O3  B00MCVPIJI              3.0  PearsonSimilarity
1  A1ER5AYS3FQ9O3  B000ND75BG              5.0  PearsonSimilarity
2  A1ER5AYS3FQ9O3  B005ZG0IME              2.0  PearsonSimilarity
3  A1ER5AYS3FQ9O3  B01FFRD1NU              5.0  PearsonSimilarity
4  A1ER5AYS3FQ9O3  B00H888AMC              4.0  PearsonSimilarity


### 3.2.1.3.4

This section applies the Discounted Similarity (DS) adjustment to the Pearson similarity values, using a discount factor based on the number of common rated items.  

The resulting discounted Pearson similarities are saved to:  
`3_2_1_3_4_discounted_similarity_pearson.csv`

In [50]:
print("Calculating Discounted Pearson Similarity...")

df_ds_pcc = calculate_discounted_similarity(df_pcc_results_filtered,user_item_ratings,beta_pct=0.3,sim_col='PearsonSimilarity')

print("PCC-based DS calculation complete.")
print(df_ds_pcc.head())

ds_pcc_output_file = '../../results/3_2_1_3_4_discounted_similarity_pearson.csv'
print(f"Saving Discounted Pearson Similarity to {ds_pcc_output_file}...")

df_ds_pcc.to_csv(ds_pcc_output_file, index=False)

print("Discounted Pearson Similarity saved successfully.")

Calculating Discounted Pearson Similarity...
PCC-based DS calculation complete.
       TargetUser       OtherUser  PearsonSimilarity  CommonItems  \
0  A1ER5AYS3FQ9O3  A2JCJJNY43QQIV                0.0            3   
1  A1ER5AYS3FQ9O3   AM9APPMIE1BHZ                1.0            3   
2  A1ER5AYS3FQ9O3  A37PV5GMP2ILJC                0.0            2   
3  A1ER5AYS3FQ9O3  A259HHYBP6ZNJ3                0.0            2   
4  A1ER5AYS3FQ9O3   A680RUE1FDO8B                0.0            2   

   DiscountFactor  DiscountedSimilarity  
0            0.21                  0.00  
1            0.21                  0.21  
2            0.14                  0.00  
3            0.14                  0.00  
4            0.14                  0.00  
Saving Discounted Pearson Similarity to ../../results/3_2_1_3_4_discounted_similarity_pearson.csv...
Discounted Pearson Similarity saved successfully.


### 3.2.1.3.5

This section selects the top 20% most similar user pairs for each target user using Discounted Pearson Similarity values.  

The selected top user pairs are saved to:  
`3_2_1_3_5_top_similar_users_pearson_ds.csv`

In [38]:
print("Identifying top 20% similar users based on Discounted Pearson Similarity...")

df_top_ds_pcc_users = get_top_n_similar_users(df_ds_pcc,n_percentage=0.20,similarity_col='DiscountedSimilarity')

print(f"Identified {len(df_top_ds_pcc_users)} top similar user pairs (PCC DS).")

top_ds_pcc_output_file = '../../results/3_2_1_3_5_top_similar_users_pearson_ds.csv'
print(f"Saving top 20% PCC DS users to {top_ds_pcc_output_file}...")

df_top_ds_pcc_users.to_csv(top_ds_pcc_output_file, index=False)

print("Top PCC DS users saved successfully.")
print(df_top_ds_pcc_users.head())


Identifying top 20% similar users based on Discounted Pearson Similarity...
Identified 21 top similar user pairs (PCC DS).
Saving top 20% PCC DS users to ../../results/3_2_1_3_5_top_similar_users_pearson_ds.csv...
Top PCC DS users saved successfully.
        TargetUser       OtherUser  PearsonSimilarity  CommonItems  \
54  A1ER5AYS3FQ9O3   AWPN47SSWK1JV                1.0            3   
48  A1ER5AYS3FQ9O3  A240FRPD4MEXND                1.0            3   
71  A1ER5AYS3FQ9O3  A2CVXUY1EYQGGA                1.0            3   
1   A1ER5AYS3FQ9O3   AM9APPMIE1BHZ                1.0            3   
17  A1ER5AYS3FQ9O3  A3V6Z4RCDGRC44                1.0            3   

    DiscountFactor  DiscountedSimilarity  
54            0.21                  0.21  
48            0.21                  0.21  
71            0.21                  0.21  
1             0.21                  0.21  
17            0.21                  0.21  


### 3.2.1.3.6

This section generates rating predictions for unrated items using Discounted Pearson Similarity values from the top similar users.  

The resulting predictions are saved to:  
`3_2_1_3_6_predictions_pearson_ds.csv`

In [39]:
print("Predicting unknown ratings using Discounted Pearson Similarity...")

df_predictions_ds_pcc = predict_ratings(df_top_ds_pcc_users,user_item_ratings,sim_col='DiscountedSimilarity')

print(f"Generated {len(df_predictions_ds_pcc)} predictions (PCC DS).")

predictions_ds_pcc_file = '../../results/3_2_1_3_6_predictions_pearson_ds.csv'
print(f"Saving PCC DS-based predictions to {predictions_ds_pcc_file}...")

df_predictions_ds_pcc.to_csv(predictions_ds_pcc_file, index=False)

print("PCC DS-based predictions saved successfully.")
print(df_predictions_ds_pcc.head())

Predicting unknown ratings using Discounted Pearson Similarity...
Generated 1125 predictions (PCC DS).
Saving PCC DS-based predictions to ../../results/3_2_1_3_6_predictions_pearson_ds.csv...
PCC DS-based predictions saved successfully.
       TargetUser        Item  PredictedRating        SimilarityType
0  A1ER5AYS3FQ9O3  B00MCVPIJI              3.0  DiscountedSimilarity
1  A1ER5AYS3FQ9O3  B000ND75BG              5.0  DiscountedSimilarity
2  A1ER5AYS3FQ9O3  B005ZG0IME              2.0  DiscountedSimilarity
3  A1ER5AYS3FQ9O3  B01FFRD1NU              5.0  DiscountedSimilarity
4  A1ER5AYS3FQ9O3  B00H888AMC              4.0  DiscountedSimilarity


### 3.2.1.3.7

This section compares the top similar users selected using Pearson similarity versus Discounted Pearson similarity.  

It measures how many users appear in both lists (overlap), calculates overlap percentages for each target user, and saves:

- Detailed overlap entries to:  
  `3_2_1_3_7_overlap_users_pearson.csv`
- Summary comparison statistics to:  
  `3_2_1_3_7_comparison_results_pearson.csv`

In [40]:
print("Comparing top users lists (PCC)...")
pcc_comparison_results = []
pcc_overlap_pairs = []

all_target_users_pcc = set(df_top_pcc_users['TargetUser']).union(
    set(df_top_ds_pcc_users['TargetUser'])
)

for target_user in all_target_users_pcc:
    top_users_pcc = set(
        df_top_pcc_users[df_top_pcc_users['TargetUser'] == target_user]['OtherUser']
    )
    top_users_ds_pcc = set(
        df_top_ds_pcc_users[df_top_ds_pcc_users['TargetUser'] == target_user]['OtherUser']
    )
    
    overlap = top_users_pcc.intersection(top_users_ds_pcc)
    overlap_count = len(overlap)

    for other_user in overlap:
        pcc_overlap_pairs.append({
            'TargetUser': target_user,
            'OtherUser': other_user
        })

    pcc_comparison_results.append({
        'TargetUser': target_user,
        'PCCCount': len(top_users_pcc),
        'PCC_DSCount': len(top_users_ds_pcc),
        'OverlapCount': overlap_count,
        'OverlapPercentage': round(overlap_count / len(top_users_pcc) * 100, 2)
                            if len(top_users_pcc) > 0 else 0
    })

df_pcc_comparison = pd.DataFrame(pcc_comparison_results)
print(f"Average Overlap Percentage (PCC): {df_pcc_comparison['OverlapPercentage'].mean():.2f}%")
print(df_pcc_comparison.head())

df_pcc_overlap_keys = pd.DataFrame(pcc_overlap_pairs)

print(f"Total overlapping (TargetUser, OtherUser) pairs (PCC): {len(df_pcc_overlap_keys)}")
print(df_pcc_overlap_keys.head())

df_pcc_overlap_details = pd.merge(
    df_pcc_overlap_keys,
    df_ds_pcc, 
    on=['TargetUser', 'OtherUser'],
    how='left'
)

overlap_pcc_output_file = '../../results/3_2_1_3_7_overlap_users_pearson.csv'
df_pcc_overlap_details.to_csv(overlap_pcc_output_file, index=False)
print(f"Intersection users (PCC) with details saved successfully to {overlap_pcc_output_file}.")
print(df_pcc_overlap_details.head())

comparison_pcc_output_file = '../../results/3_2_1_3_7_comparison_results_pearson.csv'
df_pcc_comparison.to_csv(comparison_pcc_output_file, index=False)
print("PCC comparison results saved successfully.")

Comparing top users lists (PCC)...
Average Overlap Percentage (PCC): 71.43%
       TargetUser  PCCCount  PCC_DSCount  OverlapCount  OverlapPercentage
0  A1ER5AYS3FQ9O3        21           21            15              71.43
Total overlapping (TargetUser, OtherUser) pairs (PCC): 15
       TargetUser       OtherUser
0  A1ER5AYS3FQ9O3  A2CVXUY1EYQGGA
1  A1ER5AYS3FQ9O3   A1VQHH85U7PX0
2  A1ER5AYS3FQ9O3  A2QDOJFFLFGF18
3  A1ER5AYS3FQ9O3  A19YOYY7FLQMA6
4  A1ER5AYS3FQ9O3   AWPN47SSWK1JV
Intersection users (PCC) with details saved successfully to ../../results/3_2_1_3_7_overlap_users_pearson.csv.
       TargetUser       OtherUser  PearsonSimilarity  CommonItems  \
0  A1ER5AYS3FQ9O3  A2CVXUY1EYQGGA                1.0            3   
1  A1ER5AYS3FQ9O3   A1VQHH85U7PX0                1.0            2   
2  A1ER5AYS3FQ9O3  A2QDOJFFLFGF18                1.0            2   
3  A1ER5AYS3FQ9O3  A19YOYY7FLQMA6                1.0            2   
4  A1ER5AYS3FQ9O3   AWPN47SSWK1JV                1.0      

### 3.2.1.3.8

This section compares prediction values generated using Pearson similarity vs. Discounted Pearson similarity (DS) for the same (TargetUser, Item) pairs.
  
It calculates the difference for each pair and saves the comparison results to:  
`3_2_1_3_8_pred_comparison_pearson.csv`

In [41]:
print("Comparing rating predictions (PCC)...")

df_pred_comparison_pcc = pd.merge(
    df_predictions_pcc[['TargetUser', 'Item', 'PredictedRating']],
    df_predictions_ds_pcc[['TargetUser', 'Item', 'PredictedRating']],
    on=['TargetUser', 'Item'],
    suffixes=('_PCC', '_PCC_DS'),
    how='inner'
)

df_pred_comparison_pcc['Difference'] = \
    df_pred_comparison_pcc['PredictedRating_PCC'] - df_pred_comparison_pcc['PredictedRating_PCC_DS']

print(f"Compared {len(df_pred_comparison_pcc)} common predictions (PCC).")
print(f"Average Difference: {df_pred_comparison_pcc['Difference'].abs().mean():.4f}")
print(df_pred_comparison_pcc.head())

pred_comp_pcc_output = '../../results/3_2_1_3_8_pred_comparison_pearson.csv'
df_pred_comparison_pcc.to_csv(pred_comp_pcc_output, index=False)

print("PCC predictions comparison saved successfully.")


Comparing rating predictions (PCC)...
Compared 1125 common predictions (PCC).
Average Difference: 0.0016
       TargetUser        Item  PredictedRating_PCC  PredictedRating_PCC_DS  \
0  A1ER5AYS3FQ9O3  B00MCVPIJI                  3.0                     3.0   
1  A1ER5AYS3FQ9O3  B000ND75BG                  5.0                     5.0   
2  A1ER5AYS3FQ9O3  B005ZG0IME                  2.0                     2.0   
3  A1ER5AYS3FQ9O3  B01FFRD1NU                  5.0                     5.0   
4  A1ER5AYS3FQ9O3  B00H888AMC                  4.0                     4.0   

   Difference  
0         0.0  
1         0.0  
2         0.0  
3         0.0  
4         0.0  
PCC predictions comparison saved successfully.


### 3.2.1.3.9

This section finds user pairs whose cosine similarity is positive while their Pearson similarity is negative, saves these pair-level results to:  
`3_2_1_3_9_cosine_pos_pearson_neg_pairs.csv`  

It then builds item-level details for these pairs (including ratings, pair-wise means, and deviations) and saves them to:  
`3_2_1_3_9_cosine_pos_pearson_neg_pairs_items.csv`

In [42]:
print("Finding user pairs where cosine is +ve but Pearson correlation is negative...")

COSINE_POS_THRESHOLD = 0.0 

df_merged_pcc = pd.merge(
    df_results_filtered[['TargetUser', 'OtherUser', 'Similarity', 'CommonItems']],
    df_pcc_results_filtered[['TargetUser', 'OtherUser', 'PearsonSimilarity']],
    on=['TargetUser', 'OtherUser'],
    how='inner'
)

df_cos_pos_pcc_neg_pairs = df_merged_pcc[
    (df_merged_pcc['Similarity'] > COSINE_POS_THRESHOLD) &
    (df_merged_pcc['PearsonSimilarity'] < 0)
].copy()

print(f"Found {len(df_cos_pos_pcc_neg_pairs)} user pairs with cosine > {COSINE_POS_THRESHOLD} and Pearson < 0.")
print(df_cos_pos_pcc_neg_pairs.head())

pairs_output_file = '../../results/3_2_1_3_9_cosine_pos_pearson_neg_pairs.csv'
df_cos_pos_pcc_neg_pairs.to_csv(pairs_output_file, index=False)
print(f"Cosine-positive / Pearson-negative pairs saved successfully to {pairs_output_file}.")


Finding user pairs where cosine is +ve but Pearson correlation is negative...
Found 7 user pairs with cosine > 0.0 and Pearson < 0.
        TargetUser       OtherUser  Similarity  CommonItems  PearsonSimilarity
13  A1ER5AYS3FQ9O3  A2WB7LZ595CR50        0.94            2               -1.0
24  A1ER5AYS3FQ9O3  A32O5FZH994CNY        0.98            3               -1.0
31  A1ER5AYS3FQ9O3   AMRMK86X3PKXD        0.98            2               -1.0
67  A1ER5AYS3FQ9O3  A2RUN9WBD5H23R        0.98            2               -1.0
73  A1ER5AYS3FQ9O3   AW8ESDU0C82O0        0.88            2               -1.0
Cosine-positive / Pearson-negative pairs saved successfully to ../../results/3_2_1_3_9_cosine_pos_pearson_neg_pairs.csv.


In [43]:
print("Building item-level details for cosine-positive / Pearson-negative pairs...")

pcc_flip_details = []

for _, row in df_cos_pos_pcc_neg_pairs.iterrows():
    target_user = row['TargetUser']
    other_user = row['OtherUser']
    
    t_items = user_item_ratings.get(target_user, {})
    o_items = user_item_ratings.get(other_user, {})
    
    common_items = set(t_items.keys()) & set(o_items.keys())
    if not common_items:
        continue
    
    t_common_ratings = []
    o_common_ratings = []
    for item in common_items:
        t_common_ratings.append(t_items[item])
        o_common_ratings.append(o_items[item])
    
    t_mean_pair = sum(t_common_ratings) / len(t_common_ratings)
    o_mean_pair = sum(o_common_ratings) / len(o_common_ratings)
    
    for item in common_items:
        t_rating = t_items[item]
        o_rating = o_items[item]
        
        t_dev_pair = t_rating - t_mean_pair
        o_dev_pair = o_rating - o_mean_pair
        
        pcc_flip_details.append({
            'TargetUser': target_user,
            'OtherUser': other_user,
            'ItemID': item,
            
            'CosineSimilarity': row['Similarity'],
            'PearsonSimilarity': row['PearsonSimilarity'],
            'CommonItemsCount': row['CommonItems'],
            
            'TargetRating': t_rating,
            'OtherRating': o_rating,
            
            'TargetMean_CommonItems': t_mean_pair,
            'OtherMean_CommonItems': o_mean_pair,
            
            'TargetDeviation_CommonItems': t_dev_pair,
            'OtherDeviation_CommonItems': o_dev_pair,
        })

df_pcc_flip_details = pd.DataFrame(pcc_flip_details)

print(f"Built item-level details for {len(df_pcc_flip_details)} (user, item) rows.")
print(df_pcc_flip_details.head())

pcc_flip_items_output_file = '../../results/3_2_1_3_9_cosine_pos_pearson_neg_pairs_items.csv'
df_pcc_flip_details.to_csv(pcc_flip_items_output_file, index=False)
print(f"Item-level cosine-positive / Pearson-negative details saved successfully to {pcc_flip_items_output_file}.")

Building item-level details for cosine-positive / Pearson-negative pairs...
Built item-level details for 16 (user, item) rows.
       TargetUser       OtherUser      ItemID  CosineSimilarity  \
0  A1ER5AYS3FQ9O3  A2WB7LZ595CR50  B004HKJTT2              0.94   
1  A1ER5AYS3FQ9O3  A2WB7LZ595CR50  B00L3KW09K              0.94   
2  A1ER5AYS3FQ9O3  A32O5FZH994CNY  B001AAN4PW              0.98   
3  A1ER5AYS3FQ9O3  A32O5FZH994CNY  B001AAOZHI              0.98   
4  A1ER5AYS3FQ9O3  A32O5FZH994CNY  B004L62KIO              0.98   

   PearsonSimilarity  CommonItemsCount  TargetRating  OtherRating  \
0               -1.0                 2           4.0          5.0   
1               -1.0                 2           5.0          3.0   
2               -1.0                 3           4.0          5.0   
3               -1.0                 3           4.0          5.0   
4               -1.0                 3           5.0          4.0   

   TargetMean_CommonItems  OtherMean_CommonItems  Targe