In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


In [12]:
# Root folder of the assignment. The default is the original author's checkout;
# set the IRS_PROJECT_ROOT environment variable to run the notebook from another
# location without editing this cell.
PROJECT_ROOT = os.environ.get(
    'IRS_PROJECT_ROOT', 'C:/Users/Nour/Documents/VSCODE/IRS/irs_ass'
).rstrip('/\\')

# Ratings file that is loaded with pd.read_csv further down.
DATASET_PATH = PROJECT_ROOT + '/dataset/Dianping_SocialRec_2015/rating.txt'
# Folder that receives every CSV, PNG and TXT file the notebook writes.
RESULTS_DIR = PROJECT_ROOT + '/results'

In [13]:
# Column names are supplied by hand because read_csv is told to use `names=`
# rather than taking them from the file.
column_names = ['user', 'item', 'rating', 'date']

# Load the '|'-separated ratings file and preview the first rows.
df = pd.read_csv(
    DATASET_PATH,
    sep='|',
    names=column_names,
)
df.head()

Unnamed: 0,user,item,rating,date
0,59708,0,3,2012-06-16
1,3781,1,4,2009-12-24
2,120358,2,4,2010-08-06
3,55553,3,5,2013-07-28
4,20837,4,4,2012-03-10


In [14]:
print("Preprocessing...")
# Drop the timestamp column. errors='ignore' keeps this cell re-runnable: on a
# second execution 'date' is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['date'], errors='ignore')
# Keep only rows whose rating is non-zero.
df = df[df['rating'] != 0]

Preprocessing...


In [15]:
# Steps 3-6: per-user and per-item rating statistics, each exported as a CSV.
_ratings_by_user = df.groupby('user')['rating']
_ratings_by_item = df.groupby('item')['rating']

# 3. n_u: how many ratings each user gave
user_counts = _ratings_by_user.count().rename('n_u')
# 4. n_i: how many ratings each item received
item_counts = _ratings_by_item.count().rename('n_i')
# 5. r_u_bar: mean rating given by each user
user_means = _ratings_by_user.mean().rename('r_u_bar')
# 6. r_i_bar: mean rating received by each item
item_means = _ratings_by_item.mean().rename('r_i_bar')

# Write the four series out in the same order they are numbered above.
for _stat, _filename in (
    (user_counts, 'n_u.csv'),
    (item_counts, 'n_i.csv'),
    (user_means, 'r_u.csv'),
    (item_means, 'r_i.csv'),
):
    _stat.to_csv(os.path.join(RESULTS_DIR, _filename), header=True)

In [16]:
# Row counts per item, listed from the most-rated item down to the least-rated.
_rows_per_item = df.groupby('item').count()
_rows_per_item.sort_values(by='rating', ascending=False)

Unnamed: 0_level_0,user,rating
item,Unnamed: 1_level_1,Unnamed: 2_level_1
41,5960,5960
507,5390,5390
581,5009,5009
66,4610,4610
1022,4332,4332
...,...,...
6192,1,1
10896,1,1
10903,1,1
10906,1,1


In [17]:
# 7. Sort the per-item rating totals in ascending order and plot the resulting curve.
sorted_item_counts = item_counts.sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(10, 6))
# The x-axis is simply the item's rank (0 .. n_items-1) after sorting.
ax.plot(range(len(sorted_item_counts)), sorted_item_counts.values)
ax.set_xlabel('Items (sorted by popularity)')
ax.set_ylabel('Number of Ratings')
ax.set_title('Distribution of Ratings per Item (Ascending)')
ax.grid(True)
fig.savefig(os.path.join(RESULTS_DIR, 'long_tail_plot.png'))
# Close the figure so it is written to disk only and not rendered inline.
plt.close(fig)
print("Plot saved to results/long_tail_plot.png")

Plot saved to results/long_tail_plot.png


In [18]:
max_rating = 5
# Group boundaries, given as fractions of max_rating and scaled to rating units below.
bin_percentages = [0, 0.01, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 1.00]
bins = [max_rating * fraction for fraction in bin_percentages]
# One label per interval between consecutive edges: G1 .. G10.
labels = [f'G{k}' for k in range(1, len(bin_percentages))]

# Assign every item to a group by its mean rating.
# include_lowest=True makes the first interval include its left edge (0).
item_groups = pd.cut(
    item_means,
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Number of items per group, listed in group order G1 .. G10.
group_counts = item_groups.value_counts().sort_index()

print("\nNumber of products per group:", group_counts, sep="\n")
group_counts.to_csv(os.path.join(RESULTS_DIR, 'group_counts.csv'), header=True)


Number of products per group:
r_i_bar
G1        0
G2        0
G3        0
G4        9
G5        2
G6       26
G7       69
G8      486
G9     2646
G10    7885
Name: count, dtype: int64


In [19]:
# 9. Compute the total number of ratings in each group and order them ascendingly
# item_counts has the number of ratings per item, item_groups has the group assignment;
# both are indexed by item, so the DataFrame constructor aligns them row by row.
item_data = pd.DataFrame({'group': item_groups, 'n_i': item_counts})
# Sum n_i for each group.
# 'group' is categorical (it comes from pd.cut), so `observed` is passed explicitly:
# observed=False keeps groups that contain no items in the result with a total of 0,
# independent of whatever default the installed pandas version uses.
group_ratings = item_data.groupby('group', observed=False)['n_i'].sum()

# Order them ascendingly (by total ratings)
sorted_group_ratings = group_ratings.sort_values(ascending=True)

print("\nTotal ratings per group (sorted):")
print(sorted_group_ratings)
sorted_group_ratings.to_csv(os.path.join(RESULTS_DIR, 'group_ratings_sorted.csv'), header=True)

# Plot distribution
plt.figure(figsize=(10, 6))
sorted_group_ratings.plot(kind='bar')
plt.xlabel('Groups (sorted by total ratings)')
plt.ylabel('Total Number of Ratings')
plt.title('Distribution of Ratings per Group (Ascending)')
plt.grid(axis='y')
plt.savefig(os.path.join(RESULTS_DIR, 'ratings_per_group.png'))
# Close the figure so it is only saved to disk, not rendered inline.
plt.close()
print("Plot saved to results/ratings_per_group.png")



Total ratings per group (sorted):
group
G1           0
G2           0
G3           0
G5           6
G4          14
G6         161
G7        1008
G8       15847
G9      343083
G10    1789536
Name: n_i, dtype: int64
Plot saved to results/ratings_per_group.png


  group_ratings = item_data.groupby('group')['n_i'].sum()


In [None]:
# 9. Compute the total number of ratings in each group before sorting
# item_counts has the number of ratings per item, item_groups has the group assignment;
# both are indexed by item, so the DataFrame constructor aligns them row by row.
item_data = pd.DataFrame({'group': item_groups, 'n_i': item_counts})
# Sum n_i for each group.
# 'group' is categorical (it comes from pd.cut), so `observed` is passed explicitly:
# observed=False keeps groups that contain no items in the result with a total of 0,
# independent of whatever default the installed pandas version uses.
group_ratings = item_data.groupby('group', observed=False)['n_i'].sum()

print("\nTotal ratings per group (unsorted):")
print(group_ratings)
group_ratings.to_csv(os.path.join(RESULTS_DIR, 'group_ratings_unsorted.csv'), header=True)

# Plot the totals in natural group order (G1 .. G10), i.e. without sorting by value.
plt.figure(figsize=(10, 6))
group_ratings.plot(kind='bar')
plt.xlabel('Groups (unsorted)')
plt.ylabel('Total Number of Ratings')
plt.title('Distribution of Ratings per Group (unsorted)')
plt.grid(axis='y')
plt.savefig(os.path.join(RESULTS_DIR, 'ratings_per_group_unsorted.png'))
# Close the figure so it is only saved to disk, not rendered inline.
plt.close()
print("Plot saved to results/ratings_per_group_unsorted.png")


Total ratings per group (unsorted):
group
G1           0
G2           0
G3           0
G4          14
G5           6
G6         161
G7        1008
G8       15847
G9      343083
G10    1789536
Name: n_i, dtype: int64
Plot saved to results/ratings_per_group_unsorted.png


  group_ratings = item_data.groupby('group')['n_i'].sum()


In [20]:
# Rating-count cut-offs for the three target-user bands, expressed as
# 2%, 5% and 10% of the number of distinct items.
n_items = df['item'].nunique()
t1, t2, t3 = 0.02 * n_items, 0.05 * n_items, 0.10 * n_items

print(f"\nTotal items: {n_items}")

# Split users into bands by how many ratings they have:
#   U1: n_u <= t1      U2: t1 < n_u <= t2      U3: t2 < n_u <= t3
u1_candidates = user_counts[user_counts.le(t1)]
u2_candidates = user_counts[user_counts.gt(t1) & user_counts.le(t2)]
u3_candidates = user_counts[user_counts.gt(t2) & user_counts.le(t3)]

# Seed NumPy's global RNG so any draw that relies on it is reproducible.
np.random.seed(42)

def get_random_user(candidates, label):
    """Pick one entry from ``candidates`` with a fixed seed and report it.

    Args:
        candidates: pandas Series whose index holds the user ids and whose
            values are printed as that user's rating count.
        label: Short tag (e.g. "U1") used in the printed message.

    Returns:
        The index label of the sampled row, or None when ``candidates`` is empty.
    """
    # Guard clause: nothing to sample from.
    if candidates.empty:
        print(f"No candidates for {label}")
        return None

    # random_state=42 makes the draw repeatable for the same input Series.
    drawn = candidates.sample(n=1, random_state=42)
    user = drawn.index[0]
    print(f"Selected {label}: {user} (Ratings: {candidates[user]})")
    return user

# Draw one target user from each rating-count band (None when a band is empty).
u1 = get_random_user(u1_candidates, "U1")
u2 = get_random_user(u2_candidates, "U2")
u3 = get_random_user(u3_candidates, "U3")

# 12. Select two target items: items rated by more than 0.5% and at most 1% of users
n_users = df['user'].nunique()
it1 = 0.005 * n_users
it2 = 0.01 * n_users

print(f"\nTotal users: {n_users}")
print(f"Item Thresholds: IT1={it1:.2f}, IT2={it2:.2f}")

# Filter items based on popularity (n_i); item_counts contains n_i for each item.
target_item_candidates = item_counts[(item_counts > it1) & (item_counts <= it2)]

# The messages below state the 0.5-1% band so that they agree with it1 / it2.
if len(target_item_candidates) >= 2:
    # Enough candidates: draw two distinct items with a fixed seed.
    selected_items = target_item_candidates.sample(n=2, random_state=42)
    i1 = selected_items.index[0]
    i2 = selected_items.index[1]
    print("\nSelected Target Items (Popularity 0.5-1% of users):")
    print(f"I1: {i1} (Ratings: {selected_items[i1]})")
    print(f"I2: {i2} (Ratings: {selected_items[i2]})")
elif len(target_item_candidates) == 1:
    # Only one candidate: use it as I1 and leave I2 unset.
    print("Not enough items in the 0.5-1% popularity range to select 2.")
    i1 = target_item_candidates.index[0]
    i2 = None
    print(f"I1: {i1}")
else:
    print("No items found in the 0.5-1% popularity range.")
    i1 = None
    i2 = None

# Save selected targets to a file for reference, one id per line.
# An unset target is written as the literal text "None".
with open(os.path.join(RESULTS_DIR, 'target_users.txt'), 'w') as f:
    f.write(f"{u1}\n")
    f.write(f"{u2}\n")
    f.write(f"{u3}\n")

with open(os.path.join(RESULTS_DIR, 'target_items.txt'), 'w') as f:
    f.write(f"{i1}\n")
    f.write(f"{i2}\n")


Total items: 11123
Selected U1: 134471 (Ratings: 11)
Selected U2: 27768 (Ratings: 293)
Selected U3: 16157 (Ratings: 626)

Total users: 147914
Item Thresholds: IT1=739.57, IT2=1479.14

Selected Target Items (Popularity 1-2% of users):
I1: 780 (Ratings: 1019)
I2: 2185 (Ratings: 1038)


In [21]:
# 13. Count the number of co-rating users and co-rated items
# 14. Determine the threshold beta

print("\nStarting Steps 13 & 14...")

# Lookup tables built once up front:
#   user_items: user -> set of items that user rated
#   item_users: item -> set of users who rated that item
user_items = df.groupby('user')['item'].apply(set).to_dict()
item_users = df.groupby('item')['user'].apply(set).to_dict()

# Skip any target that could not be selected earlier (left as None).
target_users = [u for u in (u1, u2, u3) if u is not None]
target_items_list = [i for i in (i1, i2) if i is not None]

results_13_14 = []

print("\n--- Target Users Analysis ---")
for u_target in target_users:
    target_items_set = user_items.get(u_target, set())
    n_target_ratings = len(target_items_set)
    # Step 14 cut-off: 30% of the target user's own rated items.
    threshold_30 = 0.30 * n_target_ratings

    # Number of items shared with every other user (the target itself is excluded).
    overlap_sizes = [
        len(target_items_set & other_items_set)
        for u_other, other_items_set in user_items.items()
        if u_other != u_target
    ]

    # Step 13: users sharing at least one item with the target.
    no_common_users = sum(size > 0 for size in overlap_sizes)
    # Step 14: users whose overlap reaches the 30% cut-off.
    beta_count = sum(size >= threshold_30 for size in overlap_sizes)

    print(f"User {u_target}: Ratings={n_target_ratings}, No_common_users={no_common_users}, Beta (>=30% overlap)={beta_count}")
    results_13_14.append({'Type': 'User', 'ID': u_target, 'Count': no_common_users, 'Beta': beta_count})

print("\n--- Target Items Analysis ---")
for i_target in target_items_list:
    target_users_set = item_users.get(i_target, set())

    # Step 13: other items that share at least one rater with the target item.
    no_corated_items = sum(
        1
        for i_other, other_users_set in item_users.items()
        if i_other != i_target and len(target_users_set & other_users_set) > 0
    )

    print(f"Item {i_target}: No_coRated_items={no_corated_items}")
    results_13_14.append({'Type': 'Item', 'ID': i_target, 'Count': no_corated_items, 'Beta': 'N/A'})

# Save results: one row per target user / target item.
pd.DataFrame(results_13_14).to_csv(os.path.join(RESULTS_DIR, 'results_13_14.csv'), index=False)


Starting Steps 13 & 14...

--- Target Users Analysis ---
User 134471: Ratings=11, No_common_users=9747, Beta (>=30% overlap)=15
User 27768: Ratings=293, No_common_users=62863, Beta (>=30% overlap)=0
User 16157: Ratings=626, No_common_users=77177, Beta (>=30% overlap)=0

--- Target Items Analysis ---
Item 780: No_coRated_items=7733
Item 2185: No_coRated_items=7379
