# AIE425 Intelligent Recommender Systems - Course Project
### 3.2. Part 1: PCA Method with Mean-Filling


**Name:** Menna Salem Elsayed

**ID:** 221101277

In [1]:
import pandas as pd
import numpy as np
# Load the raw ratings table used by every later cell.
# NOTE(review): the path is absolute and machine-specific; adjust it when running elsewhere.
ratings_df = pd.read_csv(r"D:\project IRS\ratings.csv")

In [2]:
# How many ratings each user gave and each movie received
user_counts = ratings_df['userId'].value_counts()
item_counts = ratings_df['movieId'].value_counts()

# Quality threshold: keep only movies/users with at least 20 ratings
valid_items = item_counts.loc[lambda counts: counts >= 20].index.tolist()
valid_users = user_counts.loc[lambda counts: counts >= 20].index.tolist()

print(f"Valid items (≥20 ratings): {len(valid_items):,}")
print(f"Valid users (≥20 ratings): {len(valid_users):,}")

# Seeded random sample, sized so the filtered data keeps >100K ratings.
# Items are drawn before users; that order is part of the seeded result.
np.random.seed(42)
N_ITEMS = 800
N_USERS = 15000

n_items_to_draw = min(N_ITEMS, len(valid_items))
sample_items = set(np.random.choice(valid_items, size=n_items_to_draw, replace=False))
# Force both target movies into the sample regardless of the random draw
sample_items.update((2, 8860))

n_users_to_draw = min(N_USERS, len(valid_users))
sample_users = set(np.random.choice(valid_users, size=n_users_to_draw, replace=False))

# Keep a rating only when both its user and its movie were sampled
user_in_sample = ratings_df['userId'].isin(sample_users)
item_in_sample = ratings_df['movieId'].isin(sample_items)
filtered_df = ratings_df[user_in_sample & item_in_sample]

n_ratings = len(filtered_df)
n_users = filtered_df['userId'].nunique()
n_items = filtered_df['movieId'].nunique()

# Report the sample size against the project's minimum requirements
rule = "="*60
print("\n" + rule)
print("FILTERED DATASET (Random Sample)")
print(rule)
print(f"Ratings: {n_ratings:,} (>100K) {'✓' if n_ratings > 100000 else '✗'}")
print(f"Users: {n_users:,} (>10K) {'✓' if n_users > 10000 else '✗'}")
print(f"Items: {n_items:,} (≥500) {'✓' if n_items >= 500 else '✗'}")

Valid items (≥20 ratings): 13,132
Valid users (≥20 ratings): 138,493

FILTERED DATASET (Random Sample)
Ratings: 142,888 (>100K) ✓
Users: 14,201 (>10K) ✓
Items: 800 (≥500) ✓


## Define Target Items (I1 and I2)

In [3]:
# Target movies: I1 is the popular one, I2 the less popular one
I1, I2 = 2, 8860
target_items = [I1, I2]

banner = "="*60
print(banner)
print("TARGET ITEM SELECTION")
print(banner)

# Popularity of both targets in the original dataset
ratings_original = pd.read_csv(r'D:\project IRS\section1_statistical_analysis\ratings_statistics.csv')
all_counts = ratings_original['movieId'].value_counts()
I1_count = int((ratings_original['movieId'] == I1).sum())
I2_count = int((ratings_original['movieId'] == I2).sum())
# Percentage of movies that have strictly fewer ratings than the target
I1_pct = all_counts.lt(I1_count).mean() * 100
I2_pct = all_counts.lt(I2_count).mean() * 100

print(f"\nI1 (movieId={I1}): {I1_count:,} ratings ({I1_pct:.1f}th percentile) - POPULAR")
print(f"I2 (movieId={I2}): {I2_count:,} ratings ({I2_pct:.1f}th percentile) - LESS POPULAR")


TARGET ITEM SELECTION

I1 (movieId=2): 22,243 ratings (99.5th percentile) - POPULAR
I2 (movieId=8860): 1,306 ratings (90.0th percentile) - LESS POPULAR


In [4]:
# Confirm that both target movies are present in the filtered sample
movie_ids_present = filtered_df['movieId'].values
print(f"I1 ({I1}) in data: {'✓' if I1 in movie_ids_present else '✗'}")
print(f"I2 ({I2}) in data: {'✓' if I2 in movie_ids_present else '✗'}")

I1 (2) in data: ✓
I2 (8860) in data: ✓


In [5]:
# Working aliases used by the remaining cells
df = filtered_df
I1 = 2
I2 = 8860
target_items = [I1, I2]

# R: users x {I1, I2} rating matrix.
# NOTE(review): it is pivoted from target-item rows only, so it contains just the users who
# rated at least one target; users who rated neither do not appear in it at all.
is_target_row = df['movieId'].isin(target_items)
R = df.loc[is_target_row].pivot_table(index='userId', columns='movieId', values='rating')

# R_all: users x all sampled movies, NaN where a user has no rating for a movie
R_all = df.pivot_table(index='userId', columns='movieId', values='rating')

In [6]:
# NaN cells per target column of R = users in R without a rating for that movie
missing_I1, missing_I2 = (R[item].isna().sum() for item in (I1, I2))

print("Number of Missing Ratings:")
print(f"I1 ({I1}) Missing Ratings = {missing_I1}")
print(f"I2 ({I2}) Missing Ratings = {missing_I2}")

Number of Missing Ratings:
I1 (2) Missing Ratings = 59
I2 (8860) Missing Ratings = 2278


In [7]:
print("Number of Ratings for Each Target Item")

# Label targets by their position (I1, I2) and show the movieId in parentheses.
# Using the movieId itself as the label printed "I2" for movieId 2, which is actually I1.
for position, item in enumerate(target_items, start=1):
    num_ratings = df.loc[df["movieId"] == item, "rating"].count()
    print(f"I{position} ({item}): Number of Ratings = {num_ratings}")

Number of Ratings for Each Target Item
I2: Number of Ratings = 2350
I8860: Number of Ratings = 131


In [8]:
# Non-NaN cells per target column = number of users in R who rated that movie
users_I1 = R[I1].count()
users_I2 = R[I2].count()

print("Number of Users Who Rated Each Target Item:")
print(f"I1 ({I1}) rated by {users_I1} users")
print(f"I2 ({I2}) rated by {users_I2} users")

Number of Users Who Rated Each Target Item:
I1 (2) rated by 2350 users
I2 (8860) rated by 131 users


In [9]:
# User ids (rows of R) that lack a rating for each target; reused later to pick predictions
users_missing_I1 = R.index[R[I1].isna().to_numpy()].tolist()
users_missing_I2 = R.index[R[I2].isna().to_numpy()].tolist()

print("Users with Missing Ratings:")
for label, item, user_ids in (("I1", I1, users_missing_I1), ("I2", I2, users_missing_I2)):
    print(f"\n{label} ({item}) - Users with missing ratings:")
    print(user_ids)

Users with Missing Ratings:

I1 (2) - Users with missing ratings:
[2274, 7036, 8110, 8636, 13757, 14196, 14342, 16270, 17877, 19574, 19829, 22483, 22727, 28479, 29020, 34750, 35071, 35128, 35856, 38702, 39579, 40304, 49666, 50079, 53074, 55798, 56358, 61162, 62218, 68021, 68155, 69290, 76688, 81186, 81958, 84239, 84410, 88616, 89583, 90998, 93473, 93512, 94536, 94956, 97957, 100199, 101167, 108206, 109173, 110016, 118625, 120427, 123935, 124487, 130561, 132643, 134154, 134637, 138289]

I2 (8860) - Users with missing ratings:
[34, 170, 326, 370, 444, 457, 518, 554, 586, 648, 684, 694, 703, 725, 737, 789, 812, 847, 1136, 1181, 1255, 1302, 1313, 1339, 1374, 1516, 1525, 1568, 1579, 1643, 1672, 1743, 1825, 1887, 1927, 1959, 2155, 2306, 2487, 2671, 2676, 2705, 2755, 2807, 2839, 2875, 2925, 3170, 3192, 3264, 3318, 3348, 3352, 3405, 3450, 3457, 3483, 3596, 3639, 3646, 3663, 3792, 3935, 3948, 4083, 4172, 4229, 4244, 4392, 4410, 4418, 4525, 4528, 4532, 4533, 4549, 4574, 4577, 4581, 4600, 4605, 4

1. Mean rating for target items (I1, I2)

In [10]:
# Rebuild the users x target-items rating matrix from the target-item rows only
target_rows = df.loc[df["movieId"].isin(target_items)]
R = target_rows.pivot_table(index="userId", columns="movieId", values="rating")

print("\n User–Item Rating Matrix (R)")
print(R.head())


 User–Item Rating Matrix (R)
movieId  2     8860
userId             
34        3.0   NaN
124       2.0   3.0
170       3.0   NaN
326       3.0   NaN
370       4.0   NaN


In [11]:
# Column-wise mean of R: the average observed rating of each target item (NaN cells are skipped)
item_means_target = R.mean()

print("\n mean Rating for Each Target Item")
print(item_means_target)


 mean Rating for Each Target Item
movieId
2       3.213404
8860    3.152672
dtype: float64


2. Mean-filling missing ratings

In [12]:
# Replace every missing rating with the mean of its own item (column)
Rf = R.fillna(value=item_means_target)

print("\nRatings AFTER Mean-Filling (Rf)")
print(Rf.head())

# Sanity check: no NaN should remain in either column
remaining_nans = Rf.isna().sum()
print("\nMissing values after mean-filling:")
print(remaining_nans)


Ratings AFTER Mean-Filling (Rf)
movieId  2         8860
userId                 
34        3.0  3.152672
124       2.0  3.000000
170       3.0  3.152672
326       3.0  3.152672
370       4.0  3.152672

Missing values after mean-filling:
movieId
2       0
8860    0
dtype: int64


### 3. Calculate the average rating for each item

In [13]:
# Average observed rating of every movie in the sampled matrix (NaN cells are skipped)
item_means_all = R_all.mean()

print("Step 3: Average Rating for Each Item (Top 5):")
print(item_means_all.head())

Step 3: Average Rating for Each Item (Top 5):
movieId
2      3.213404
34     3.622133
58     3.958824
75     2.500000
106    3.227273
dtype: float64


4. Centering (Difference from Item Mean)

In [14]:
# Centered value = rating - mean of that item; mean-filled cells therefore become 0
X_centered = Rf.sub(item_means_target, axis="columns")

print("\n Centered Rating Matrix")
print(X_centered.head().round(2))



 Centered Rating Matrix
movieId  2     8860
userId             
34      -0.21  0.00
124     -1.21 -0.15
170     -0.21  0.00
326     -0.21  0.00
370      0.79  0.00


In [15]:
# A. Center the two target columns on their own item means
X_centered = Rf.sub(item_means_target, axis="columns")

# B. Mean-fill and center the full user x item matrix (input to the covariance in Step 5)
Rf_all = R_all.fillna(value=item_means_all)
X_centered_all = Rf_all.sub(item_means_all, axis="columns")

print("Step 4: Centering completed.")
print("X_centered_all shape:", X_centered_all.shape)
print("Sample centered ratings (I1, I2):")
print(X_centered.head().round(2))

Step 4: Centering completed.
X_centered_all shape: (14201, 800)
Sample centered ratings (I1, I2):
movieId  2     8860
userId             
34      -0.21  0.00
124     -1.21 -0.15
170     -0.21  0.00
326     -0.21  0.00
370      0.79  0.00


### 5. Compute the covariance for each two items

In [16]:
# Number of users = number of rows in the centered matrix
N_users = len(X_centered_all)

# Item-item covariance: (X^T X) / (N - 1), with X already centered per item
cov_matrix_all = X_centered_all.T.dot(X_centered_all) / (N_users - 1)


### 6. Generate the covariance matrix

In [17]:
# Top-left 5x5 corner of the full item-item covariance matrix
print("Step 6: Covariance Matrix (Sample columns):")
print(cov_matrix_all.iloc[:5, :5])
print("Covariance Matrix Shape:", cov_matrix_all.shape)

# 2x2 block holding the variances of I1 and I2 and their mutual covariance
cov_targets = cov_matrix_all.loc[[I1, I2], [I1, I2]]
print("\nCOVARIANCE BETWEEN I1 AND I2:")
print(cov_targets)
print("Sub-matrix Shape:", cov_targets.shape)

Step 6: Covariance Matrix (Sample columns):
movieId       2         34        58        75        106
movieId                                                  
2        0.147763  0.027919  0.004555  0.001036  0.000429
34       0.027919  0.298167  0.007175 -0.000425  0.000527
58       0.004555  0.007175  0.085809  0.000655  0.000680
75       0.001036 -0.000425  0.000655  0.007535  0.000000
106      0.000429  0.000527  0.000680  0.000000  0.003006
Covariance Matrix Shape: (800, 800)

COVARIANCE BETWEEN I1 AND I2:
movieId      2         8860
movieId                    
2        0.147763  0.001742
8860     0.001742  0.007567
Sub-matrix Shape: (2, 2)


### 7. Determine the top 5-peers and top 10-peers for each of the target items (I1 and I2)

In [18]:
def get_top_peers(cov_matrix, target_item, top_k):
    """Return the `top_k` items with the largest covariance to `target_item`.

    Args:
        cov_matrix: square item-by-item DataFrame of covariances.
        target_item: column label of the item whose peers are wanted.
        top_k: number of peers to return.

    Returns:
        Series of covariances indexed by peer item id, sorted in descending order.
        The target item itself is excluded.
    """
    # `drop` returns a new Series, so the covariance matrix is never modified
    peer_covariances = cov_matrix[target_item].drop(target_item)
    return peer_covariances.sort_values(ascending=False).head(top_k)

# Peer lists (largest covariance first) for both targets at k = 5 and k = 10
top5_I1, top10_I1 = (get_top_peers(cov_matrix_all, I1, k) for k in (5, 10))
top5_I2, top10_I2 = (get_top_peers(cov_matrix_all, I2, k) for k in (5, 10))

print(f"\nTop 5 Peers for I1 (movieId={I1}): {top5_I1.index.tolist()}")
print(f"Top 10 Peers for I1 (movieId={I1}): {top10_I1.index.tolist()}")
print(f"\nTop 5 Peers for I2 (movieId={I2}): {top5_I2.index.tolist()}")
print(f"Top 10 Peers for I2 (movieId={I2}): {top10_I2.index.tolist()}")


Top 5 Peers for I1 (movieId=2): [586, 34, 788, 1097, 410]
Top 10 Peers for I1 (movieId=2): [586, 34, 788, 1097, 410, 2571, 256, 2028, 919, 2424]

Top 5 Peers for I2 (movieId=8860): [2, 2710, 1097, 3175, 2571]
Top 10 Peers for I2 (movieId=8860): [2, 2710, 1097, 3175, 2571, 7004, 1784, 256, 2699, 1407]


## PCA for Steps 8-11
Define a function that implements the PCA method by:
1. Selecting a subset of items (target + peers).
2. Performing eigen-decomposition on the sub-covariance matrix.
3. Projecting users into this reduced space and reconstructing the ratings.

In [19]:
import scipy.linalg as linalg

def pca_predict_ratings(X_cent_all, cov_mat_all, item_means_all, target, peers,
                        var_thresh=0.9, rating_bounds=(1, 5)):
    """Predict every user's rating of `target` by PCA reconstruction over target + peers.

    The sub-covariance matrix of `[target] + peers` is eigen-decomposed, the leading
    components that reach `var_thresh` of the total variance are kept, the centered
    ratings are projected onto them and reconstructed, and the target item's mean is
    added back.

    Args:
        X_cent_all: users x items DataFrame of mean-filled, item-centered ratings.
        cov_mat_all: items x items covariance DataFrame (same item labels).
        item_means_all: Series of item means indexed by item label.
        target: label of the item to predict.
        peers: Series whose index holds the peer item labels (values are not used).
        var_thresh: cumulative explained-variance ratio the kept components must reach.
        rating_bounds: (low, high) range predictions are clipped to. The default (1, 5)
            matches the previous hard-coded clip.
            NOTE(review): if the dataset allows ratings below 1 (e.g. a half-star scale),
            pass the real bounds.

    Returns:
        (predictions, n_components): a Series of clipped predictions indexed like
        `X_cent_all`, and the number of principal components that were kept.
    """
    subset = [target] + list(peers.index)
    # Sub-covariance restricted to the target and its peers
    cov_subset = cov_mat_all.loc[subset, subset].values

    # eigh returns ascending eigenvalues; reorder so the largest comes first
    vals, vecs = linalg.eigh(cov_subset)
    order = np.argsort(vals)[::-1]
    vals, vecs = vals[order], vecs[:, order]

    # Smallest number of leading components whose cumulative variance reaches the threshold.
    # np.argmax on an all-False mask returns 0, which used to silently keep a single
    # component when the threshold was never reached; keep every component in that case.
    total_var = vals.sum()
    if total_var > 0:
        cum_var = np.cumsum(vals) / total_var
        reached = np.flatnonzero(cum_var >= var_thresh)
        n_components = int(reached[0]) + 1 if reached.size else len(vals)
    else:
        # Degenerate sub-matrix with no variance: nothing to rank, keep one component
        n_components = 1

    # Projection matrix made of the top n_components eigenvectors
    V_k = vecs[:, :n_components]

    # Project the centered ratings into the reduced space and map them back.
    # When all components are kept this reproduces the (mean-filled) input exactly.
    X_sub = X_cent_all[subset].values
    X_recon = (X_sub @ V_k) @ V_k.T

    # The target is column 0 of `subset`; undo the centering with its item mean
    preds = X_recon[:, 0] + item_means_all[target]

    low, high = rating_bounds
    return pd.Series(preds, index=X_cent_all.index).clip(low, high), n_components

## Eigenvalues and Eigenvectors 
This section shows the eigenvalues and eigenvectors of the sub-covariance matrix for each target item together with its top 5 peers.

In [20]:
def show_eigen_details(target, peers, cov_matrix):
    """Print the eigen-analysis of the sub-covariance matrix of `target` and its peers.

    Prints the eigenvalues (largest first), each component's explained-variance ratio
    with the running cumulative ratio, and the loadings of the first eigenvector.

    Args:
        target: label of the target item.
        peers: Series whose index holds the peer item labels.
        cov_matrix: items x items covariance DataFrame.
    """
    subset = [target, *peers.index]
    sub_cov = cov_matrix.loc[subset, subset].values

    # eigh yields ascending eigenvalues; flip to descending and reorder the vectors to match
    eigvals, eigvecs = np.linalg.eigh(sub_cov)
    order = eigvals.argsort()[::-1]
    eigvals, eigvecs = eigvals[order], eigvecs[:, order]

    print("\n1. Eigenvalues (Explained Variance):")
    for rank, value in enumerate(eigvals, start=1):
        print(f"   Component {rank}: {value:.6f}")

    print("\n2. Explained Variance Ratio:")
    ratio = eigvals / eigvals.sum()
    for rank, share in enumerate(ratio, start=1):
        print(f"   Component {rank}: {share:.2%} (Cumulative: {ratio[:rank].sum():.2%})")

    print("\n3. Top Eigenvector (Principal Component 1) Loadings:")
    # Loadings are labelled with the item ids, target first
    print(pd.Series(eigvecs[:, 0], index=subset))

# Eigen-analysis for each target together with its top-5 peers (I1 first, then I2)
for target, peers in ((I1, top5_I1), (I2, top5_I2)):
    show_eigen_details(target, peers, cov_matrix_all)


1. Eigenvalues (Explained Variance):
   Component 1: 0.355729
   Component 2: 0.232314
   Component 3: 0.192486
   Component 4: 0.152035
   Component 5: 0.130533
   Component 6: 0.118519

2. Explained Variance Ratio:
   Component 1: 30.11% (Cumulative: 30.11%)
   Component 2: 19.66% (Cumulative: 49.77%)
   Component 3: 16.29% (Cumulative: 66.06%)
   Component 4: 12.87% (Cumulative: 78.92%)
   Component 5: 11.05% (Cumulative: 89.97%)
   Component 6: 10.03% (Cumulative: 100.00%)

3. Top Eigenvector (Principal Component 1) Loadings:
2       0.235841
586     0.385058
34      0.739729
788     0.213479
1097    0.428565
410     0.140246
dtype: float64

1. Eigenvalues (Explained Variance):
   Component 1: 0.322597
   Component 2: 0.233192
   Component 3: 0.208336
   Component 4: 0.142887
   Component 5: 0.098059
   Component 6: 0.007522

2. Explained Variance Ratio:
   Component 1: 31.86% (Cumulative: 31.86%)
   Component 2: 23.03% (Cumulative: 54.89%)
   Component 3: 20.57% (Cumulative: 75.4

### Steps 8 & 9: Reduced Space and Predictions using TOP 5-PEERS

In [21]:
print("Computing PCA predictions using Top 5 peers...")

# Each call returns (predictions for every user, number of components kept)
pred_I1_5, k1_5 = pca_predict_ratings(
    X_centered_all, cov_matrix_all, item_means_all, I1, top5_I1
)
pred_I2_5, k2_5 = pca_predict_ratings(
    X_centered_all, cov_matrix_all, item_means_all, I2, top5_I2
)

print(f"I1 (5 peers): {k1_5} components kept.")
print(f"I2 (5 peers): {k2_5} components kept.")

# Preview: predictions for the first 10 users that have no rating for each target
first_missing_I1 = users_missing_I1[:10]
print("\nSample Predictions for users missing I1 (first 10):")
print(pred_I1_5.loc[first_missing_I1])

first_missing_I2 = users_missing_I2[:10]
print("\nSample Predictions for users missing I2 (first 10):")
print(pred_I2_5.loc[first_missing_I2])

Computing PCA predictions using Top 5 peers...
I1 (5 peers): 6 components kept.
I2 (5 peers): 5 components kept.

Sample Predictions for users missing I1 (first 10):
userId
2274     3.213404
7036     3.213404
8110     3.213404
8636     3.213404
13757    3.213404
14196    3.213404
14342    3.213404
16270    3.213404
17877    3.213404
19574    3.213404
dtype: float64

Sample Predictions for users missing I2 (first 10):
userId
34     3.150401
170    3.154017
326    3.150401
370    3.163050
444    3.150401
457    3.152441
518    3.161041
554    3.148078
586    3.147128
648    3.139863
dtype: float64


## Eigenvalues and Eigenvectors 
This section shows the eigenvalues and eigenvectors of the sub-covariance matrix for each target item together with its top 10 peers.

In [22]:
# show_eigen_details is already defined in the top-5 eigen-analysis cell above; reuse it here
# instead of re-defining an identical copy (a second definition silently shadows the first).

# Eigen-analysis for each target together with its top-10 peers
show_eigen_details(I1, top10_I1, cov_matrix_all)
show_eigen_details(I2, top10_I2, cov_matrix_all)


1. Eigenvalues (Explained Variance):
   Component 1: 0.411854
   Component 2: 0.304454
   Component 3: 0.231525
   Component 4: 0.208348
   Component 5: 0.192384
   Component 6: 0.152848
   Component 7: 0.148247
   Component 8: 0.130934
   Component 9: 0.118524
   Component 10: 0.093400
   Component 11: 0.077523

2. Explained Variance Ratio:
   Component 1: 19.90% (Cumulative: 19.90%)
   Component 2: 14.71% (Cumulative: 34.60%)
   Component 3: 11.18% (Cumulative: 45.79%)
   Component 4: 10.06% (Cumulative: 55.85%)
   Component 5: 9.29% (Cumulative: 65.15%)
   Component 6: 7.38% (Cumulative: 72.53%)
   Component 7: 7.16% (Cumulative: 79.69%)
   Component 8: 6.33% (Cumulative: 86.02%)
   Component 9: 5.73% (Cumulative: 91.74%)
   Component 10: 4.51% (Cumulative: 96.25%)
   Component 11: 3.75% (Cumulative: 100.00%)

3. Top Eigenvector (Principal Component 1) Loadings:
2       0.215705
586     0.340738
34      0.514688
788     0.198879
1097    0.414239
410     0.114734
2571    0.414362
25

### Steps 10 & 11: Reduced Space and Predictions using TOP 10-PEERS

In [23]:
print("Computing PCA predictions using Top 10 peers...")

# Each call returns (predictions for every user, number of components kept)
pred_I1_10, k1_10 = pca_predict_ratings(
    X_centered_all, cov_matrix_all, item_means_all, I1, top10_I1
)
pred_I2_10, k2_10 = pca_predict_ratings(
    X_centered_all, cov_matrix_all, item_means_all, I2, top10_I2
)

print(f"I1 (10 peers): {k1_10} components kept.")
print(f"I2 (10 peers): {k2_10} components kept.")

# Preview: predictions for the first 10 users that have no rating for each target
first_missing_I1 = users_missing_I1[:10]
print("\nSample Predictions for users missing I1 (first 10):")
print(pred_I1_10.loc[first_missing_I1])

first_missing_I2 = users_missing_I2[:10]
print("\nSample Predictions for users missing I2 (first 10):")
print(pred_I2_10.loc[first_missing_I2])

Computing PCA predictions using Top 10 peers...
I1 (10 peers): 9 components kept.
I2 (10 peers): 8 components kept.

Sample Predictions for users missing I1 (first 10):
userId
2274     3.201686
7036     2.881635
8110     3.213404
8636     3.255219
13757    3.241231
14196    2.753694
14342    3.174535
16270    3.620249
17877    3.210064
19574    3.295187
dtype: float64

Sample Predictions for users missing I2 (first 10):
userId
34     3.150415
170    3.152664
326    3.150415
370    3.162338
444    3.150415
457    3.152759
518    3.160988
554    3.170800
586    3.139345
648    3.142039
dtype: float64


### 12. Compare results of Point 9 with Point 11

In [24]:
import pandas as pd

def summarize_preds(p5, p10, missing_idx):
    """Summarize top-5 and top-10 predictions for the users in `missing_idx`.

    Args:
        p5: Series of predictions made with the top-5 peers, indexed by user id.
        p10: Series of predictions made with the top-10 peers, indexed by user id.
        missing_idx: user ids to select from both Series.

    Returns:
        DataFrame with a single 'Stats' column and one row each for the mean and
        standard deviation of the 5-peer and 10-peer predictions.
    """
    subset_5 = p5.loc[missing_idx]
    subset_10 = p10.loc[missing_idx]
    stats = {
        'Mean (5-Peers)': subset_5.mean(),
        'Std (5-Peers)': subset_5.std(),
        'Mean (10-Peers)': subset_10.mean(),
        'Std (10-Peers)': subset_10.std(),
    }
    # One row per statistic, one column named 'Stats'
    return pd.Series(stats, name='Stats').to_frame()

# Mean / std of the predictions for users missing each target, 5 peers vs 10 peers
comparison_I1 = summarize_preds(pred_I1_5, pred_I1_10, users_missing_I1)
print("\nComparison for I1 (movieId=2):")
print(comparison_I1)

comparison_I2 = summarize_preds(pred_I2_5, pred_I2_10, users_missing_I2)
print("\nComparison for I2 (movieId=8860):")
print(comparison_I2)



Comparison for I1 (movieId=2):
                        Stats
Mean (5-Peers)   3.213404e+00
Std (5-Peers)    4.440892e-16
Mean (10-Peers)  3.204461e+00
Std (10-Peers)   9.853429e-02

Comparison for I2 (movieId=8860):
                    Stats
Mean (5-Peers)   3.152432
Std (5-Peers)    0.013494
Mean (10-Peers)  3.152475
Std (10-Peers)   0.015677


# Key Findings

- Mean-filling successfully handled missing values.

- The covariance matrix was computed over all 800 items (800 × 800) from the mean-filled, centered ratings.

- PCA components kept important information while reducing dimensions.
- Eigenvalues and eigenvectors were calculated for the sub-covariance matrix of each target item and its peers.

- Using Top-5 peers for prediction: I1 → Mean = 3.2134, Std ≈ 0 (all 6 components were kept, so every prediction equals the mean-filled value); I2 → Mean = 3.1524, Std = 0.0135.

- Using Top-10 peers gave more varied predictions: I1 → Mean = 3.2045, Std = 0.0985; I2 → Mean = 3.1525, Std = 0.0157.