# Cell 1: Imports and Assumptions

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Cell 2: Load Data

In [10]:
def load_data(filename):
    """
    Read a header-less ratings CSV into a pandas DataFrame.

    The file is parsed with pandas' default (comma) delimiter and must
    contain exactly three columns, which are named user_id, movie_id and
    rating in that order. The rating column is cast to int.
    """
    ratings = pd.read_csv(filename, header=None)
    ratings.columns = ['user_id', 'movie_id', 'rating']
    # Only the rating column is converted; the ID columns keep the dtype
    # pandas inferred for them.
    return ratings.astype({'rating': int})


In [3]:
def load_new_data(filename):
    """
    Read a CSV file that has a header row into a pandas DataFrame.

    Column names are taken from the first line of the file; nothing is
    renamed or converted.
    """
    return pd.read_csv(filename)


# Cell 3: Compute Global Stats and Distributions

In [4]:
def compute_global_movie_stats(df):
    """
    Summarise the ratings of every movie.

    Returns a DataFrame indexed by movie_id with two columns:
    global_mean (mean rating of the movie) and freq (number of ratings
    the movie received).
    """
    ratings_by_movie = df.groupby('movie_id')['rating']
    return ratings_by_movie.agg(global_mean='mean', freq='count')

def compute_global_rating_distribution(df):
    """
    Return the share of ratings at each level 1..5 as a length-5 array.

    Element i holds the fraction of all ratings in df that equal i + 1.
    """
    counts = df['rating'].value_counts()
    # The denominator is the number of all counted ratings, including any
    # value that falls outside 1..5.
    n_ratings = counts.sum()
    per_level = counts.reindex(list(range(1, 6)), fill_value=0)
    return per_level.to_numpy() / n_ratings


# Cell 4: Build User Profiles

In [5]:
def build_user_profiles(df):
    """
    Group the ratings by user.

    Returns a dict mapping each user_id to a dict of {movie_id: rating}.
    When a user has several rows for the same movie, the last row wins.
    """
    profiles = {}
    triples = zip(df['user_id'], df['movie_id'], df['rating'])
    for user, movie, rating in triples:
        profiles.setdefault(user, {})[movie] = rating
    return profiles


# Cell 5: Feature Calculation Functions

In [6]:
def calculate_RDAM(user_profile, global_stats):
    """
    RDAM: average absolute gap between the user's ratings and the
    corresponding movies' global mean ratings.

    Movies missing from global_stats are ignored; NaN is returned when
    none of the user's movies is present there.
    """
    known_movies = global_stats.index
    deviations = [
        abs(rating - global_stats.loc[movie, 'global_mean'])
        for movie, rating in user_profile.items()
        if movie in known_movies
    ]
    if not deviations:
        return np.nan
    return np.mean(deviations)

def calculate_WDA(user_profile, global_stats):
    """
    WDA: frequency-weighted average of the absolute gaps between the
    user's ratings and the movies' global mean ratings.

    Each gap is weighted by the movie's 'freq' value. Movies missing from
    global_stats are ignored; NaN is returned when none is present.
    """
    known_movies = global_stats.index
    gap_and_weight = [
        (abs(rating - global_stats.loc[movie, 'global_mean']),
         global_stats.loc[movie, 'freq'])
        for movie, rating in user_profile.items()
        if movie in known_movies
    ]
    if not gap_and_weight:
        return np.nan
    numerator = sum(gap * weight for gap, weight in gap_and_weight)
    denominator = sum(weight for _, weight in gap_and_weight)
    return numerator / denominator

def calculate_cosine_similarity(user_profile, global_stats, all_movies):
    """
    Cosine of the angle between the user's rating vector and the vector
    of global mean ratings, both laid out in the order of all_movies.

    A movie the user did not rate contributes 0 to the user vector and a
    movie missing from global_stats contributes 0 to the global vector.
    NaN is returned when either vector has zero length.
    """
    known_movies = global_stats.index
    global_vec = np.array([
        global_stats.loc[movie, 'global_mean'] if movie in known_movies else 0
        for movie in all_movies
    ])
    user_vec = np.array([user_profile.get(movie, 0) for movie in all_movies])

    user_norm = np.linalg.norm(user_vec)
    global_norm = np.linalg.norm(global_vec)
    if user_norm == 0 or global_norm == 0:
        return np.nan
    return np.dot(user_vec, global_vec) / (user_norm * global_norm)

def calculate_LenVar(user_profile):
    """
    LenVar: population variance (np.var) of the ratings in the profile.

    Profiles with fewer than two ratings yield 0.0.
    """
    if len(user_profile) <= 1:
        return 0.0
    return np.var(list(user_profile.values()))


# Cell 6: TF-IDF Calculation

In [7]:
def calculate_TFIDF(user_profiles, all_movies):
    """
    Mean TF-IDF weight per user.

    Every user is treated as a 'document' and every movie as a 'term'.
    The term counts are binary (1 if the user rated the movie) and are
    weighted with a default-configured TfidfTransformer. Returns a dict
    mapping user_id to the mean of that user's row of the TF-IDF matrix.
    """
    column_of = {movie: col for col, movie in enumerate(all_movies)}
    ordered_users = sorted(user_profiles.keys())

    # Dense users x movies indicator matrix; the rating value itself is
    # not used, only whether the movie was rated.
    indicator = np.zeros((len(ordered_users), len(all_movies)))
    for row, user in enumerate(ordered_users):
        for movie in user_profiles[user]:
            indicator[row, column_of[movie]] = 1

    weights = TfidfTransformer().fit_transform(indicator)
    # Row-wise mean over every movie column; .A1 flattens the resulting
    # column matrix into a 1-D array aligned with ordered_users.
    per_user_mean = weights.mean(axis=1).A1

    return dict(zip(ordered_users, per_user_mean))


# Cell 7: RDMA Similarity and RDMA_LenVar

In [8]:
def calculate_RDMA_similarity(user_profile, global_dist):
    """
    RDMA similarity: one minus the mean absolute difference between the
    user's rating distribution over the levels 1..5 and global_dist.

    Returns NaN for an empty profile.
    """
    n_rated = len(user_profile)
    if n_rated == 0:
        return np.nan
    given = list(user_profile.values())
    level_counts = [
        sum(1 for value in given if value == level)
        for level in range(1, 6)
    ]
    user_dist = np.array(level_counts) / n_rated
    return 1 - np.mean(np.abs(user_dist - global_dist))

def calculate_RDMA_LenVar(rdma_similarity, lenvar):
    """
    Combine RDMA similarity and LenVar.
    RDMA_LenVar = RDMA_similarity * (1/(1+lenvar))

    Returns NaN when rdma_similarity is missing, i.e. None or any NaN
    value (np.nan, float('nan'), a NumPy float NaN, ...).
    """
    # Test for NaN by value: an identity comparison against np.nan only
    # matches that one singleton object and misses every other NaN.
    if rdma_similarity is None or np.isnan(rdma_similarity):
        return np.nan
    return rdma_similarity * (1/(1+lenvar))


# Cell 8: Degree of Similarity

In [9]:
def calculate_degree_of_similarity(user_profile, global_stats):
    """
    Degree of similarity: Pearson correlation between two binary patterns
    over the movies the user rated.

    * user pattern   - 1 where the user's rating exceeds the movie's
      global mean rating, else 0;
    * global pattern - 1 where the movie's global mean exceeds the average
      of all movie means, else 0.

    Movies missing from global_stats use the average of all movie means.
    Returns NaN for an empty profile and 0.0 when the correlation is
    undefined (a single movie, or either pattern is constant).
    """
    grand_mean = global_stats['global_mean'].mean()

    if not user_profile:
        return np.nan

    known_movies = global_stats.index
    rating_list = []
    mean_list = []
    for movie, rating in user_profile.items():
        rating_list.append(rating)
        if movie in known_movies:
            mean_list.append(global_stats.loc[movie, 'global_mean'])
        else:
            mean_list.append(grand_mean)
    ratings = np.array(rating_list)
    means = np.array(mean_list)

    user_side = (ratings > means).astype(int)
    global_side = (means > grand_mean).astype(int)

    if len(user_side) <= 1:
        return 0.0
    if not (np.std(user_side) > 0) or not (np.std(global_side) > 0):
        return 0.0
    return np.corrcoef(user_side, global_side)[0, 1]


# Cell 9: Main Execution 

Real User: Authenticity -> 1
Fake User: Authenticity -> 0

In [17]:
# Alternative inputs (uncomment one to switch dataset):
# filename = 'only_rating_attack_data.csv'  # Ensure this file is in the working directory or provide a full path
# filename = './data/tool_home.csv'  
filename = './data/grocery_food.csv'  
df = load_data(filename)
# Shift both ID columns up by one (presumably from 0-based to 1-based IDs;
# verify against the input file).
for id_column in ('user_id', 'movie_id'):
    df[id_column] = df[id_column] + 1
print(df.head())

   user_id  movie_id  rating
0        1      7988       3
1        1      5497       5
2        1      7342       3
3        1      5993       5
4        1      8321       3


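Feature glossary (as computed in this notebook):
RDAM: Rating Deviation from Average Mean - mean absolute difference between a user's ratings and each movie's global mean rating
WDA: Weighted Degree of Agreement (described in the code as "weighted deviation from average") - the same absolute differences, weighted by the number of ratings each movie received
cosine_similarity: Cosine similarity between the user's rating vector and the vector of global movie means
LenVar: Length Variance - computed here as the variance of the user's ratings
TF-IDF: Mean TF-IDF weight of the user's row, treating users as documents and movies as terms
RDMA_similarity: Rating Deviation Moving Average Similarity - computed here as 1 minus the mean absolute difference between the user's rating distribution (ratings 1-5) and the global rating distribution
RDMA_LenVar: Rating Deviation Moving Average Length Variance - RDMA_similarity * 1 / (1 + LenVar)
degree_of_similarity: Degree of Similarity - correlation between "user rated the movie above its global mean" and "the movie's global mean is above the average of all movie means"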

In [12]:
# Adjust the filename if needed
# filename = '../data/ml100k/real_data/real_data.csv'  # Ensure this file is in the working directory or provide a full path

# Dataset-wide inputs shared by every per-user feature.
user_profiles = build_user_profiles(df)
all_movies = sorted(df['movie_id'].unique())
global_stats = compute_global_movie_stats(df)
global_dist = compute_global_rating_distribution(df)

# Precompute TF-IDF
tfidf_dict = calculate_TFIDF(user_profiles, all_movies)

# Output column order of the feature table.
feature_columns = (
    'user_id',
    'RDAM',
    'WDA',
    'cosine_similarity',
    'LenVar',
    'TF-IDF',
    'RDMA_similarity',
    'RDMA_LenVar',
    'degree_of_similarity',
)

user_features = []
for u, profile in user_profiles.items():
    # RDMA_LenVar is derived from LenVar and RDMA similarity, so those two
    # are computed first.
    lenvar = calculate_LenVar(profile)
    rdma_sim = calculate_RDMA_similarity(profile, global_dist)
    rdma_lenvar = calculate_RDMA_LenVar(rdma_sim, lenvar)

    rdam = calculate_RDAM(profile, global_stats)
    wda = calculate_WDA(profile, global_stats)
    cosim = calculate_cosine_similarity(profile, global_stats, all_movies)
    deg_sim = calculate_degree_of_similarity(profile, global_stats)
    user_tfidf = tfidf_dict[u]

    values = (u, rdam, wda, cosim, lenvar, user_tfidf, rdma_sim, rdma_lenvar, deg_sim)
    user_features.append(dict(zip(feature_columns, values)))

features_df = pd.DataFrame(user_features)
# Label convention: user IDs up to 1777 are real users (1), higher IDs are
# fake users (0).
features_df['authenticity'] = features_df['user_id'].map(lambda uid: int(uid <= 1777))
# features_df = features_df.sample(frac=1, random_state=42).reset_index(drop=True)



In [13]:
# Swap every attack user's ID with the ID of a randomly chosen normal user.
# Only the user_id column is rewritten; each row keeps its features and its
# authenticity label.

# Step 1: Remember the original IDs so the swap can be verified afterwards
original_user_ids = features_df['user_id'].unique()

# Step 2: Get attack and normal IDs
attack_ids = features_df[features_df['user_id'] > 1777]['user_id'].values
normal_ids = features_df[features_df['user_id'] <= 1777]['user_id'].values

# Step 3: Randomly select normal IDs to swap with.
# A seeded generator makes the swap reproducible from run to run, in line
# with the fixed random_state=42 used for the shuffle below. Sampling without
# replacement raises ValueError if there are more attack IDs than normal IDs.
rng = np.random.default_rng(42)
selected_normal_ids = rng.choice(normal_ids, size=len(attack_ids), replace=False)

# Step 4: Create a mapping dictionary for the swaps (both directions, so the
# mapping is a permutation and no ID is duplicated or lost)
swap_dict = dict(zip(attack_ids, selected_normal_ids))
swap_dict.update(dict(zip(selected_normal_ids, attack_ids)))

# Step 5: Apply the swaps using the mapping; IDs not in the mapping stay as-is
features_df['user_id'] = features_df['user_id'].map(lambda x: swap_dict.get(x, x))
features_df = features_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify the number of unique user IDs
print(f"Number of unique user IDs: {features_df['user_id'].nunique()}")
print(f"Original unique IDs count: {len(original_user_ids)}")

Number of unique user IDs: 1041
Original unique IDs count: 1041


In [14]:
# Write the per-user feature table to CSV (without the DataFrame index) and
# preview its first rows.
# NOTE(review): the ratings above are loaded from './data/grocery_food.csv',
# yet the output file is named after the tool_home dataset - confirm the
# intended filename before reusing this file.
features_df.to_csv('tool_home_attack_data.csv', index=False)
features_df.head()

Unnamed: 0,user_id,RDAM,WDA,cosine_similarity,LenVar,TF-IDF,RDMA_similarity,RDMA_LenVar,degree_of_similarity,authenticity
0,137,0.956893,0.93038,0.056725,0.035665,0.000683,0.835826,0.807042,-0.460977,1
1,629,0.636426,0.755556,0.053842,0.853306,0.000615,0.917378,0.494995,-0.266667,1
2,185,0.659697,0.819444,0.043303,1.555556,0.000508,0.858583,0.335967,0.353553,1
3,32,0.408351,0.384615,0.054864,0.270833,0.000639,0.814348,0.640798,0.043644,1
4,678,0.599834,0.655556,0.049036,0.565097,0.000572,0.912886,0.583278,-0.382047,1


In [15]:
# Read back the feature CSV written by the previous cell to check what
# was saved.
new_file = './tool_home_attack_data.csv'
new_df = load_new_data(new_file)

In [16]:
# Understanding User Column (Column 0)
# Report how many distinct user IDs the reloaded table contains and the
# range they span.
unique_users = new_df['user_id'].unique()
min_value, max_value = unique_users.min(), unique_users.max()
print('Number of unique users:', len(unique_users))
print(f"Minimum value in column 0: {min_value}")
print(f"Maximum value in column 0: {max_value}")

Number of unique users: 1041
Minimum value in column 0: 1
Maximum value in column 0: 1041
