In [None]:

# Colab-specific helper module used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/drive so that the CSV paths under
# /content/drive/MyDrive used by the loading cell resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Top-N Popular Products by Age Group and Product Category

import pandas as pd

# Load data.
# article_id is read as a string in both files that carry it. With default
# dtype inference pandas parses all-digit IDs as integers, which drops any
# leading zeros, so the space-joined 'prediction' strings built further down
# would no longer match the IDs as written in the CSVs. Both sides of the
# article merge must share one dtype, hence the override on each read.
# NOTE(review): assumes article IDs in these files are zero-padded; confirm
# against the raw CSVs. Later cells that expect integer IDs need adjusting.
transactions = pd.read_csv(
    "/content/drive/MyDrive/Final Project/transactions_train.csv",
    dtype={'article_id': str},
)
customers = pd.read_csv("/content/drive/MyDrive/Final Project/customers.csv")
articles = pd.read_csv(
    "/content/drive/MyDrive/Final Project/articles.csv",
    dtype={'article_id': str},
)

# Attach the customer's age and the article's product_group_name /
# index_group_name to every transaction. Left joins keep all transactions,
# leaving NaN where a customer or article has no match.
# (No gender column is taken from `customers`; index_group_name is what the
# later "gender" ranking is computed from.)
data = (
    transactions
    .merge(customers[['customer_id', 'age']], on='customer_id', how='left')
    .merge(articles[['article_id', 'product_group_name', 'index_group_name']], on='article_id', how='left')
)

# Define age bins and labels.
# `date_bins` holds AGE boundaries (the name is kept so existing references
# keep working). With right=True the intervals are
# (0, 17], (17, 35], (35, 50], (50, 100].
# A missing age, or an age outside (0, 100], yields a NaN age_group and is
# therefore excluded from every per-age ranking.
date_bins = [0, 17, 35, 50, 100]
age_labels = ['Under_18', '18_35', '36_50', '51_plus']
data['age_group'] = pd.cut(data['age'], bins=date_bins, labels=age_labels, right=True)

# Function to get top-N articles for any grouping
def get_top_articles(df, group_field, group_value, top_n=12):
    """Return the most frequently occurring article IDs within one group.

    Args:
        df: DataFrame with an 'article_id' column and a `group_field` column.
        group_field: Name of the column that identifies the group.
        group_value: Value of `group_field` selecting the rows to rank.
        top_n: Maximum number of article IDs to return.

    Returns:
        A list of at most `top_n` article_id values, ordered from most to
        least frequent among the selected rows. Empty if no row matches.
    """
    # Boolean mask of the rows that belong to the requested group.
    in_group = df[group_field] == group_value
    # value_counts() orders by descending frequency, so the leading slice of
    # its index is the ranking we want.
    frequency = df.loc[in_group, 'article_id'].value_counts()
    return frequency.index[:top_n].tolist()

# Rank the most-purchased articles inside each age bucket.
top_by_age = {}
for bucket in age_labels:
    top_by_age[bucket] = get_top_articles(data, 'age_group', bucket)

# Rank the most-purchased articles inside each product category
# (rows without a product_group_name are skipped).
product_groups = data['product_group_name'].dropna().unique()
top_by_product = {}
for category in product_groups:
    top_by_product[category] = get_top_articles(data, 'product_group_name', category)

# Rank the most-purchased articles per index_group_name, which this notebook
# uses as its "gender" grouping (rows without a value are skipped).
gender_groups = data['index_group_name'].dropna().unique()
top_by_gender = {}
for segment in gender_groups:
    top_by_gender[segment] = get_top_articles(data, 'index_group_name', segment)

# Flatten the three rankings into one table: one row per (group_type,
# group_value) with the ranked article IDs joined by single spaces.
# The section order fixes the row order in the CSV: age, product, gender.
ranking_sections = (
    ('age_group', top_by_age),
    ('product_group_name', top_by_product),
    ('gender', top_by_gender),
)
records = [
    {
        'group_type': section_name,
        'group_value': member,
        'prediction': ' '.join(str(article) for article in ranked_ids),
    }
    for section_name, ranking in ranking_sections
    for member, ranked_ids in ranking.items()
]

# Write the combined table to the working directory, without the index.
combined_df = pd.DataFrame(records)
combined_df.to_csv('top_n_combined_by_age_product_gender.csv', index=False)