<a href="https://colab.research.google.com/github/kojomensahonums/AI4Startups-datathon/blob/master/afrimash_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import random

In [None]:
# ============================================
# 1. LOAD DATA
# ============================================
# Raw inputs: one row per purchased product, plus the per-customer RFM table.
transactions = pd.read_excel("/content/trans_subset_exploded.xlsx")
rfm = pd.read_excel("/content/RFM_Data.xlsx")

# Normalise product names: remove "<digits>×" quantity markers, trim the ends,
# then squeeze every whitespace run down to a single space.
product_text = transactions["Product(s)"]
product_text = product_text.str.replace(r"\d+×", "", regex=True).str.strip()
product_text = product_text.str.replace(r"\s+", " ", regex=True)
transactions["Product(s)"] = product_text

# Keep only rows that still carry a product name (neither blank nor missing).
has_text = transactions["Product(s)"].astype(str).str.strip() != ""
transactions = transactions[has_text].dropna(subset=["Product(s)"])

In [None]:
# ============================================
# 2. BUILD PRODUCT CONTENT SIMILARITY
# ============================================
# Drop missing product names BEFORE casting to str: astype(str) turns NaN into
# the literal string "nan", which notna() can no longer detect.
# .copy() so the column assignments below write to an independent frame.
transactions = transactions[transactions["Product(s)"].notna()].copy()

# Clean product text: remove "<digits>×" quantity markers, trim, collapse whitespace
transactions["Product(s)"] = transactions["Product(s)"].astype(str)
transactions["Product(s)"] = transactions["Product(s)"].str.replace(r"\d+×", "", regex=True).str.strip()
transactions["Product(s)"] = transactions["Product(s)"].str.replace(r"\s+", " ", regex=True)

# Drop rows whose product name is empty after cleaning
transactions = transactions[transactions["Product(s)"].str.strip() != ""]

# Model: TF-IDF over product names, then pairwise cosine similarity.
# NOTE(review): the similarity matrix is dense with one row/column per
# transaction row, so memory grows quadratically with the number of rows.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(transactions["Product(s)"])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Product lookup: product name -> row POSITION in `cosine_sim` / `transactions.iloc`.
# The DataFrame's index labels cannot be used here: rows were filtered out above,
# so labels no longer line up with positions. Only the first row of each product
# is kept so that a lookup always yields a single position.
product_indices = pd.Series(np.arange(len(transactions)), index=transactions["Product(s)"])
product_indices = product_indices[~product_indices.index.duplicated(keep="first")]

def get_content_recommendations(product_name, top_n=10):
    """Return up to ``top_n`` distinct products most similar to ``product_name``.

    Similarity comes from the module-level ``cosine_sim`` matrix. The value
    found for ``product_name`` in ``product_indices`` is used as the row to read
    from that matrix, and ranked row positions are mapped back to names through
    the ``"Product(s)"`` column of ``transactions``.

    Args:
        product_name: Product name to look up. Anything that is not a
            non-blank string (``None``, NaN, numbers, ``""``) yields ``[]``.
        top_n: Maximum number of names to return; values <= 0 yield ``[]``.

    Returns:
        A list of product names ordered by decreasing similarity. The queried
        product itself and repeated names are never included.
    """
    # Skip invalid product names (also guards non-string input, which has no .strip())
    if not isinstance(product_name, str) or not product_name.strip():
        return []

    # If product not in index, skip
    if product_name not in product_indices:
        return []

    # A name that occurs on several rows yields a Series; use its first entry
    idx = product_indices[product_name]
    if isinstance(idx, (pd.Series, list, np.ndarray)):
        idx = idx.iloc[0] if isinstance(idx, pd.Series) else idx[0]

    # Rank every row by similarity, best first; the stable sort keeps the
    # original row order among ties.
    scores = np.asarray(cosine_sim[idx])
    ranked_positions = np.argsort(-scores, kind="stable")

    # The same product usually appears on many rows, each scoring ~1.0 against
    # itself. Simply dropping the first hit would therefore return the queried
    # product (or one name repeated), so de-duplicate by name instead.
    names = transactions["Product(s)"].to_numpy()
    seen = {product_name}
    recommendations = []
    for position in ranked_positions:
        if len(recommendations) >= top_n:
            break
        candidate = names[position]
        if candidate in seen:
            continue
        seen.add(candidate)
        recommendations.append(candidate)
    return recommendations



In [None]:
# ============================================
# 3. BUILD COLLABORATIVE FILTERING
# ============================================
from collections import defaultdict
from itertools import combinations

# Create product basket per order: one list of product names for each "Order #"
basket = transactions.groupby(['Order #'])['Product(s)'].apply(list).reset_index()

# Build co-occurrence matrix: co_matrix[a][b] = number of orders containing both a and b
co_matrix = defaultdict(lambda: defaultdict(int))

for row in basket['Product(s)']:
    # A product can sit on several rows of the same order. De-duplicate
    # (keeping first-seen order) so it is never paired with itself and each
    # pair is counted once per order.
    unique_products = list(dict.fromkeys(row))
    for first, second in combinations(unique_products, 2):
        co_matrix[first][second] += 1
        co_matrix[second][first] += 1

def get_collab_recommendations(product_name, top_n=10):
    """Return the products most often bought together with ``product_name``.

    Reads the module-level ``co_matrix`` and returns at most ``top_n`` names,
    highest co-occurrence count first. A product with no entry yields ``[]``.
    """
    neighbours = co_matrix.get(product_name, {})
    # Sort the neighbour names by their count, largest first.
    by_count = sorted(neighbours, key=neighbours.get, reverse=True)
    return by_count[:top_n]


In [None]:
# ============================================
# 4. HYBRID RECOMMENDER (Optimized & Personalized)
# ============================================
def _is_valid_product(name):
    """Return True for a non-blank product string that is not the literal 'nan'."""
    return isinstance(name, str) and name.strip().lower() not in ("", "nan")


def _popular_products():
    """Return the valid product names in ``transactions``, most frequent first."""
    counts = transactions["Product(s)"].dropna().value_counts()
    return [name for name in counts.index.tolist() if _is_valid_product(name)]


def hybrid_recommend(customer_id, top_n=10):
    """
    Returns top N recommended products for a given customer.
    Hybrid approach combining content + collaborative + recency weighting.

    Args:
        customer_id: Value matched against the ``"Customer_ID"`` column of
            ``transactions`` and ``rfm``.
        top_n: Maximum number of product names to return; values <= 0 yield ``[]``.

    Returns:
        A list of at most ``top_n`` product names.

        * Customer absent from ``transactions``: a random sample drawn from the
          most popular products (so the result differs between calls).
        * Otherwise: products recommended for the customer's purchases, scored
          by recency weight, filtered for leading-keyword diversity, and topped
          up from the popularity ranking when too few remain. Products the
          customer already bought are never returned.
    """
    if top_n <= 0:
        return []

    # === 1. Handle case where customer doesn't exist in transactions ===
    is_customer = transactions["Customer_ID"] == customer_id
    if not is_customer.any():
        popular_products = _popular_products()
        if len(popular_products) >= top_n:
            # Sample from the 30 best sellers for variety, widening the pool
            # when more than 30 items are requested so top_n is always honoured.
            pool = popular_products[:max(30, top_n)]
            return random.sample(pool, k=top_n)
        return popular_products[:top_n]

    # === 2. Get customer type and recent purchase history ===
    cust_type = rfm.loc[rfm["Customer_ID"] == customer_id, "Customer_Type"].values
    # Customers with no RFM row are treated as "new"
    cust_type = cust_type[0] if len(cust_type) > 0 else "new"

    cust_orders = transactions[is_customer].sort_values("datetime", ascending=False)
    purchased = cust_orders["Product(s)"].dropna().unique()
    purchased_set = set(purchased)  # O(1) membership tests below

    # === 3. Compute recency weights (recent products = higher influence) ===
    # Rows are newest-first and weights fall from 1.0 towards 0.5. setdefault
    # keeps the weight of a product's MOST RECENT purchase; a plain dict
    # comprehension would let its oldest purchase overwrite it.
    n_rows = max(len(cust_orders), 1)
    recency_weights = {}
    for rank, prod in enumerate(cust_orders["Product(s)"]):
        recency_weights.setdefault(prod, 1.0 - (rank / n_rows) * 0.5)

    recommendations = {}

    # === 4. Generate product-level recommendations ===
    for prod in purchased:
        if not _is_valid_product(prod):
            continue

        # Content-based + collaborative, with invalid names removed
        content_recs = [r for r in get_content_recommendations(prod, top_n) if _is_valid_product(r)]
        collab_recs = [r for r in get_collab_recommendations(prod, top_n) if _is_valid_product(r)]

        # Weight by customer type: "new" customers lean on content similarity,
        # everyone else on co-purchase behaviour (6 items vs 4).
        if cust_type == "new":
            combined = content_recs[:6] + collab_recs[:4]
        else:
            combined = collab_recs[:6] + content_recs[:4]

        # Add recency-based weight
        weight = recency_weights.get(prod, 1.0)
        for r in combined:
            if r not in purchased_set:
                recommendations[r] = recommendations.get(r, 0) + weight

    # === 5. Sort recommendations by score ===
    ranked = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)

    # === 6. Diversity control: avoid duplicates of same product family ===
    # Walk the whole ranking (not just its first top_n entries) so that a slot
    # freed by the filter goes to the next-best personalised item rather than
    # straight to the popularity fallback.
    top_items = []
    for item, _score in ranked:
        leading_keyword = item.split()[0].lower()
        # Skip items whose leading keyword already occurs in a kept item
        if any(leading_keyword in kept.lower() for kept in top_items):
            continue
        top_items.append(item)
        if len(top_items) >= top_n:
            break

    # === 7. Fallback if not enough unique recommendations ===
    if len(top_items) < top_n:
        for prod in _popular_products():
            if prod not in purchased_set and prod not in top_items:
                top_items.append(prod)
            if len(top_items) >= top_n:
                break

    return top_items[:top_n]


In [None]:
# Spot-check: show the ten recommendations produced for a single customer ID.
hybrid_recommend(customer_id='CUS4200075', top_n=10)

['Pigs',
 'Commercial Day Old Arbor Acres Plus Broilers Chicks (Grinphield Brand)',
 'Poultry Management For First-Timers (E-Book)',
 'Commercial SAYED Broilers (Day-Old Chicks)',
 'Day-Old Marshall Broilers (Grinphield)',
 'AMO Broilers (Commercial Arbor Acres Plus)',
 'Commercial Day-Old Arbor Acres Plus (FIDAN Broilers)',
 'Agrited Broilers',
 'Commercial Day-Old ZARTECH Broilers (Cobb 500)',
 'Agrited Broiler | Commercial Day Old Broilers Ross 308']

In [None]:
# ============================================
# 5. GENERATE RECOMMENDATIONS FOR ALL CUSTOMERS
# ============================================
# No earlier cell shown in this notebook creates `recommendations_df`, so the
# save step below would fail with NameError. Build it here unless it already
# exists (e.g. it was created interactively).
# NOTE(review): the schema (one row per customer, a list-valued
# "Recommended_Products" column) is assumed from the explode() call in the
# optional image section below — confirm it matches the intended output.
try:
    recommendations_df
except NameError:
    customer_ids = transactions["Customer_ID"].dropna().unique()
    recommendations_df = pd.DataFrame({
        "Customer_ID": customer_ids,
        "Recommended_Products": [hybrid_recommend(cid, top_n=10) for cid in customer_ids],
    })

# ============================================
# 6. (OPTIONAL) LINK IMAGES
# ============================================
# To activate later, uncomment this section and supply image links
# product_images = pd.read_excel("product_images.xlsx")  # columns: Product, Image_URL
# recommendations_df = recommendations_df.explode("Recommended_Products")
# recommendations_df = recommendations_df.merge(product_images,
#                                               left_on="Recommended_Products",
#                                               right_on="Product",
#                                               how="left")

# ============================================
# 7. SAVE OUTPUT
# ============================================
# Written relative to the current working directory, without the index column.
recommendations_df.to_csv("recommendations.csv", index=False)
print("✅ Recommendations generated and saved to recommendations.csv")
