# Data Simulation and Modelling

# In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# Seed every random source used below so repeated runs produce the same data.
# Faker keeps its own generator, separate from `random` and NumPy, so it must
# be seeded explicitly; otherwise names, URLs, captions, UUIDs and dates change
# on every run.
random.seed(42)
np.random.seed(42)
Faker.seed(42)
fake = Faker()
# NOTE: the date columns are generated relative to 'today' (see the
# date_between calls below), so they still shift with the day the script runs.

# Constants: the closed sets of values the simulated tables sample from.
PLATFORMS = ['Instagram', 'YouTube', 'Twitter']
CATEGORIES = ['Fitness', 'Lifestyle', 'Nutrition', 'Wellness']
GENDERS = ['Male', 'Female', 'Other']
PRODUCTS = ['Protein Powder', 'Vitamins', 'Creatine', 'Gainers']
CAMPAIGNS = ['Summer Shred', 'Winter Bulk', 'Healthy Start', 'New Year Fit']
BRANDS = ['MuscleBlaze', 'HKVitals', 'Gritzo']

# --- STEP 1: Simulate Influencers ---
# One row per influencer: sequential ID starting at 1, a fake name, a random
# category / gender / platform, a follower count in [5000, 1000000), and a
# tier derived from that follower count.
num_influencers = 100

# (exclusive upper follower bound, tier label), checked in ascending order;
# anything at or above the last bound is 'Mega'.
tier_limits = (
    (10000, 'Nano'),
    (100000, 'Micro'),
    (500000, 'Macro'),
)

influencer_data = []
for influencer_id in range(1, num_influencers + 1):
    followers = np.random.randint(5000, 1000000)
    tier = next(
        (label for limit, label in tier_limits if followers < limit),
        'Mega',
    )

    # Key order fixes the column order of the resulting DataFrame.
    profile = {
        'ID': influencer_id,
        'name': fake.name(),
        'category': random.choice(CATEGORIES),
        'gender': random.choice(GENDERS),
        'follower_count': followers,
        'tier': tier,
        'platform': random.choice(PLATFORMS),
    }
    influencer_data.append(profile)

influencers_df = pd.DataFrame(influencer_data)

# --- STEP 2: Simulate Posts ---
# Each post belongs to a randomly chosen influencer. Reach is drawn from a
# normal distribution centred on 20% of that influencer's follower count;
# likes and comments are drawn around 5% and 1% of reach respectively.
num_posts = 1000

# Build the lookups once instead of re-filtering the DataFrame for every post.
# Sampling from a plain list also avoids random.choice() indexing a Series by
# label, which only works while the index labels equal the positions 0..n-1.
influencer_ids = influencers_df['ID'].tolist()
followers_by_id = dict(
    zip(influencer_ids, influencers_df['follower_count'].tolist())
)

post_data = []
for _ in range(num_posts):
    inf_id = random.choice(influencer_ids)
    follower_count = followers_by_id[inf_id]
    reach = int(np.random.normal(loc=0.2 * follower_count, scale=0.05 * follower_count))
    reach = max(reach, 500)  # ensure a minimum reach

    post_data.append({
        'influencer_id': inf_id,
        # NOTE(review): the platform is sampled independently of the
        # influencer's own 'platform' column - confirm this is intended.
        'platform': random.choice(PLATFORMS),
        'date': fake.date_between(start_date='-1y', end_date='today'),
        'URL': fake.url(),
        'caption': fake.sentence(),
        'reach': reach,
        # The normal draws can fall below zero (the mean is only 2.5 and 2
        # standard deviations above it), so clamp: counts cannot be negative.
        'likes': max(0, int(np.random.normal(loc=0.05 * reach, scale=0.02 * reach))),
        'comments': max(0, int(np.random.normal(loc=0.01 * reach, scale=0.005 * reach))),
    })

posts_df = pd.DataFrame(post_data)

# --- STEP 3: Simulate Tracking Data ---
# Each tracking row attributes a Poisson(2) number of orders (possibly zero)
# to a random influencer, campaign, product and brand. Revenue is a uniform
# per-order value in [20, 150) multiplied by the order count.
num_tracking = 3000

# Build the lookups once instead of re-filtering the DataFrame for every row.
# Sampling from a plain list also avoids random.choice() indexing a Series by
# label, which only works while the index labels equal the positions 0..n-1.
influencer_ids = influencers_df['ID'].tolist()
platform_by_id = dict(
    zip(influencer_ids, influencers_df['platform'].tolist())
)

tracking_data = []
for _ in range(num_tracking):
    inf_id = random.choice(influencer_ids)
    campaign = random.choice(CAMPAIGNS)
    product = random.choice(PRODUCTS)
    # The traffic source is the influencer's own platform.
    platform = platform_by_id[inf_id]
    brand = random.choice(BRANDS)
    orders = np.random.poisson(2)
    revenue = round(np.random.uniform(20, 150) * orders, 2)

    tracking_data.append({
        'source': platform,
        'brand': brand,
        'campaign': campaign,
        'influencer_id': inf_id,
        'user_id': fake.uuid4(),
        'product': product,
        'date': fake.date_between(start_date='-1y', end_date='today'),
        'orders': orders,
        'revenue': revenue,
    })

tracking_df = pd.DataFrame(tracking_data)

# --- STEP 4: Simulate Payouts ---
# One payout row per influencer. The payout basis is chosen at random:
#   'post'  -> rate * number of posts by the influencer (orders recorded as 0)
#   'order' -> rate * total orders attributed to the influencer
# The rate is a uniform value in [50, 300], rounded to 2 decimals.

# Aggregate once up front instead of re-filtering posts_df and tracking_df for
# every influencer inside the loop.
posts_per_influencer = posts_df['influencer_id'].value_counts().to_dict()
orders_per_influencer = (
    tracking_df.groupby('influencer_id')['orders'].sum().to_dict()
)

payout_data = []
for inf_id in influencers_df['ID']:
    basis = random.choice(['post', 'order'])
    rate = round(random.uniform(50, 300), 2)

    # Influencers with no posts / no tracking rows are absent from the
    # aggregates and count as zero. Plain ints keep the payout arithmetic and
    # rounding the same for both bases.
    if basis == 'post':
        orders_recorded = 0
        total_payout = int(posts_per_influencer.get(inf_id, 0)) * rate
    else:
        orders_recorded = int(orders_per_influencer.get(inf_id, 0))
        total_payout = orders_recorded * rate

    payout_data.append({
        'influencer_id': inf_id,
        'basis': basis,
        'rate': rate,
        'orders': orders_recorded,
        'total_payout': round(total_payout, 2),
    })

payouts_df = pd.DataFrame(payout_data)

# Save to disk
# Each table is written to its own CSV file in the current working directory,
# without the DataFrame index column.
csv_outputs = (
    ("influencers.csv", influencers_df),
    ("posts.csv", posts_df),
    ("tracking_data.csv", tracking_df),
    ("payouts.csv", payouts_df),
)
for csv_name, frame in csv_outputs:
    frame.to_csv(csv_name, index=False)

# Bare expression: shows the written file names when run as a notebook cell.
tuple(csv_name for csv_name, _ in csv_outputs)


('influencers.csv', 'posts.csv', 'tracking_data.csv', 'payouts.csv')