# Generate Test Dataset

In [1]:
!pip install faker



## Import Library

In [2]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from collections import defaultdict
fake = Faker()

## Challenge dict

In [3]:
challenges = [
    {
        "chal_name": "Eco Commuter",
        "chal_description": "Try using public transportation 3 times a week, reducing your daily emission by 10%.",
        "target_type": "daily_emission",
        "reduction_goal": 0.1
    },
    {
        "chal_name": "Vehicle Switch Challenge",
        "chal_description": "Switch to electric vehicle or bike for a specific amount of kilometers.",
        "target_type": "vehicle_type",
        "required_vehicle": ["EV Car", "EV Cycle"]
    },
    {
        "chal_name": "Green Mile",
        "chal_description": "Walk or use bike for all trips under 3 km.",
        "target_type": "distance",
        "distance_limit": 3000
    },
    {
        "chal_name": "Zero Emissions Day",
        "chal_description": "Avoid using a car or motorcycle for a whole day and contribute to zero emissions.",
        "target_type": "vehicle_type",
        "required_vehicle": ["Walk", "EV Cycle"]
    },
    {
        "chal_name": "Park and Walk",
        "chal_description": "Try to reduce your car or motorcycle road trip distance by 20%. Park your vehicle further away and walk the rest of the way. ",
        "target_type": "distance",
        "distance_limit": 0.2
    },
    {
        "chal_name": "Car-Free Commute",
        "chal_description": "Take public transport, bike, or walk for your commute twice this week.",
        "target_type": "vehicle_type",
        "required_vehicle": ["Public Transport", "EV Cycle", "Walk"]
    },
    {
        "chal_name": "Weekend Walkabout",
        "chal_description": "Avoid using a car during the weekend.",
        "target_type": "daily_emission",
        "reduction_goal": 0.15
    },
    {
        "chal_name": "Errand Consolidator",
        "chal_description": "Combine multiple errands into one trip, reducing unnecessary travel.",
        "target_type": "time_travel",
        "reduction_goal": 0.2
    },
    {
        "chal_name": "Local Living",
        "chal_description": "Complete all your errands within a 3 km radius this week.",
        "target_type": "distance",
        "distance_limit": 3000
    },
    {
        "chal_name": "Carbon Saver",
        "chal_description": "Reduce daily emissions by 15% for one week.",
        "target_type": "daily_emission",
        "reduction_goal": 0.15
    },
    {
        "chal_name": "Green Groceries",
        "chal_description": "Walk or bike to the grocery store for all purchases.",
        "target_type": "vehicle_type",
        "required_vehicle": ["Walk", "EV Cycle"]
    },
    {
        "chal_name": "Weekend Warrior",
        "chal_description": "Bike or walk for all weekend activities within 5 km.",
        "target_type": "distance",
        "distance_limit": 5000
    },
    {
        "chal_name": "Nature Stroll",
        "chal_description": "Take a 30-minute walk in nature twice this week instead of a drive.",
        "target_type": "vehicle_type",
        "required_vehicle": "Walk"
    },
    {
        "chal_name": "Emissions Under 10",
        "chal_description": "Keep your daily emissions under 10 kg of CO2 for 7 days.",
        "target_type": "daily_emission",
        "emission_limit": 10000
    },
    {
        "chal_name": "Eco Grocer",
        "chal_description": "Walk or bike for all grocery shopping trips this week.",
        "target_type": "vehicle_type",
        "required_vehicle": ["Walk", "EV Cycle"]
    },
    {
        "chal_name": "School or Workplace Stroll",
        "chal_description": "Walk to school or your workplace 3 times a week.",
        "target_type": "vehicle_type",
        "required_vehicle": "Walk"
    },
    {
        "chal_name": "Daily Stepper",
        "chal_description": "Walk at least 3,000 steps instead of short drives each day.",
        "target_type": ["distance_type", "vehicle_type"],
        "distance_limit": 2400,
        "required_vehicle": "Walk"
    },
    {
        "chal_name": "Eco-Friendly Errand",
        "chal_description": "Complete all errands within 5 km radius by bike or foot.",
        "target_type": "vehicle_type",
        "distance_limit": 5000,
        "required_vehicle": ["EV Cycle", "Walk"]
    },
    {
        "chal_name": "Workday Walk",
        "chal_description": "Take a 15-minute walk after lunch each workday.",
        "target_type": ["time_travel", "vehicle_type"],
        "time_required": 15,
        "vehicle_type": "Walk"
    },
    {
        "chal_name": "Shop Local",
        "chal_description": "Purchase groceries from stores within a 2 km radius.",
        "target_type": "distance",
        "distance_limit": 2000
    },
    {
        "chal_name": "Low Emission Weekend",
        "chal_description": "Limit your emissions to under 5 kg of CO2 over the weekend.",
        "target_type": "daily_emission",
        "emission_limit": 5000
    },
    {
        "chal_name": "One Car-Free Day",
        "chal_description": "Go one day this week without using your car.",
        "target_type": "vehicle_type",
        "required_vehicle": ["Walk", "EV Cycle", "sepeda motor"]
    },
    {
        "chal_name": "Walk the Errands",
        "chal_description": "Complete errands by walking or biking within a 4 km radius.",
        "target_type": ["vehicle_type", "distance"],
        "distance_limit": 4000,
        "required_vehicle": ["Walk", "EV Cycle"]
    },
    {
        "chal_name": "Public Transport Experiment",
        "chal_description": "Use public transport for all work commutes this week.",
        "target_type": "vehicle_type",
        "required_vehicle": "Public Transport"
    },
    {
        "chal_name": "Two-Wheel Transport",
        "chal_description": "Use a bike for all trips under 10 km this week.",
        "target_type": ["vehicle_type", "distance"],
        "distance_limit": 10000,
        "required_vehicle": ["Bike", "EV Cycle", ]
    },
    {
        "chal_name": "Weekend Walkathon",
        "chal_description": "Walk at least 5 km this weekend instead of driving.",
        "target_type": ["distance","vehicle_type"],
        "distance_goal": 5000,
        "required_vehicle": "Walk"
    },
    {
        "chal_name": "Eco Friend Meetup",
        "chal_description": "Plan a meetup within 2 km to reduce emissions.",
        "target_type": "distance",
        "distance_limit": 2000
    },
    {
        "chal_name": "Cycle Commute Challenge",
        "chal_description": "Cycle to work at least 2 times this week.",
        "target_type": "vehicle_type",
        "required_vehicle": "Bike"
    },
    {
        "chal_name": "Ride Share",
        "chal_description": "Share rides for all trips longer than 10 km.",
        "target_type": "distance",
        "distance_limit": 10000
    },
      {
        "chal_name": "Two-Wheel Transport",
        "chal_description": "Use a bike or motorcycle for all trips under 10 km this week.",
        "target_type": ["vehicle_type", "distance"],
        "distance_limit": 10000,
        "required_vehicle": ["EV Cycle", "Motorcycle"]
    },
]

In [4]:
for i, challenge in enumerate(challenges, start=1):
    challenge["challenge_id"] = f"CHALLENGE{i:03d}"

for challenge in challenges:
    print(challenge)

{'chal_name': 'Eco Commuter', 'chal_description': 'Try using public transportation 3 times a week, reducing your daily emission by 10%.', 'target_type': 'daily_emission', 'reduction_goal': 0.1, 'challenge_id': 'CHALLENGE001'}
{'chal_name': 'Vehicle Switch Challenge', 'chal_description': 'Switch to electric vehicle or bike for a specific amount of kilometers.', 'target_type': 'vehicle_type', 'required_vehicle': ['EV Car', 'EV Cycle'], 'challenge_id': 'CHALLENGE002'}
{'chal_name': 'Green Mile', 'chal_description': 'Walk or use bike for all trips under 3 km.', 'target_type': 'distance', 'distance_limit': 3000, 'challenge_id': 'CHALLENGE003'}
{'chal_name': 'Zero Emissions Day', 'chal_description': 'Avoid using a car or motorcycle for a whole day and contribute to zero emissions.', 'target_type': 'vehicle_type', 'required_vehicle': ['Walk', 'EV Cycle'], 'challenge_id': 'CHALLENGE004'}
{'chal_name': 'Park and Walk', 'chal_description': 'Try to reduce your car or motorcycle road trip distance

## Calculate daily_emission, distance, and assign vehicle_type

In [5]:
pd.set_option('display.float_format', '{:.2f}'.format)

Generate user ID

In [6]:
num_users = 20
num_records_per_user = 10

user_ids = [fake.uuid4() for _ in range(num_users)]

In [7]:
emission_multipliers = {
    'mobil': 500,  # Car (high emission)
    'sepeda motor': 300,  # Motorcycle (moderate emission)
    'EV Car': 50,  # Electric Car (low emission)
    'EV Cycle': 20,  # Electric Cycle (very low emission)
    'Walk': 0  # Walking (no emission)
}

vehicle_types_list = ['mobil', 'sepeda motor', 'EV Car', 'EV Cycle', 'Walk']

num_records_per_user = 10
data = []

for user_id in user_ids:
    accumulated_emission = 0

    for _ in range(num_records_per_user):
        base_emission = np.random.normal(loc=2000, scale=500)
        daily_emission = base_emission * 1000

        accumulated_emission += daily_emission

        avg_distance_user_meters = np.random.normal(loc=25000, scale=5000)

        if daily_emission > 2000000:
            selected_vehicle = 'mobil'
        elif daily_emission > 1500000:
            selected_vehicle = 'sepeda motor'
        elif daily_emission > 1000000:
            selected_vehicle = 'EV Car'
        elif daily_emission > 600000:
            selected_vehicle = 'EV Cycle'
        else:
            selected_vehicle = 'Walk'

        data.append({
            'user_id': user_id,
            'daily_emission': daily_emission,
            'distance_user': avg_distance_user_meters,
            'vehicle_type': selected_vehicle
        })

In [8]:
synthetic_data = pd.DataFrame(data)
synthetic_data.head()

Unnamed: 0,user_id,daily_emission,distance_user,vehicle_type
0,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2323758.09,33315.71,mobil
1,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2633784.91,20674.25,mobil
2,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,1767598.7,27185.99,sepeda motor
3,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2174371.37,15880.74,mobil
4,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2310686.44,21915.77,mobil


In [9]:
vehicle_type_count = synthetic_data.groupby(['user_id', 'vehicle_type']).size().unstack(fill_value=0)

user_stats = synthetic_data.groupby('user_id').agg(
    avg_daily_emission=('daily_emission', 'mean'),
    avg_distance=('distance_user', 'mean')
).reset_index()

user_stats = user_stats.merge(vehicle_type_count, on='user_id', how='left')
user_stats.head()


Unnamed: 0,user_id,avg_daily_emission,avg_distance,EV Car,EV Cycle,Walk,mobil,sepeda motor
0,1aa08197-f0dd-42bb-86bc-81089dd01f9a,2108830.53,26264.03,0,0,0,4,6
1,1c7a014d-48a2-4d08-9e9f-fb2cd00bbde2,1710197.08,23592.37,3,0,0,2,5
2,3490af50-9108-49fe-b208-534a5bdd0fc0,2041719.9,24629.1,2,0,0,6,2
3,499c9215-8036-41d0-986a-9501111a31d3,2062745.11,25201.83,0,0,0,5,5
4,53804898-9ece-45c4-a43b-6bfebd90c05c,2223978.66,26153.0,0,0,0,6,4


In [10]:
merge_user = synthetic_data.merge(user_stats, on='user_id', how='left')
merge_user.head()

Unnamed: 0,user_id,daily_emission,distance_user,vehicle_type,avg_daily_emission,avg_distance,EV Car,EV Cycle,Walk,mobil,sepeda motor
0,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2323758.09,33315.71,mobil,2092448.76,23248.36,1,0,0,7,2
1,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2633784.91,20674.25,mobil,2092448.76,23248.36,1,0,0,7,2
2,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,1767598.7,27185.99,sepeda motor,2092448.76,23248.36,1,0,0,7,2
3,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2174371.37,15880.74,mobil,2092448.76,23248.36,1,0,0,7,2
4,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2310686.44,21915.77,mobil,2092448.76,23248.36,1,0,0,7,2


## Assign 5 Challenges for each user

In [11]:
def check_challenge_condition(row, avg_emission_total, avg_distance_total, challenge):
    user_id = row['user_id']

    if challenge['target_type'] == 'daily_emission':
        if 'reduction_goal' in challenge:
            if row['avg_daily_emission'] >= avg_emission_total * (1 - challenge['reduction_goal']):
                return (user_id, challenge['challenge_id'])

    elif challenge['target_type'] == 'distance':
        if 'distance_limit' in challenge:
            if row['avg_distance'] >= avg_distance_total * (1 - challenge['distance_limit']):
                return (user_id, challenge['challenge_id'])

    elif challenge['target_type'] == 'vehicle_type':
        if 'required_vehicle' in challenge:
            required_vehicles = challenge['required_vehicle']

            for vehicle in required_vehicles:
                if vehicle == 'EV Car' and row['EV Car'] > 0:
                    if row['EV Car'] < 2:
                        return (user_id, challenge['challenge_id'])
                elif vehicle == 'EV Cycle' and row['EV Cycle'] > 0:
                    if row['EV Cycle'] < 2:
                        return (user_id, challenge['challenge_id'])
                elif vehicle == 'Walk' and row['Walk'] > 0:
                    if row['Walk'] < 2:
                        return (user_id, challenge['challenge_id'])
                elif vehicle == 'mobil' and row['mobil'] > 0:
                    if row['mobil'] < 3:
                        return (user_id, challenge['challenge_id'])
                elif vehicle == 'sepeda motor' and row['sepeda motor'] > 0:
                    if row['sepeda motor'] < 3:
                        return (user_id, challenge['challenge_id'])

    return (None, None)


In [12]:
def assign_challenges(df_csv, challenges, avg_emission_total, avg_distance_total):

    user_challenges = defaultdict(list)

    user_ids = df_csv['user_id'].unique()

    for user_id in user_ids:
        user_rows = df_csv[df_csv['user_id'] == user_id]

        random.shuffle(challenges)

        challenge_count = 0
        for _, row in user_rows.iterrows():
            if challenge_count < 10:
                for challenge in challenges:
                    if challenge['challenge_id'] not in user_challenges[user_id]:
                        challenge_result = check_challenge_condition(row, avg_emission_total, avg_distance_total, challenge)
                        if challenge_result != (None, None):
                            df_csv.loc[row.name, 'challenge_id'] = challenge['challenge_id']
                            df_csv.loc[row.name, 'chal_name'] = challenge['chal_name']
                            user_challenges[user_id].append(challenge['challenge_id'])
                            challenge_count += 1
                            break

    return df_csv

In [13]:
avg_emission_total = merge_user['avg_daily_emission'].mean()
avg_distance_total = merge_user['avg_distance'].mean()

print(avg_emission_total)
print(avg_distance_total)

1984212.3396136272
25246.23039428184


In [14]:
challenges_df = assign_challenges(merge_user, challenges, avg_emission_total, avg_distance_total)

In [15]:
challenges_df.head()

Unnamed: 0,user_id,daily_emission,distance_user,vehicle_type,avg_daily_emission,avg_distance,EV Car,EV Cycle,Walk,mobil,sepeda motor,challenge_id,chal_name
0,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2323758.09,33315.71,mobil,2092448.76,23248.36,1,0,0,7,2,CHALLENGE003,Green Mile
1,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2633784.91,20674.25,mobil,2092448.76,23248.36,1,0,0,7,2,CHALLENGE001,Eco Commuter
2,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,1767598.7,27185.99,sepeda motor,2092448.76,23248.36,1,0,0,7,2,CHALLENGE029,Ride Share
3,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2174371.37,15880.74,mobil,2092448.76,23248.36,1,0,0,7,2,CHALLENGE022,One Car-Free Day
4,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2310686.44,21915.77,mobil,2092448.76,23248.36,1,0,0,7,2,CHALLENGE012,Weekend Warrior


In [17]:
challenges_df.drop(['EV Car', 'EV Cycle', 'Walk', 'mobil', 'sepeda motor'], axis=1, inplace=True)

In [18]:
challenges_df.head()

Unnamed: 0,user_id,daily_emission,distance_user,vehicle_type,avg_daily_emission,avg_distance,challenge_id,chal_name
0,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2323758.09,33315.71,mobil,2092448.76,23248.36,CHALLENGE003,Green Mile
1,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2633784.91,20674.25,mobil,2092448.76,23248.36,CHALLENGE001,Eco Commuter
2,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,1767598.7,27185.99,sepeda motor,2092448.76,23248.36,CHALLENGE029,Ride Share
3,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2174371.37,15880.74,mobil,2092448.76,23248.36,CHALLENGE022,One Car-Free Day
4,f9720db0-e7a9-4f15-b0f3-9bdcc7e2817c,2310686.44,21915.77,mobil,2092448.76,23248.36,CHALLENGE012,Weekend Warrior


In [20]:
challenges_df['challenge_id'].fillna('CHALLENGE04', inplace=True)
challenges_df['chal_name'].fillna('Zero Emissions Day', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  challenges_df['challenge_id'].fillna('CHALLENGE04', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  challenges_df['chal_name'].fillna('Zero Emissions Day', inplace=True)


In [21]:
test_data = "test_data.csv"
challenges_df.to_csv(test_data, index=False)