# Data loading and checks

In [1]:
import kagglehub
import pandas as pd

# Download the latest version of the dataset; `path` is the local location
# returned by kagglehub.
path = kagglehub.dataset_download("loveall/clicks-conversion-tracking")

# Raw CSV header -> snake_case column name used throughout this notebook.
column_renames = {
    "xyz_campaign_id": "campaign_id",
    "interest": "interest_id",
    "Impressions": "impressions",
    "Clicks": "clicks",
    "Spent": "spent",
    "Total_Conversion": "total_conversion",
    "Approved_Conversion": "approved_conversion",
}

# Target dtypes, keyed by the *renamed* column names.
dtypes = {
    'ad_id': pd.Int64Dtype(),
    'campaign_id': pd.Int64Dtype(),
    'fb_campaign_id': pd.Int64Dtype(),
    'age': pd.StringDtype(),
    'gender': pd.StringDtype(),
    'interest_id': pd.Int64Dtype(),
    'impressions': pd.Int64Dtype(),
    'clicks': pd.Int64Dtype(),
    'spent': pd.Float64Dtype(),
    'total_conversion': pd.Int64Dtype(),
    'approved_conversion': pd.Int64Dtype(),
}

# The dtypes must be applied AFTER the rename: `read_csv(dtype=...)` matches
# keys against the raw headers and ignores keys that match no column, so
# passing `dtypes` at read time left every renamed column (campaign_id,
# impressions, clicks, spent, ...) at pandas' inferred default dtype.
# `astype` raises a KeyError if an expected column is absent, so a change in
# the file's schema is reported here instead of passing silently.
conversion_data = (
    pd.read_csv(path + "/KAG_conversion_data.csv")
    .rename(columns=column_renames)
    .astype(dtypes)
)

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
# Sanity checks on the loaded frame. Each check only prints a message when it
# fails; nothing is raised and the data is not modified.

# Is any cell in the frame missing?
if conversion_data.isna().any().any():
    print("Missing values")

# 'age' must be one of the four listed brackets
allowed_ages = ["30-34", "35-39", "40-44", "45-49"]
invalid_ages = ~conversion_data['age'].isin(allowed_ages)
if invalid_ages.any():
    print("Invalid age")

# 'gender' must be either "M" or "F"
allowed_genders = ["M", "F"]
invalid_genders = ~conversion_data['gender'].isin(allowed_genders)
if invalid_genders.any():
    print("Invalid Gender")

# Numeric columns that must not contain negative values, each mapped to the
# message printed when one is found
negative_columns = dict(
    clicks="Negative clicks",
    impressions="Negative Impressions",
    spent="Negative Spent",
    total_conversion="Negative Total_Conversion",
    approved_conversion="Negative Approved_Conversion",
)

for column, message in negative_columns.items():
    has_negative = conversion_data[column].lt(0).any()
    if has_negative:
        print(message)



# Transform Data

In [3]:
# Drop exact duplicate rows, then round 'spent' to two decimal places.
conversion_data = conversion_data.drop_duplicates().assign(
    spent=lambda frame: frame['spent'].round(2)
)

# Write the cleaned frame next to the notebook, without the index column.
conversion_data.to_csv("conversion_data.csv", index=False)