In [1]:
import os
import numpy as np
import pandas as pd

# Directory locations, relative to the notebook's working directory.
# Both keep their trailing slash because file names are appended to them
# by plain string concatenation further down.
raw_files = "../data/raw_files/"              # input: raw CSV exports
processed_files = "../data/processed_files/"  # output: cleaned CSVs

#### Marketing Dataset

In [2]:
# Read the raw marketing/product export, report its (rows, columns) shape,
# and show the first five rows as the cell output.
marketing_df = pd.read_csv(f"{raw_files}marketing_product.csv")
print(marketing_df.shape)
marketing_df.head()

(10000, 18)


Unnamed: 0,Campaign_ID,Product_ID,Budget,Clicks,Conversions,Revenue_Generated,ROI,Customer_ID,Subscription_Tier,Subscription_Length,Flash_Sale_ID,Discount_Level,Units_Sold,Bundle_ID,Bundle_Price,Customer_Satisfaction_Post_Refund,Common_Keywords,Traffic_Type
0,CMP_RLSDVN,PROD_HBJFA3,41770.45,4946,73,15520.09,1.94,CUST_1K7G39,Premium,4,FLASH_1VFK5K,43,34,BNDL_29U6W5,433.8,4,Affordable,Organic
1,CMP_JHHUE9,PROD_OE8YNJ,29900.93,570,510,30866.17,0.76,CUST_0DWS6F,Premium,4,FLASH_1M6COK,28,97,BNDL_ULV60J,289.29,2,Innovative,Social
2,CMP_6SBOWN,PROD_4V8A08,22367.45,3546,265,32585.62,1.41,CUST_BR2GST,Basic,9,FLASH_J4PEON,51,160,BNDL_0HY0EF,462.87,4,Affordable,Referral
3,CMP_Q31QCU,PROD_A1Q6ZB,29957.54,2573,781,95740.12,3.32,CUST_6TBY6K,Premium,32,FLASH_1TOVXT,36,159,BNDL_AI09BC,334.16,1,Durable,Referral
4,CMP_AY0UTJ,PROD_F57N66,36277.19,818,79,81990.43,3.53,CUST_XASI45,Standard,29,FLASH_AOBHXL,20,52,BNDL_R03ITT,371.67,2,Affordable,Organic


In [3]:
# Canonicalize the header names: trimmed, lowercase, with spaces and
# hyphens converted to underscores.
marketing_df.columns = (
    marketing_df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("-", "_")
)
marketing_df.columns

Index(['campaign_id', 'product_id', 'budget', 'clicks', 'conversions',
       'revenue_generated', 'roi', 'customer_id', 'subscription_tier',
       'subscription_length', 'flash_sale_id', 'discount_level', 'units_sold',
       'bundle_id', 'bundle_price', 'customer_satisfaction_post_refund',
       'common_keywords', 'traffic_type'],
      dtype='object')

In [4]:
# Trim surrounding whitespace in the text identifier/label columns.
# Columns that are absent, or whose dtype is not object, are skipped.
# NOTE: the casing of subscription_tier is left untouched by this cell.
for c in ["campaign_id","product_id","customer_id","subscription_tier",
          "flash_sale_id","bundle_id","common_keywords"]:
    if c in marketing_df.columns and marketing_df[c].dtype == "object":
        column = marketing_df[c]
        # astype(str) on its own turns missing values into the literal text
        # "nan" (or "None"), which isna()/dropna() no longer recognise.
        # Mask those positions back to missing after stripping so that real
        # nulls stay null; every non-null value is cleaned exactly as before.
        marketing_df[c] = column.astype(str).str.strip().where(column.notna())

#### SEO Dataset

In [5]:
# Read the raw SEO keyword export, report its (rows, columns) shape,
# and show the first five rows as the cell output.
seo_df = pd.read_csv(f"{raw_files}seo_keyword.csv")
print(seo_df.shape)
seo_df.head()

(118, 7)


Unnamed: 0.1,Unnamed: 0,text,cpc,vol,v,competition,score
0,0,email marketing,1.74,27100,27100,low,0.435
1,1,emailing marketing,1.74,27100,27100,low,0.435
2,2,marketing emails,1.44,22200,22200,low,0.36
3,3,email templates,0.91,9900,9900,low,0.2275
4,4,what is email advertising,0.49,6600,6600,low,0.1225


In [6]:
# Canonicalize the header names: trimmed, lowercase, with spaces and
# hyphens converted to underscores.
seo_df.columns = (
    seo_df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("-", "_")
)

# Discard the leftover "unnamed:_0" column and the "v" column (silently
# skipping either one if it is absent), then give the remaining fields
# descriptive names: text -> keyword, vol -> search_volume.
seo_df = (
    seo_df
    .drop(columns=["unnamed:_0", "v"], errors="ignore")
    .rename(columns={"text": "keyword", "vol": "search_volume"})
)

seo_df.head()

Unnamed: 0,keyword,cpc,search_volume,competition,score
0,email marketing,1.74,27100,low,0.435
1,emailing marketing,1.74,27100,low,0.435
2,marketing emails,1.44,22200,low,0.36
3,email templates,0.91,9900,low,0.2275
4,what is email advertising,0.49,6600,low,0.1225


#### Web Campaign Dataset

In [7]:
# Read the raw web campaign export, report its (rows, columns) shape,
# and show the first five rows as the cell output.
web_df = pd.read_csv(f"{raw_files}web_campaign.csv")
print(web_df.shape)
web_df.head()

(10000, 12)


Unnamed: 0,Date,User_ID,Session_Duration,Page_Views,Source,Medium,Campaign,Device_Category,Country,New_User,Conversions,Revenue
0,6/6/2023,1,448,7,Referral,Direct,Spring Promo,Tablet,USA,1,2,124
1,6/19/2023,2,94,1,Referral,Social Media,Summer Sale,Desktop,India,1,1,130
2,6/29/2023,3,595,6,Direct,Referral,Winter Campaign,Tablet,India,1,1,136
3,6/1/2023,4,263,9,Social,Organic Search,,Tablet,Australia,1,0,0
4,6/30/2023,5,242,1,Referral,Referral,Spring Promo,Tablet,USA,0,0,0


In [8]:
# Canonicalize the header names: trimmed, lowercase, with spaces and
# hyphens converted to underscores.
web_df.columns = (
    web_df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("-", "_")
)
web_df.columns

Index(['date', 'user_id', 'session_duration', 'page_views', 'source', 'medium',
       'campaign', 'device_category', 'country', 'new_user', 'conversions',
       'revenue'],
      dtype='object')

In [9]:
# Dates: parse to datetime; values that cannot be parsed become NaT.
web_df["date"] = pd.to_datetime(web_df["date"], errors="coerce")

# Numerics: coerce to numbers; values that cannot be parsed become NaN.
for c in ["session_duration","page_views","conversions","revenue","new_user"]:
    if c in web_df.columns:
        web_df[c] = pd.to_numeric(web_df[c], errors="coerce")

# Fill basic nulls (keep integers where appropriate).
# A column that is missing from the export is created and filled with its
# default. The earlier `web_df.get(col, 0).fillna(...)` form crashed in that
# case: `.get` returned the scalar fallback, which has no `.fillna`.
for col, default, target_dtype in [
    ("page_views",  0,   "Int64"),
    ("conversions", 0,   "Int64"),
    ("revenue",     0.0, None),     # revenue keeps whatever numeric dtype it has
    ("new_user",    0,   "Int64"),
]:
    if col in web_df.columns:
        filled = web_df[col].fillna(default)
    else:
        filled = pd.Series(default, index=web_df.index)
    web_df[col] = filled if target_dtype is None else filled.astype(target_dtype)

In [10]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there).
os.makedirs(processed_files, exist_ok=True)

# Write each cleaned frame without its index; floats are rendered with
# four decimal places.
marketing_df.to_csv(processed_files + "clean_marketing_product.csv", index=False, float_format="%.4f")
seo_df.to_csv(processed_files + "clean_seo_keywords.csv", index=False, float_format="%.4f")
web_df.to_csv(processed_files + "clean_campaign.csv", index=False, float_format="%.4f")

print("✔ Saved cleaned datasets to:", processed_files)


✔ Saved cleaned datasets to: ../data/processed_files/
