In [1]:
#import necessary library
import numpy as np # linear algebra
import pandas as pd # data manipulation and analysis
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization
sns.set_style('whitegrid') # set style for visualization
import warnings # ignore warnings
warnings.filterwarnings('ignore')
from initial_report import *

In [2]:
# Load the offers dataset from the CSV file in the working directory
df_offers = pd.read_csv("offers.csv")

In [3]:
# Preview 5 random rows; the fixed seed keeps the preview identical on every
# Restart & Run All instead of showing different rows each time.
df_offers.sample(5, random_state=42)

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,channels
5,2298d6c36e964ae4a3e7e9706d1fb8c2,discount,7,3,7,"['web', 'email', 'mobile', 'social']"
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,"['web', 'email']"
8,f19421c1d4aa40978ebb69ca19b0e20d,bogo,5,5,5,"['web', 'email', 'mobile', 'social']"
7,5a8bc65990b245e5a138643cd4eb9837,informational,0,0,3,"['email', 'mobile', 'social']"
9,2906b810c7d4411798c6938adc9daaa5,discount,10,2,7,"['web', 'email', 'mobile']"


In [4]:
# Print the data-cleaning checklist for the offers table: structure, dtypes,
# distinct values, nulls, duplicates, negative/zero counts and basic statistics.
# `initial_report` is presumably provided by the star import from `initial_report`.
initial_report(df_offers)

 *** DATA CLEANING CHECKLIST ***
----------------------------------------
*** Structure:
- Total Rows: 10
- Total Columns: 6
- Column Names: ['offer_id', 'offer_type', 'difficulty', 'reward', 'duration', 'channels']

📌 Data Types:
  offer_id: object
  offer_type: object
  difficulty: int64
  reward: int64
  duration: int64
  channels: object

🧬 Mixed Data Types:

*** Distinct Values per Column:
  offer_id: 10
  offer_type: 3
  difficulty: 5
  reward: 5
  duration: 5
  channels: 4

*** Null Values and Percentages:


*** Duplicates: 0

*** Negative or Zero Values:
  difficulty: 2
  reward: 2

*** Basic Statistics:
       difficulty     reward   duration
count   10.000000  10.000000  10.000000
mean     7.700000   4.200000   6.500000
std      5.831905   3.583915   2.321398
min      0.000000   0.000000   3.000000
25%      5.000000   2.000000   5.000000
50%      8.500000   4.000000   7.000000
75%     10.000000   5.000000   7.000000
max     20.000000  10.000000  10.000000

*** Category Descri

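1. The `channels` column holds list-like values stored as strings; it could be parsed into a list.
2. The `difficulty` and `reward` columns contain negative/zero values that need to be checked.
3. The `difficulty` and `reward` columns appear to have outliers.
4. Check the categorical columns.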
   

In [5]:
# Show the rows whose difficulty is zero or negative
df_offers.loc[df_offers["difficulty"].le(0)]

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,channels
2,3f207df678b143eea3cee63160fa8bed,informational,0,0,4,"['web', 'email', 'mobile']"
7,5a8bc65990b245e5a138643cd4eb9837,informational,0,0,3,"['email', 'mobile', 'social']"


1. A difficulty of 0 is acceptable here: both rows are `informational` offers.

In [6]:
# Show the rows whose reward is zero or negative
df_offers.loc[df_offers["reward"].le(0)]

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,channels
2,3f207df678b143eea3cee63160fa8bed,informational,0,0,4,"['web', 'email', 'mobile']"
7,5a8bc65990b245e5a138643cd4eb9837,informational,0,0,3,"['email', 'mobile', 'social']"


1. A reward of 0 is acceptable here: both rows are `informational` offers.

In [7]:
#import check_outlier function
from check_outlier import *

In [8]:
# Check the `difficulty` column for outliers. The helper prints Q1, Q3, the IQR
# and the lower/upper bounds (Q1 - 1.5*IQR and Q3 + 1.5*IQR in the output below),
# then shows the rows flagged as outliers.
check_outlier(df_offers, 'difficulty')

📊 Outlier Summary for `difficulty`:
----------------------------------------
Q1 (25th percentile): 5.00
Q3 (75th percentile): 10.00
IQR (Q3 - Q1): 5.00
Lower bound: -2.50
Upper bound: 17.50
Number of outliers: 1
----------------------------------------


Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,channels,outlier_flag
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,"['web', 'email']",yes


1. The single flagged value (difficulty = 20) is not significant.

In [12]:
# Check the `reward` column for outliers. The helper prints Q1, Q3, the IQR
# and the lower/upper bounds (Q1 - 1.5*IQR and Q3 + 1.5*IQR in the output below),
# then shows the rows flagged as outliers.
check_outlier(df_offers, "reward")

📊 Outlier Summary for `reward`:
----------------------------------------
Q1 (25th percentile): 2.00
Q3 (75th percentile): 5.00
IQR (Q3 - Q1): 3.00
Lower bound: -2.50
Upper bound: 9.50
Number of outliers: 2
----------------------------------------


Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,channels,outlier_flag
0,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10,10,7,"['email', 'mobile', 'social']",yes
1,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,10,5,"['web', 'email', 'mobile', 'social']",yes


1. The two flagged values (reward = 10) are not significant.

In [14]:
# Count how many offers there are of each offer type
df_offers["offer_type"].value_counts()

offer_type
bogo             4
discount         4
informational    2
Name: count, dtype: int64

In [17]:
# Count how many offers use each distinct combination of channels
df_offers["channels"].value_counts()

channels
['web', 'email', 'mobile', 'social']    4
['web', 'email', 'mobile']              3
['email', 'mobile', 'social']           2
['web', 'email']                        1
Name: count, dtype: int64

In [18]:
# Convert the `channels` column into separate binary (0/1) columns for easier analysis.
import ast  # stdlib; parses the stringified channel lists without executing code

# Indicator columns to create, in output order.
CHANNEL_NAMES = ['web', 'email', 'mobile', 'social']


def _parse_channels(raw):
    """Return the channels of one offer as a set of channel names.

    `raw` is either the string form of a list (e.g. "['web', 'email']", which
    is how the column appears after loading the CSV) or an already-parsed
    list/tuple/set of names.
    """
    if isinstance(raw, str):
        # literal_eval only accepts Python literals, so nothing is executed.
        raw = ast.literal_eval(raw)
    return set(raw)


# Guard so the cell can be re-run: after the first run `channels` has already
# been replaced by the indicator columns and there is nothing left to convert.
if 'channels' in df_offers.columns:
    parsed_channels = df_offers['channels'].map(_parse_channels)

    # Create dummy variables using exact membership in the parsed list rather
    # than substring matching on the raw text.
    channel_dummies = pd.DataFrame(
        [{name: int(name in chans) for name in CHANNEL_NAMES} for chans in parsed_channels],
        index=df_offers.index,
        columns=CHANNEL_NAMES,
    )

    # Combine with original dataframe
    df_offers = pd.concat([df_offers.drop(columns=['channels']), channel_dummies], axis=1)


In [19]:
# Display the offers table after the `channels` column was replaced by the
# web/email/mobile/social indicator columns.
df_offers

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,web,email,mobile,social
0,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10,10,7,0,1,1,1
1,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,10,5,1,1,1,1
2,3f207df678b143eea3cee63160fa8bed,informational,0,0,4,1,1,1,0
3,9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5,5,7,1,1,1,0
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,1,1,0,0
5,2298d6c36e964ae4a3e7e9706d1fb8c2,discount,7,3,7,1,1,1,1
6,fafdcd668e3743c1bb461111dcafc2a4,discount,10,2,10,1,1,1,1
7,5a8bc65990b245e5a138643cd4eb9837,informational,0,0,3,0,1,1,1
8,f19421c1d4aa40978ebb69ca19b0e20d,bogo,5,5,5,1,1,1,1
9,2906b810c7d4411798c6938adc9daaa5,discount,10,2,7,1,1,1,0
