In [3]:
import pandas as pd


In [5]:
#1️⃣ Setup & Load Data
#Load the dataset

df = pd.read_csv("sales_with_nans.csv")

In [6]:
#Inspect the first few rows
df.head()

Unnamed: 0,Group,Customer_Segment,Sales_Before,Sales_After,Customer_Satisfaction_Before,Customer_Satisfaction_After,Purchase_Made
0,Control,High Value,240.548359,300.007568,74.684767,,No
1,Treatment,High Value,246.862114,381.337555,100.0,100.0,Yes
2,Control,High Value,156.978084,179.330464,98.780735,100.0,No
3,Control,Medium Value,192.126708,229.278031,49.333766,39.811841,Yes
4,,High Value,229.685623,,83.974852,87.738591,Yes


In [9]:
#Check column names and missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Group                         8599 non-null   object 
 1   Customer_Segment              8034 non-null   object 
 2   Sales_Before                  8478 non-null   float64
 3   Sales_After                   9233 non-null   float64
 4   Customer_Satisfaction_Before  8330 non-null   float64
 5   Customer_Satisfaction_After   8360 non-null   float64
 6   Purchase_Made                 9195 non-null   object 
dtypes: float64(4), object(3)
memory usage: 547.0+ KB


In [16]:
#2️⃣ Basic Statistics

#Total sales before & after
print ("Total Sales_Before:",df["Sales_Before"].sum(),"Total Sales_After", df["Sales_After"].sum())

Sales_Before: 1727112.7109819734 Sales_After 2589468.2727230373


In [33]:
#Average sales before & after
avg_sales_before = df["Sales_Before"].mean()
avg_sales_after = df["Sales_After"].mean()

print("Average Sales Before:", avg_sales_before)
print("Average Sales After :", avg_sales_after)

Average Sales Before: 203.71699822858852
Average Sales After : 280.45795220654577


In [35]:
#Average satisfaction before & after
avg_sat_before = df["Customer_Satisfaction_Before"].mean()
avg_sat_after = df["Customer_Satisfaction_After"].mean()

print("Average satisfaction before:", avg_sat_before)
print("Average satisfaction after:", avg_sat_after)

Average satisfaction before: 70.25207641696215
Average satisfaction after: 73.87259310735006


In [37]:
#Minimum and maximum sales values
min_sales_before = df["Sales_Before"].min()
max_sales_after = df["Sales_Before"].max()
print("Min sales values before:", min_sales_before)
print("Max sales values before:", max_sales_after)

min_sales_after = df["Sales_After"].min()
max_sales_after= df["Sales_After"].max()
print("Min sales values after:", min_sales_after)
print("Max sales values after:", df["Sales_After"].max())



Min sales values before: 24.85296575372195
Max sales values before: 545.4225471380589
Min sales values after: 32.414352282538246
Max sales values after: 818.2199974644652


In [41]:
#Calculate the change in average sales and change in satisfaction from before to after
change_sales = avg_sales_after - avg_sales_before
print(f"Change in average sales: {change_sales:.2f}")

change_sat = avg_sat_after - avg_sat_before
print(f"Change in average satisfaction: {change_sat:.2f}")

Change in average sales: 76.74
Change in average satisfaction: 3.62


In [46]:
#3️⃣ Filtering & Subsets
#Get all rows for a specific customer_segment, eg high value
high_value =df[df["Customer_Segment"] == "High Value"]
print(high_value.head(10))


        Group Customer_Segment  Sales_Before  Sales_After  \
0     Control       High Value    240.548359   300.007568   
1   Treatment       High Value    246.862114   381.337555   
2     Control       High Value    156.978084   179.330464   
4         NaN       High Value    229.685623          NaN   
6     Control       High Value    191.713918   222.409356   
8         NaN       High Value    208.308577   248.178830   
9   Treatment       High Value    235.071493   352.756872   
11    Control       High Value           NaN   333.064972   
13    Control       High Value    217.776013   259.989624   
20  Treatment       High Value    225.163165   355.210451   

    Customer_Satisfaction_Before  Customer_Satisfaction_After Purchase_Made  
0                      74.684767                          NaN            No  
1                     100.000000                   100.000000           Yes  
2                      98.780735                   100.000000            No  
4               

In [47]:
#Get all customers with sales above the average
customers_above_avg = df[(df["Sales_After"] > avg_sales_after) & (df["Sales_Before"] > avg_sales_before)]
print(customers_above_avg)

          Group Customer_Segment  Sales_Before  Sales_After  \
0       Control       High Value    240.548359   300.007568   
1     Treatment       High Value    246.862114   381.337555   
9     Treatment       High Value    235.071493   352.756872   
16    Treatment              NaN    306.701452   485.135424   
20    Treatment       High Value    225.163165   355.210451   
...         ...              ...           ...          ...   
9986    Control              NaN    227.494626   281.003011   
9988  Treatment        Low Value    211.535677   323.226205   
9992  Treatment     Medium Value    209.913990   320.791145   
9995  Treatment              NaN    259.695935   415.181694   
9997  Treatment        Low Value    208.107142   322.893351   

      Customer_Satisfaction_Before  Customer_Satisfaction_After Purchase_Made  
0                        74.684767                          NaN            No  
1                       100.000000                   100.000000           Yes  
9  

In [50]:
#Filter by multiple conditions (e.g., satisfaction ≥ 4 and sales > 100)
filtered = df[(df["Customer_Satisfaction_After"] >= avg_sat_after) & (df["Sales_After"] > avg_sat_before)]
print(filtered)

          Group Customer_Segment  Sales_Before  Sales_After  \
1     Treatment       High Value    246.862114   381.337555   
2       Control       High Value    156.978084   179.330464   
6       Control       High Value    191.713918   222.409356   
8           NaN       High Value    208.308577   248.178830   
12      Control              NaN    211.834937   254.843912   
...         ...              ...           ...          ...   
9992  Treatment     Medium Value    209.913990   320.791145   
9993  Treatment              NaN    163.289805   248.916244   
9995  Treatment              NaN    259.695935   415.181694   
9996    Control       High Value    186.488285   216.225457   
9998  Treatment     Medium Value           NaN   431.974901   

      Customer_Satisfaction_Before  Customer_Satisfaction_After Purchase_Made  
1                       100.000000                   100.000000           Yes  
2                        98.780735                   100.000000            No  
6  

In [67]:
#Drop Rows with missing key values
df_clean = df.dropna(subset=["Group"])
print(df_clean)

          Group Customer_Segment  Sales_Before  Sales_After  \
0       Control       High Value    240.548359   300.007568   
1     Treatment       High Value    246.862114   381.337555   
2       Control       High Value    156.978084   179.330464   
3       Control     Medium Value    192.126708   229.278031   
5     Treatment              NaN    135.573003   218.559988   
...         ...              ...           ...          ...   
9995  Treatment              NaN    259.695935   415.181694   
9996    Control       High Value    186.488285   216.225457   
9997  Treatment        Low Value    208.107142   322.893351   
9998  Treatment     Medium Value           NaN   431.974901   
9999    Control        Low Value           NaN   124.402398   

      Customer_Satisfaction_Before  Customer_Satisfaction_After Purchase_Made  
0                        74.684767                    73.872593            No  
1                       100.000000                   100.000000           Yes  
2  

In [56]:
#fill missing satisfaction values with the mean
df["Customer_Satisfaction_Before"] = df["Customer_Satisfaction_Before"].fillna(df["Customer_Satisfaction_Before"].mean())
df["Customer_Satisfaction_After" ]= df["Customer_Satisfaction_After"].fillna(df["Customer_Satisfaction_After"].mean())
df.head(20)


Unnamed: 0,Group,Customer_Segment,Sales_Before,Sales_After,Customer_Satisfaction_Before,Customer_Satisfaction_After,Purchase_Made
0,Control,High Value,240.548359,300.007568,74.684767,73.872593,No
1,Treatment,High Value,246.862114,381.337555,100.0,100.0,Yes
2,Control,High Value,156.978084,179.330464,98.780735,100.0,No
3,Control,Medium Value,192.126708,229.278031,49.333766,39.811841,Yes
4,,High Value,229.685623,280.457952,83.974852,87.738591,Yes
5,Treatment,,135.573003,218.559988,58.075342,69.404918,No
6,Control,High Value,191.713918,222.409356,89.967827,85.120975,Yes
7,Control,Low Value,173.752555,213.168232,66.984711,67.881558,
8,,High Value,208.308577,248.17883,95.36667,84.790294,Yes
9,Treatment,High Value,235.071493,352.756872,72.919851,70.753225,No


In [60]:
#Try filling missing Purchase_Made with "Unknown".

df["Purchase_Made"] = df["Purchase_Made"].fillna("Unknown")
df.head(20)

Unnamed: 0,Group,Customer_Segment,Sales_Before,Sales_After,Customer_Satisfaction_Before,Customer_Satisfaction_After,Purchase_Made
0,Control,High Value,240.548359,300.007568,74.684767,73.872593,No
1,Treatment,High Value,246.862114,381.337555,100.0,100.0,Yes
2,Control,High Value,156.978084,179.330464,98.780735,100.0,No
3,Control,Medium Value,192.126708,229.278031,49.333766,39.811841,Yes
4,,High Value,229.685623,280.457952,83.974852,87.738591,Yes
5,Treatment,,135.573003,218.559988,58.075342,69.404918,No
6,Control,High Value,191.713918,222.409356,89.967827,85.120975,Yes
7,Control,Low Value,173.752555,213.168232,66.984711,67.881558,Unknown
8,,High Value,208.308577,248.17883,95.36667,84.790294,Yes
9,Treatment,High Value,235.071493,352.756872,72.919851,70.753225,No


In [66]:
#4️⃣ Aggregation with GroupBy
#avg sales before / after customer segment 
group_by_avg_sales_segment = df.groupby("Customer_Segment").agg(
                                        avg_sales_before =("Sales_Before", "mean"),
                                        avg_sales_after=("Sales_After", "mean")).reset_index()

group_by_avg_sales_segment.head(20)

Unnamed: 0,Customer_Segment,avg_sales_before,avg_sales_after
0,High Value,224.321621,305.954647
1,Low Value,182.721746,254.066006
2,Medium Value,203.964615,281.125125


In [70]:
#Satisfaction improvement by Group (Control vs Treatment)
satisfaction_improvement = df.groupby("Group").agg( avg_before = ("Customer_Satisfaction_Before", "mean"), 
                                                    avg_after = ("Customer_Satisfaction_After", "mean")).reset_index()
                                                    
satisfaction_improvement["Improvement"] =  satisfaction_improvement["avg_after"] - satisfaction_improvement["avg_before"]   
print(satisfaction_improvement)

       Group  avg_before  avg_after  Improvement
0    Control   70.513027  74.133348     3.620321
1  Treatment   70.038991  73.723728     3.684737


In [73]:
#Find which Customer_Segment had the biggest sales increase.
sales_increase = df.groupby("Customer_Segment").agg(
    avg_sales_before=("Sales_Before", "mean"),
    avg_sales_after=("Sales_After", "mean")
).reset_index()

sales_increase["increase"] = (
    sales_increase["avg_sales_after"] - sales_increase["avg_sales_before"]
)

biggest_increase = sales_increase.loc[sales_increase["increase"].idxmax()]
print(biggest_increase)

Customer_Segment    High Value
avg_sales_before    224.321621
avg_sales_after     305.954647
increase             81.633026
Name: 0, dtype: object


In [74]:
#6️⃣ Simple Pivot Table
#Create a pivot table of average satisfaction by region and gender


In [81]:
#Create a pivot showing satisfaction change by Purchase_Made.
