### Load packages


In [16]:
import os
import numpy as np
import pandas as pd

## Extract


### Get the raw data


In [17]:
# Location of the raw call-centre export. Set the CALL_CENTER_RAW_FILE environment
# variable to point at another copy; otherwise the original path is used.
raw_file = os.environ.get(
    "CALL_CENTER_RAW_FILE",
    "/home/joan/Projects/python/call_center_pipeline/data/5311_Call_Centre_-_Raw_dataset_20250901.csv",
)

# Stop here when the file is missing: every later cell needs `df`, and carrying on
# would only surface as an unrelated NameError further down.
if not os.path.exists(raw_file):
    raise FileNotFoundError(f"File not found: {raw_file}")

df = pd.read_csv(raw_file)

## Transform


### Basic data inspection


In [18]:
# Inspect the frame's structure: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476 entries, 0 to 475
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     476 non-null    object
 1   Date                   476 non-null    object
 2   APPLICATION            476 non-null    object
 3   OFFERED                476 non-null    int64 
 4   ANSWERED               476 non-null    int64 
 5   AVG ANS DELAY          476 non-null    object
 6   AVG ANS DELAY SECONDS  476 non-null    object
 7   MAX ANS DELAY          476 non-null    object
 8   ABANDONED              476 non-null    int64 
 9   DelayCallsProduct      476 non-null    object
dtypes: int64(3), object(7)
memory usage: 37.3+ KB


In [19]:
# Preview the first five rows of the raw data
df.head()

Unnamed: 0,id,Date,APPLICATION,OFFERED,ANSWERED,AVG ANS DELAY,AVG ANS DELAY SECONDS,MAX ANS DELAY,ABANDONED,DelayCallsProduct
0,2018-02-14-dba_cs_268_5311_gen_inq_script,2018/02/14,dba_cs_268_5311_gen_inq_script,423,375,00:02:44,164,00:09:47,48,61500
1,2018-02-15-dba_cs_268_5311_gen_inq_script,2018/02/15,dba_cs_268_5311_gen_inq_script,395,369,00:00:50,50,00:07:15,26,18450
2,2018-02-16-dba_cs_268_5311_gen_inq_script,2018/02/16,dba_cs_268_5311_gen_inq_script,369,349,00:01:21,81,00:07:14,19,28269
3,2018-02-19-dba_cs_268_5311_gen_inq_script,2018/02/19,dba_cs_268_5311_gen_inq_script,0,0,00:00:00,0,00:00:00,0,0
4,2018-02-20-dba_cs_268_5311_gen_inq_script,2018/02/20,dba_cs_268_5311_gen_inq_script,558,449,00:05:24,324,00:21:48,109,145476


In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,OFFERED,ANSWERED,ABANDONED
count,476.0,476.0,476.0
mean,294.306723,180.12395,70.115546
std,154.309158,116.931538,51.975108
min,0.0,0.0,0.0
25%,232.75,99.75,23.75
50%,329.0,167.0,65.0
75%,408.0,284.25,107.25
max,672.0,473.0,210.0


### Data Cleaning


In [21]:
# Count the missing (null) values in each column
df.isnull().sum()

id                       0
Date                     0
APPLICATION              0
OFFERED                  0
ANSWERED                 0
AVG ANS DELAY            0
AVG ANS DELAY SECONDS    0
MAX ANS DELAY            0
ABANDONED                0
DelayCallsProduct        0
dtype: int64

Lucky us: there are no missing values! The data is clean, which is a bit of a miracle. We still have a lot of work to do.


In [22]:
# Collect every row that is an exact repeat of an earlier row
duplicates = df.loc[df.duplicated()]

# Report whether any repeated rows need attention
if duplicates.empty:
    print("No dupes. We are good to go.")
else:
    print(f"Dupes found: {len(duplicates)} rows. Please deal with them")

No dupes. We are good to go.


In [23]:
# Normalise the column labels: lower-case every name, then replace spaces with underscores
print("Original DataFrame columns:")
print(df.columns)

# Both string operations are chained so the labels are rewritten in one assignment
df.columns = df.columns.str.lower().str.replace(" ", "_")

print("\nDataFrame columns after converting to lowercase:")
print(df.columns)

Original DataFrame columns:
Index(['id', 'Date', 'APPLICATION', 'OFFERED', 'ANSWERED', 'AVG ANS DELAY',
       'AVG ANS DELAY SECONDS', 'MAX ANS DELAY', 'ABANDONED',
       'DelayCallsProduct'],
      dtype='object')

DataFrame columns after converting to lowercase:
Index(['id', 'date', 'application', 'offered', 'answered', 'avg_ans_delay',
       'avg_ans_delay_seconds', 'max_ans_delay', 'abandoned',
       'delaycallsproduct'],
      dtype='object')


In [24]:
# 'delaycallsproduct' has no word separators, so give it a snake_case name like the other columns
df = df.rename(columns={"delaycallsproduct": "delay_calls_product"})
df.columns

Index(['id', 'date', 'application', 'offered', 'answered', 'avg_ans_delay',
       'avg_ans_delay_seconds', 'max_ans_delay', 'abandoned',
       'delay_calls_product'],
      dtype='object')

In [25]:
# Parse the date strings into datetimes; values that cannot be parsed become NaT (errors="coerce")
print(df.columns)  # shows the current labels, which include 'date'
parsed_dates = pd.to_datetime(df["date"], errors="coerce")

# Store the dates back as MM/DD/YYYY text, so the column holds strings rather than datetimes
df["date"] = parsed_dates.dt.strftime("%m/%d/%Y")
print(df["date"].head())

Index(['id', 'date', 'application', 'offered', 'answered', 'avg_ans_delay',
       'avg_ans_delay_seconds', 'max_ans_delay', 'abandoned',
       'delay_calls_product'],
      dtype='object')
0    02/14/2018
1    02/15/2018
2    02/16/2018
3    02/19/2018
4    02/20/2018
Name: date, dtype: object


In [26]:
# Preview the frame after the column renames and the date reformatting
df.head()

Unnamed: 0,id,date,application,offered,answered,avg_ans_delay,avg_ans_delay_seconds,max_ans_delay,abandoned,delay_calls_product
0,2018-02-14-dba_cs_268_5311_gen_inq_script,02/14/2018,dba_cs_268_5311_gen_inq_script,423,375,00:02:44,164,00:09:47,48,61500
1,2018-02-15-dba_cs_268_5311_gen_inq_script,02/15/2018,dba_cs_268_5311_gen_inq_script,395,369,00:00:50,50,00:07:15,26,18450
2,2018-02-16-dba_cs_268_5311_gen_inq_script,02/16/2018,dba_cs_268_5311_gen_inq_script,369,349,00:01:21,81,00:07:14,19,28269
3,2018-02-19-dba_cs_268_5311_gen_inq_script,02/19/2018,dba_cs_268_5311_gen_inq_script,0,0,00:00:00,0,00:00:00,0,0
4,2018-02-20-dba_cs_268_5311_gen_inq_script,02/20/2018,dba_cs_268_5311_gen_inq_script,558,449,00:05:24,324,00:21:48,109,145476


In [None]:
# Count-like columns that may arrive as text with thousands separators (e.g. "1,308").
# 'max_ans_delay' is left out of this list; it is converted in its own cell.
comma_num_cols = [
    "offered",
    "answered",
    "abandoned",
    "delay_calls_product",
    "avg_ans_delay_seconds",
]

# Only touch the listed columns that actually exist in the frame
present_cols = [name for name in comma_num_cols if name in df.columns]
for name in present_cols:
    # Strip separators and surrounding whitespace before the numeric parse
    cleaned = df[name].astype(str).str.replace(",", "", regex=False).str.strip()
    # Unparseable entries become NaN, are filled with 0, and the column is cast to int
    df[name] = pd.to_numeric(cleaned, errors="coerce").fillna(0).astype(int)

In [28]:
def _delay_to_seconds(values):
    """Convert a column of delays to float seconds.

    Each entry may be a plain number of seconds (optionally with thousands
    separators, e.g. "1,308") or an HH:MM:SS string. Entries that match
    neither form come back as NaN.
    """
    as_text = values.astype(str).str.replace(",", "", regex=False).str.strip()
    seconds = pd.to_numeric(as_text, errors="coerce")
    # Only non-numeric entries go to to_timedelta. It reads unit-less numbers as
    # nanoseconds, so a value that is already in seconds (for example after this
    # cell has been run once) would otherwise round down to 0.
    clock_text = as_text.where(seconds.isna())
    clock_seconds = pd.to_timedelta(clock_text, errors="coerce").dt.total_seconds()
    return seconds.fillna(clock_seconds)


# avg_ans_delay -> seconds. Where it cannot be parsed, fall back to the existing
# avg_ans_delay_seconds column, and to 0 as a last resort.
if "avg_ans_delay" in df.columns:
    avg_secs = _delay_to_seconds(df["avg_ans_delay"])
    if "avg_ans_delay_seconds" in df.columns:
        avg_secs = avg_secs.fillna(_delay_to_seconds(df["avg_ans_delay_seconds"]))
    df["avg_ans_delay_seconds"] = avg_secs.fillna(0).round().astype(int)

# max_ans_delay -> seconds, overwriting the column in place (0 where unparseable)
if "max_ans_delay" in df.columns:
    df["max_ans_delay"] = _delay_to_seconds(df["max_ans_delay"]).fillna(0).round().astype(int)

In [29]:
# Spot-check the delay-related columns after the conversion to seconds
delay_check_cols = [
    "avg_ans_delay",
    "avg_ans_delay_seconds",
    "max_ans_delay",
    "delay_calls_product",
]
print(df[delay_check_cols].head())

  avg_ans_delay  avg_ans_delay_seconds  max_ans_delay  delay_calls_product
0      00:02:44                    164            587                61500
1      00:00:50                     50            435                18450
2      00:01:21                     81            434                28269
3      00:00:00                      0              0                    0
4      00:05:24                    324           1308               145476


In [32]:
# Remove the 'avg_ans_delay' text column (ignored if it is already gone) and rename
# 'max_ans_delay' to 'max_ans_delay_seconds' so it matches 'avg_ans_delay_seconds'
df = (
    df.drop(columns=["avg_ans_delay"], errors="ignore")
    .rename(columns={"max_ans_delay": "max_ans_delay_seconds"})
)

df.head()

Unnamed: 0,id,date,application,offered,answered,avg_ans_delay_seconds,max_ans_delay_seconds,abandoned,delay_calls_product
0,2018-02-14-dba_cs_268_5311_gen_inq_script,02/14/2018,dba_cs_268_5311_gen_inq_script,423,375,164,587,48,61500
1,2018-02-15-dba_cs_268_5311_gen_inq_script,02/15/2018,dba_cs_268_5311_gen_inq_script,395,369,50,435,26,18450
2,2018-02-16-dba_cs_268_5311_gen_inq_script,02/16/2018,dba_cs_268_5311_gen_inq_script,369,349,81,434,19,28269
3,2018-02-19-dba_cs_268_5311_gen_inq_script,02/19/2018,dba_cs_268_5311_gen_inq_script,0,0,0,0,0,0
4,2018-02-20-dba_cs_268_5311_gen_inq_script,02/20/2018,dba_cs_268_5311_gen_inq_script,558,449,324,1308,109,145476


Let's add some enhancements to the data: 
- Call answer rate
- Call abandonment rate
- SLA met (for our example, the SLA is 80% of calls answered within 20 seconds, a common industry standard)



In [None]:
# Longest average answer delay, in seconds, for a day to count as meeting the SLA
SLA_ANSWER_SECONDS = 20

# Mask days with zero offered calls so their rates come out as NaN rather than inf
offered_calls = df["offered"].where(df["offered"] > 0)

# Answer rate: share of offered calls that were answered
df["answer_rate"] = (df["answered"] / offered_calls).round(2)

# Abandonment rate: share of offered calls that were abandoned
df["abandonment_rate"] = (df["abandoned"] / offered_calls).round(2)

# SLA met: the frame only carries a per-day average delay, so the check compares that
# average with the 20-second target (the threshold is in seconds, not a 0.2 fraction).
# NOTE: a day with no calls has an average delay of 0 and is therefore flagged True.
df["sla_met"] = df["avg_ans_delay_seconds"] <= SLA_ANSWER_SECONDS
df.head()

Unnamed: 0,id,date,application,offered,answered,avg_ans_delay_seconds,max_ans_delay_seconds,abandoned,delay_calls_product,answer_rate,abandonment_rate,sla_met
0,2018-02-14-dba_cs_268_5311_gen_inq_script,02/14/2018,dba_cs_268_5311_gen_inq_script,423,375,164,587,48,61500,0.89,0.11,False
1,2018-02-15-dba_cs_268_5311_gen_inq_script,02/15/2018,dba_cs_268_5311_gen_inq_script,395,369,50,435,26,18450,0.93,0.07,False
2,2018-02-16-dba_cs_268_5311_gen_inq_script,02/16/2018,dba_cs_268_5311_gen_inq_script,369,349,81,434,19,28269,0.95,0.05,False
3,2018-02-19-dba_cs_268_5311_gen_inq_script,02/19/2018,dba_cs_268_5311_gen_inq_script,0,0,0,0,0,0,,,True
4,2018-02-20-dba_cs_268_5311_gen_inq_script,02/20/2018,dba_cs_268_5311_gen_inq_script,558,449,324,1308,109,145476,0.8,0.2,False
