### Load packages


In [None]:
import os
import pandas as pd

### Load files


In [9]:
raw_file = "/home/joan/Projects/python/call_center_pipeline/data/5311_Call_Centre_-_Raw_dataset_20250901.csv"
if os.path.exists(raw_file):
    df = pd.read_csv(raw_file)
    df.head()
else:
    print(f"File not found: {raw_file}")

### Basic data inspection


In [16]:
# Check basic data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476 entries, 0 to 475
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     476 non-null    object
 1   Date                   476 non-null    object
 2   APPLICATION            476 non-null    object
 3   OFFERED                476 non-null    int64 
 4   ANSWERED               476 non-null    int64 
 5   AVG ANS DELAY          476 non-null    object
 6   AVG ANS DELAY SECONDS  476 non-null    object
 7   MAX ANS DELAY          476 non-null    object
 8   ABANDONED              476 non-null    int64 
 9   DelayCallsProduct      476 non-null    object
dtypes: int64(3), object(7)
memory usage: 37.3+ KB


In [None]:
# Display head
df.head()

Unnamed: 0,id,Date,APPLICATION,OFFERED,ANSWERED,AVG ANS DELAY,AVG ANS DELAY SECONDS,MAX ANS DELAY,ABANDONED,DelayCallsProduct
0,2018-02-14-dba_cs_268_5311_gen_inq_script,2018/02/14,dba_cs_268_5311_gen_inq_script,423,375,00:02:44,164,00:09:47,48,61500
1,2018-02-15-dba_cs_268_5311_gen_inq_script,2018/02/15,dba_cs_268_5311_gen_inq_script,395,369,00:00:50,50,00:07:15,26,18450
2,2018-02-16-dba_cs_268_5311_gen_inq_script,2018/02/16,dba_cs_268_5311_gen_inq_script,369,349,00:01:21,81,00:07:14,19,28269
3,2018-02-19-dba_cs_268_5311_gen_inq_script,2018/02/19,dba_cs_268_5311_gen_inq_script,0,0,00:00:00,0,00:00:00,0,0
4,2018-02-20-dba_cs_268_5311_gen_inq_script,2018/02/20,dba_cs_268_5311_gen_inq_script,558,449,00:05:24,324,00:21:48,109,145476


In [13]:
# Sumary Statistics
df.describe()

Unnamed: 0,OFFERED,ANSWERED,ABANDONED
count,476.0,476.0,476.0
mean,294.306723,180.12395,70.115546
std,154.309158,116.931538,51.975108
min,0.0,0.0,0.0
25%,232.75,99.75,23.75
50%,329.0,167.0,65.0
75%,408.0,284.25,107.25
max,672.0,473.0,210.0


### Data Cleaning


In [None]:
# Lets check for missing values
df.isnull().sum()

id                       0
Date                     0
APPLICATION              0
OFFERED                  0
ANSWERED                 0
AVG ANS DELAY            0
AVG ANS DELAY SECONDS    0
MAX ANS DELAY            0
ABANDONED                0
DelayCallsProduct        0
dtype: int64

Lucky us that there are no missing values! Data is clean. Bit of a miracle. We still have lot of work to do.


In [None]:
# Let's check for dupes
duplicates = df[df.duplicated()]


if not duplicates.empty:
    print(f"Dupes found: {len(duplicates)} rows. Please deal with them")
else:
    print("No dupes. We are good to go.")

No dupes. We are good to go.


In [25]:
# Let's transform those columns names to lower case and replace the blank spaces to lower case
print("Original DataFrame columns:")
print(df.columns)

# Convert to lower case
df.columns = df.columns.str.lower()

# Convert to lower case
df.columns = df.columns.str.replace(" ", "_")

print("\nDataFrame columns after converting to lowercase:")
print(df.columns)

Original DataFrame columns:
Index(['id', 'date', 'application', 'offered', 'answered', 'avg ans delay',
       'avg ans delay seconds', 'max ans delay', 'abandoned',
       'delaycallsproduct'],
      dtype='object')

DataFrame columns after converting to lowercase:
Index(['id', 'date', 'application', 'offered', 'answered', 'avg_ans_delay',
       'avg_ans_delay_seconds', 'max_ans_delay', 'abandoned',
       'delaycallsproduct'],
      dtype='object')


In [27]:
# Let handle that 'delaycallsproduct' column and make
df.rename(columns={"delaycallsproduct": "delay_calls_product"}, inplace=True)
df.columns

Index(['id', 'date', 'application', 'offered', 'answered', 'avg_ans_delay',
       'avg_ans_delay_seconds', 'max_ans_delay', 'abandoned',
       'delay_calls_product'],
      dtype='object')

In [None]:
# Convert the dates into datetime objects
print(df.columns)  # confirm the column name is 'date'
df["date"] = pd.to_datetime(
    df["date"], errors="coerce", dayfirst=True, infer_datetime_format=True
)
print(df["date"].head())

Index(['id', 'date', 'application', 'offered', 'answered', 'avg_ans_delay',
       'avg_ans_delay_seconds', 'max_ans_delay', 'abandoned',
       'delay_calls_product'],
      dtype='object')
0   2018-02-14
1   2018-02-15
2   2018-02-16
3   2018-02-19
4   2018-02-20
Name: date, dtype: datetime64[ns]


  df["date"] = pd.to_datetime(
  df["date"] = pd.to_datetime(
