## Importing Libraries & Loading Dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
try:
    file_path = r'D:\personalData\customerCallProject\dataCleaningTask\dataset\26_7_25_customer _call_list.xlsx'
    df = pd.read_excel(file_path)
except FileNotFoundError:
    print("File not found!")

## Dataset Overview & Intial EDA

In [3]:
# the first few rows
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True


In [4]:
print(f"This dataset contains {df.shape[0]:,} rows and {df.shape[1]:,} columns")

This dataset contains 21 rows and 8 columns


In [5]:
# Dataset basic information
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   CustomerID         21 non-null     int64 
 1   First_Name         21 non-null     object
 2   Last_Name          20 non-null     object
 3   Phone_Number       19 non-null     object
 4   Address            21 non-null     object
 5   Paying Customer    21 non-null     object
 6   Do_Not_Contact     17 non-null     object
 7   Not_Useful_Column  21 non-null     bool  
dtypes: bool(1), int64(1), object(6)
memory usage: 1.3+ KB
None


### Missing Values

In [6]:
missing_data = df.isna().sum()
missing_percent = df.isna().mean().mul(100).round(2)
missing_df = pd.DataFrame({'Missing Count': missing_data, 'Missing Percent': missing_percent}).sort_values(by= 'Missing Count', ascending=False)
print("Missing values summary:\n")
print(missing_df[missing_df['Missing Count']> 0])

Missing values summary:

                Missing Count  Missing Percent
Do_Not_Contact              4            19.05
Phone_Number                2             9.52
Last_Name                   1             4.76


### Duplicated Records

In [7]:
print(f"\nHave duplicates?: {df.duplicated().any()}")
print(f"Duplicates count: {df.duplicated().sum()}\n")
print(f"Duplicated records:")
display(df[df.duplicated(keep=False)])


Have duplicates?: True
Duplicates count: 1

Duplicated records:


Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
19,1020,Anakin,Skywalker,876|678|3469,"910 Tatooine Road, Tatooine",Yes,N,True
20,1020,Anakin,Skywalker,876|678|3469,"910 Tatooine Road, Tatooine",Yes,N,True


### Descriptive Statistics For Categorical Variables

In [8]:
df.describe(include="O").T

Unnamed: 0,count,unique,top,freq
First_Name,21,19,Anakin,2
Last_Name,20,19,Skywalker,2
Phone_Number,19,7,876|678|3469,5
Address,21,20,"910 Tatooine Road, Tatooine",2
Paying Customer,21,5,Yes,11
Do_Not_Contact,17,4,No,9


In [9]:
# Show unique count and values for each categorical column for better understanding
print("\nUnique values for each Categorical Column")
print("-"*50)
for col in df.select_dtypes(include= 'object').columns:
    print(f"\n=== {col} === \n")
    print(f"Unique count: {df[col].nunique()}\n")
    print(f"Unique values: {df[col].unique()[0:15]}")
    print('=' * 80)


Unique values for each Categorical Column
--------------------------------------------------

=== First_Name === 

Unique count: 19

Unique values: ['Frodo' 'Abed' 'Walter' 'Dwight' 'Jon' 'Ron' 'Jeff' 'Sherlock' 'Gandalf'
 'Peter' 'Samwise' 'Harry' 'Don' 'Leslie' 'Toby']

=== Last_Name === 

Unique count: 19

Unique values: ['Baggins' 'Nadir' '/White' 'Schrute' 'Snow' 'Swanson' '  Winger' 'Holmes'
 nan 'Parker' 'Gamgee' '...Potter' 'Draper' 'Knope' 'Flenderson_']

=== Phone_Number === 

Unique count: 7

Unique values: ['123-545-5421' '123/643/9775' 7066950392 '123-543-2345' '876|678|3469'
 '304-762-2467' nan 'N/a']

=== Address === 

Unique count: 20

Unique values: ['123 Shire Lane, Shire' '93 West Main Street' '298 Drugs Driveway'
 '980 Paper Avenue, Pennsylvania, 18503' '123 Dragons Road'
 '768 City Parkway' '1209 South Street' '98 Clue Drive' '123 Middle Earth'
 '25th Main Street, New York' '612 Shire Lane, Shire'
 '2394 Hogwarts Avenue' '2039 Main Street' '343 City Parkway'
 '214

## Data Cleaning

In [10]:
df_cleaned=df.copy()

In [11]:
df_cleaned.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True


In [12]:
# Drop not useful columns
df_cleaned.drop(columns=['Not_Useful_Column'], inplace=True)

In [13]:
df_cleaned.columns = df_cleaned.columns.str.lower().str.replace(' ', '_')
df_cleaned.rename(columns={'customerid': 'customer_id'}, inplace=True)
print(df_cleaned.columns)

Index(['customer_id', 'first_name', 'last_name', 'phone_number', 'address',
       'paying_customer', 'do_not_contact'],
      dtype='object')


### Handling Missing Values

In [14]:
# Drop rows with missing values
df_cleaned.dropna(axis=0, inplace=True)

### Handling Duplicates

In [15]:
df.drop_duplicates(inplace=True)
print(f"Duplicates: {df.duplicated().sum()}")

Duplicates: 0


### Converting Data Type

In [17]:
# changing data type
df_cleaned['customer_id'] = df_cleaned['customer_id'].astype('string')
print(df_cleaned['customer_id'].dtypes)

string


### Handling Noisy Data

#### last_name column

In [18]:
df_cleaned[df_cleaned['last_name'].astype(str).str.contains(r'[^a-zA-Z]')]['last_name']

14    Flenderson_
Name: last_name, dtype: object

In [19]:
df_cleaned[df_cleaned['last_name'].astype(str).str.contains(r'[^a-zA-Z]')]['last_name'].str.strip('123._/')

14    Flenderson
Name: last_name, dtype: object

In [20]:
df_cleaned['last_name'] = df_cleaned['last_name'].str.strip('123._/ ')

In [21]:
df_cleaned[df_cleaned['last_name'].astype(str).str.contains(r'[^a-zA-Z]')]['last_name'].str.strip('123._/')

Series([], Name: last_name, dtype: object)

In [22]:
df_cleaned['last_name'].unique()

array(['Baggins', 'Nadir', 'Schrute', 'Snow', 'Swanson', 'Holmes',
       'Parker', 'Draper', 'Knope', 'Flenderson', 'Weasley', 'Scott',
       'Braton', 'Skywalker'], dtype=object)

#### **phone_number** column

In [23]:
df_cleaned[df_cleaned['phone_number'].astype(str).str.contains(r'[^0-9]')]['phone_number']

0     123-545-5421
1     123/643/9775
3     123-543-2345
4     876|678|3469
5     304-762-2467
7     876|678|3469
9     123-545-5421
12    123-543-2345
13    876|678|3469
14    304-762-2467
15    123-545-5421
16    123/643/9775
18             N/a
19    876|678|3469
20    876|678|3469
Name: phone_number, dtype: object

In [24]:
df_cleaned['phone_number'] = df_cleaned['phone_number'].replace(r'[^0-9]', '', regex= True)
df_cleaned['phone_number'] = df_cleaned['phone_number'].replace(['', np.nan], pd.NA)
df_cleaned['phone_number']

0     1235455421
1     1236439775
3     1235432345
4     8766783469
5     3047622467
7     8766783469
9     1235455421
12    1235432345
13    8766783469
14    3047622467
15    1235455421
16    1236439775
18          <NA>
19    8766783469
20    8766783469
Name: phone_number, dtype: object

In [25]:
df_cleaned['phone_number'].astype("string").isna().sum()

1

In [26]:
df_cleaned['phone_number'] = df_cleaned['phone_number'].astype("string").apply(lambda x: x[0:3] + '-' + x[3:6] + '-' + x[6:] if pd.notna(x) else pd.NA)

In [27]:
df_cleaned['phone_number'].sample(5)

15    123-545-5421
20    876-678-3469
9     123-545-5421
18            <NA>
1     123-643-9775
Name: phone_number, dtype: object

#### **address** column

In [28]:
df_cleaned['address'].unique()

array(['123 Shire Lane, Shire', '93 West Main Street',
       '980 Paper Avenue, Pennsylvania, 18503', '123 Dragons Road',
       '768 City Parkway', '98 Clue Drive', '25th Main Street, New York',
       '2039 Main Street', '343 City Parkway', '214 HR Avenue',
       '2395 Hogwarts Avenue', '121 Paper Avenue, Pennsylvania', 'N/a',
       '910 Tatooine Road, Tatooine'], dtype=object)

In [29]:
df_cleaned['address'] = df_cleaned['address'].replace('N/a', pd.NA)
df_cleaned['address'].unique()

array(['123 Shire Lane, Shire', '93 West Main Street',
       '980 Paper Avenue, Pennsylvania, 18503', '123 Dragons Road',
       '768 City Parkway', '98 Clue Drive', '25th Main Street, New York',
       '2039 Main Street', '343 City Parkway', '214 HR Avenue',
       '2395 Hogwarts Avenue', '121 Paper Avenue, Pennsylvania', <NA>,
       '910 Tatooine Road, Tatooine'], dtype=object)

In [30]:
df_cleaned[['street_address', 'state', 'zip_code']] = df_cleaned['address'].str.split(',', n=2, expand=True)
df_cleaned.drop(columns= 'address', inplace=True)
df_cleaned.head()

Unnamed: 0,customer_id,first_name,last_name,phone_number,paying_customer,do_not_contact,street_address,state,zip_code
0,1001,Frodo,Baggins,123-545-5421,Yes,No,123 Shire Lane,Shire,
1,1002,Abed,Nadir,123-643-9775,No,Yes,93 West Main Street,,
3,1004,Dwight,Schrute,123-543-2345,Yes,Y,980 Paper Avenue,Pennsylvania,18503.0
4,1005,Jon,Snow,876-678-3469,Y,No,123 Dragons Road,,
5,1006,Ron,Swanson,304-762-2467,Yes,Yes,768 City Parkway,,


In [31]:
df_cleaned[['state', 'zip_code']] = df_cleaned[['state', 'zip_code']].replace([None, np.nan], pd.NA)

#### **paying_customer** column

In [32]:
df_cleaned['paying_customer'] = df_cleaned['paying_customer'].replace({'Y': 'Yes', 'N': 'No', 'N/a': pd.NA})
df_cleaned['paying_customer'].unique()

array(['Yes', 'No', <NA>], dtype=object)

#### **do_not_contact** column

In [33]:
df_cleaned['do_not_contact'].replace({'Y': 'Yes', 'N': 'No', np.nan: pd.NA}).unique()

array(['No', 'Yes'], dtype=object)

In [34]:
df_cleaned['do_not_contact'] = df_cleaned['do_not_contact'].replace({'Y': 'Yes', 'N': 'No', np.nan: pd.NA})
df_cleaned['do_not_contact'].unique()

array(['No', 'Yes'], dtype=object)

## **Final Cleaned Data**

In [35]:
df_cleaned.head()

Unnamed: 0,customer_id,first_name,last_name,phone_number,paying_customer,do_not_contact,street_address,state,zip_code
0,1001,Frodo,Baggins,123-545-5421,Yes,No,123 Shire Lane,Shire,
1,1002,Abed,Nadir,123-643-9775,No,Yes,93 West Main Street,,
3,1004,Dwight,Schrute,123-543-2345,Yes,Yes,980 Paper Avenue,Pennsylvania,18503.0
4,1005,Jon,Snow,876-678-3469,Yes,No,123 Dragons Road,,
5,1006,Ron,Swanson,304-762-2467,Yes,Yes,768 City Parkway,,


In [36]:
df_cleaned.describe(include="O").T

Unnamed: 0,count,unique,top,freq
first_name,15,13,Ron,2
last_name,15,14,Skywalker,2
phone_number,14,5,876-678-3469,5
paying_customer,14,2,Yes,10
do_not_contact,15,2,No,11
street_address,14,13,910 Tatooine Road,2
state,6,4,Pennsylvania,2
zip_code,1,1,18503,1
