In [271]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#### **Reading Data**

In [272]:
file_path = r'D:\PayMe\KOKO AUNG\practiceProject\Dataset\26_7_25_customer _call_list.xlsx'

In [273]:
data = pd.read_excel(file_path)
df = pd.DataFrame(data)

#### **Dataset Overview**

In [274]:
print(f"The dataset shape: {df.shape}\n")
print(f"The nummber of rows(records): {df.shape[0]}\n")
print(f"The number of columns(features): {df.shape[1]}\n")
print("The dataset info:\n")
print(df.info())
print("\nThe first 5 rows:")
display(df.head())

The dataset shape: (21, 8)

The nummber of rows(records): 21

The number of columns(features): 8

The dataset info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   CustomerID         21 non-null     int64 
 1   First_Name         21 non-null     object
 2   Last_Name          20 non-null     object
 3   Phone_Number       19 non-null     object
 4   Address            21 non-null     object
 5   Paying Customer    21 non-null     object
 6   Do_Not_Contact     17 non-null     object
 7   Not_Useful_Column  21 non-null     bool  
dtypes: bool(1), int64(1), object(6)
memory usage: 1.3+ KB
None

The first 5 rows:


Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True


#### **Missing Values**

In [275]:
missing_data = df.isna().sum()
missing_percent = df.isna().mean().mul(100).round(2)
missing_df = pd.DataFrame({'Missing Count': missing_data, 'Missing Percent': missing_percent}).sort_values(by= 'Missing Count', ascending=False)
print("Missing values summary:\n")
print(missing_df[missing_df['Missing Count']> 0])

Missing values summary:

                Missing Count  Missing Percent
Do_Not_Contact              4            19.05
Phone_Number                2             9.52
Last_Name                   1             4.76


#### **Duplicated Records**

In [276]:
print(f"\nHave duplicates?: {df.duplicated().any()}")
print(f"Duplicates count: {df.duplicated().sum()}\n")
print(f"Duplicated records:")
display(df[df.duplicated(keep=False)])


Have duplicates?: True
Duplicates count: 1

Duplicated records:


Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
19,1020,Anakin,Skywalker,876|678|3469,"910 Tatooine Road, Tatooine",Yes,N,True
20,1020,Anakin,Skywalker,876|678|3469,"910 Tatooine Road, Tatooine",Yes,N,True


In [277]:
df.drop_duplicates(inplace=True)
print(f"Duplicates: {df.duplicated().sum()}")

Duplicates: 0


#### **Data Cleaning**

In [278]:
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True


In [279]:
df.drop(columns=['Not_Useful_Column'], inplace=True)

In [280]:
df.dtypes

CustomerID          int64
First_Name         object
Last_Name          object
Phone_Number       object
Address            object
Paying Customer    object
Do_Not_Contact     object
dtype: object

In [281]:
df['CustomerID'] = df['CustomerID'].astype('string')
print(df['CustomerID'].dtypes)

string


In [282]:
df.columns

Index(['CustomerID', 'First_Name', 'Last_Name', 'Phone_Number', 'Address',
       'Paying Customer', 'Do_Not_Contact'],
      dtype='object')

In [283]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [284]:
df.rename(columns={'customerid': 'customer_id'}, inplace=True)

In [285]:
df.columns

Index(['customer_id', 'first_name', 'last_name', 'phone_number', 'address',
       'paying_customer', 'do_not_contact'],
      dtype='object')

##### **customer_id**

In [286]:
print(f"\nUnique count: {df['customer_id'].nunique()}\n")
print(f"Unique values:\n {df['customer_id'].unique()}")


Unique count: 20

Unique values:
 <StringArray>
['1001', '1002', '1003', '1004', '1005', '1006', '1007', '1008', '1009',
 '1010', '1011', '1012', '1013', '1014', '1015', '1016', '1017', '1018',
 '1019', '1020']
Length: 20, dtype: string


##### **first_name**

In [287]:
df['first_name'].unique()

array(['Frodo', 'Abed', 'Walter', 'Dwight', 'Jon', 'Ron', 'Jeff',
       'Sherlock', 'Gandalf', 'Peter', 'Samwise', 'Harry', 'Don',
       'Leslie', 'Toby', 'Michael ', 'Clark', 'Creed', 'Anakin'],
      dtype=object)

In [288]:
df['first_name'].nunique()

19

##### **last_name**

In [289]:
df['last_name'].unique()

array(['Baggins', 'Nadir', '/White', 'Schrute', 'Snow', 'Swanson',
       '  Winger', 'Holmes', nan, 'Parker', 'Gamgee', '...Potter',
       'Draper', 'Knope', 'Flenderson_', 'Weasley', 'Scott', 'Kent',
       'Braton', 'Skywalker'], dtype=object)

In [290]:
df[df['last_name'].astype(str).str.contains(r'[^a-zA-Z]')]['last_name']

2          /White
6          Winger
11      ...Potter
14    Flenderson_
Name: last_name, dtype: object

In [291]:
df[df['last_name'].astype(str).str.contains(r'[^a-zA-Z]')]['last_name'].str.strip('123._/')

2          White
6         Winger
11        Potter
14    Flenderson
Name: last_name, dtype: object

In [292]:
df['last_name'] = df['last_name'].str.strip('123._/ ')

In [293]:
df[df['last_name'].astype(str).str.contains(r'[^a-zA-Z]')]['last_name'].str.strip('123._/')

Series([], Name: last_name, dtype: object)

In [294]:
df['last_name'].unique()

array(['Baggins', 'Nadir', 'White', 'Schrute', 'Snow', 'Swanson',
       'Winger', 'Holmes', nan, 'Parker', 'Gamgee', 'Potter', 'Draper',
       'Knope', 'Flenderson', 'Weasley', 'Scott', 'Kent', 'Braton',
       'Skywalker'], dtype=object)

##### **phone_number**

In [295]:
df[df['phone_number'].astype(str).str.contains(r'[^0-9]')]['phone_number']

0     123-545-5421
1     123/643/9775
3     123-543-2345
4     876|678|3469
5     304-762-2467
6              NaN
7     876|678|3469
8              N/a
9     123-545-5421
10             NaN
12    123-543-2345
13    876|678|3469
14    304-762-2467
15    123-545-5421
16    123/643/9775
18             N/a
19    876|678|3469
Name: phone_number, dtype: object

In [296]:
df['phone_number'] = df['phone_number'].replace(r'[^0-9]', '', regex= True)
df['phone_number'] = df['phone_number'].replace(['', np.nan], pd.NA)
df['phone_number']

0     1235455421
1     1236439775
2     7066950392
3     1235432345
4     8766783469
5     3047622467
6           <NA>
7     8766783469
8           <NA>
9     1235455421
10          <NA>
11    7066950392
12    1235432345
13    8766783469
14    3047622467
15    1235455421
16    1236439775
17    7066950392
18          <NA>
19    8766783469
Name: phone_number, dtype: object

In [297]:
df['phone_number'].astype("string").isna().sum()

4

In [298]:
df['phone_number'] = df['phone_number'].astype("string").apply(lambda x: x[0:3] + '-' + x[3:6] + '-' + x[6:] if pd.notna(x) else pd.NA)

In [299]:
df['phone_number'].sample(5)

11    706-695-0392
10            <NA>
0     123-545-5421
16    123-643-9775
7     876-678-3469
Name: phone_number, dtype: object

##### **address**

In [300]:
df['address'].unique()

array(['123 Shire Lane, Shire', '93 West Main Street',
       '298 Drugs Driveway', '980 Paper Avenue, Pennsylvania, 18503',
       '123 Dragons Road', '768 City Parkway', '1209 South Street',
       '98 Clue Drive', '123 Middle Earth', '25th Main Street, New York',
       '612 Shire Lane, Shire', '2394 Hogwarts Avenue',
       '2039 Main Street', '343 City Parkway', '214 HR Avenue',
       '2395 Hogwarts Avenue', '121 Paper Avenue, Pennsylvania',
       '3498 Super Lane', 'N/a', '910 Tatooine Road, Tatooine'],
      dtype=object)

In [301]:
df['address'] = df['address'].replace('N/a', pd.NA)
df['address'].unique()

array(['123 Shire Lane, Shire', '93 West Main Street',
       '298 Drugs Driveway', '980 Paper Avenue, Pennsylvania, 18503',
       '123 Dragons Road', '768 City Parkway', '1209 South Street',
       '98 Clue Drive', '123 Middle Earth', '25th Main Street, New York',
       '612 Shire Lane, Shire', '2394 Hogwarts Avenue',
       '2039 Main Street', '343 City Parkway', '214 HR Avenue',
       '2395 Hogwarts Avenue', '121 Paper Avenue, Pennsylvania',
       '3498 Super Lane', <NA>, '910 Tatooine Road, Tatooine'],
      dtype=object)

In [302]:
df[['street_address', 'state', 'zip_code']] = df['address'].str.split(',', n=2, expand=True)
df.drop(columns= 'address', inplace=True)
df.head()

Unnamed: 0,customer_id,first_name,last_name,phone_number,paying_customer,do_not_contact,street_address,state,zip_code
0,1001,Frodo,Baggins,123-545-5421,Yes,No,123 Shire Lane,Shire,
1,1002,Abed,Nadir,123-643-9775,No,Yes,93 West Main Street,,
2,1003,Walter,White,706-695-0392,N,,298 Drugs Driveway,,
3,1004,Dwight,Schrute,123-543-2345,Yes,Y,980 Paper Avenue,Pennsylvania,18503.0
4,1005,Jon,Snow,876-678-3469,Y,No,123 Dragons Road,,


In [303]:
df[['state', 'zip_code']] = df[['state', 'zip_code']].replace([None, np.nan], pd.NA)

##### **paying_customer**

In [304]:
df['paying_customer'].unique()

array(['Yes', 'No', 'N', 'Y', 'N/a'], dtype=object)

In [305]:
df['paying_customer'] = df['paying_customer'].replace({'Y': 'Yes', 'N': 'No', 'N/a': pd.NA})
df['paying_customer'].unique()

array(['Yes', 'No', <NA>], dtype=object)

##### **do_not_contact**

In [306]:
df['do_not_contact'].unique()

array(['No', 'Yes', nan, 'Y', 'N'], dtype=object)

In [307]:
df['do_not_contact'].replace({'Y': 'Yes', 'N': 'No', np.nan: pd.NA}).unique()

array(['No', 'Yes', <NA>], dtype=object)

In [308]:
df['do_not_contact'] = df['do_not_contact'].replace({'Y': 'Yes', 'N': 'No', np.nan: pd.NA})
df['do_not_contact'].unique()

array(['No', 'Yes', <NA>], dtype=object)

##### **Cleaned Data**

In [309]:
df.head()

Unnamed: 0,customer_id,first_name,last_name,phone_number,paying_customer,do_not_contact,street_address,state,zip_code
0,1001,Frodo,Baggins,123-545-5421,Yes,No,123 Shire Lane,Shire,
1,1002,Abed,Nadir,123-643-9775,No,Yes,93 West Main Street,,
2,1003,Walter,White,706-695-0392,No,,298 Drugs Driveway,,
3,1004,Dwight,Schrute,123-543-2345,Yes,Yes,980 Paper Avenue,Pennsylvania,18503.0
4,1005,Jon,Snow,876-678-3469,Yes,No,123 Dragons Road,,
