DATA CLEANING with AlexTheAnalyst
----

Clean Customer Details

Actions
- Exploratory Analysis
- Drop duplicates
- Clean columns
- Standardize Columns
- Drop Columns

In [52]:
import pandas as pd
import numpy as np

df = pd.read_excel("Customer Call List.xlsx")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

Display DataFrame

In [53]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True
6,1007,Jeff,Winger,,1209 South Street,No,No,False
7,1008,Sherlock,Holmes,876|678|3469,98 Clue Drive,N,No,False
8,1009,Gandalf,,N/a,123 Middle Earth,Yes,,False
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True


DROP Duplicates

In [54]:
df = df.drop_duplicates()

In [55]:
pd.options.mode.copy_on_write = True

In [56]:
df['Last_Name'] = df['Last_Name'].str.replace(r'[\.\/_]' ,'', regex=True,)
df['Last_Name'] = df['Last_Name'].fillna('')

In [59]:
df['Phone_Number'] = df['Phone_Number'].replace(to_replace=r'[^a-zA-Z0-9]|Na', value='', regex=True)
df['Phone_Number'] = df['Phone_Number'].fillna('')

In [60]:
df['Phone_Number']

0     1235455421
1     1236439775
2     7066950392
3     1235432345
4     8766783469
5     3047622467
6               
7     8766783469
8               
9     1235455421
10              
11    7066950392
12    1235432345
13    8766783469
14    3047622467
15    1235455421
16    1236439775
17    7066950392
18              
19    8766783469
Name: Phone_Number, dtype: object

CONVERT Column data type to string

In [61]:
df['Phone_Number'] = df['Phone_Number'].astype(str)

VIEW data frame

In [62]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,1235455421.0,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,1236439775.0,93 West Main Street,No,Yes,False
2,1003,Walter,White,7066950392.0,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,1235432345.0,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,8766783469.0,123 Dragons Road,Y,No,True
5,1006,Ron,Swanson,3047622467.0,768 City Parkway,Yes,Yes,True
6,1007,Jeff,Winger,,1209 South Street,No,No,False
7,1008,Sherlock,Holmes,8766783469.0,98 Clue Drive,N,No,False
8,1009,Gandalf,,,123 Middle Earth,Yes,,False
9,1010,Peter,Parker,1235455421.0,"25th Main Street, New York",Yes,No,True


STANDARDIZE phone number column 000-000-0000

In [63]:
df['Phone_Number'] = df['Phone_Number'].apply(lambda x: x[:3] + '-' + x[3:6] + '-' + x[6:10] if len(x) else '')

In [152]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column,Valid-Numnber
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True,123-545-5421
1,1002,Abed,Nadir,123-643-9775,93 West Main Street,No,Yes,False,123-643-9775
2,1003,Walter,White,706-695-0392,298 Drugs Driveway,N,,True,706-695-0392
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True,123-543-2345
4,1005,Jon,Snow,876-678-3469,123 Dragons Road,Y,No,True,876-678-3469
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True,304-762-2467
6,1007,Jeff,Winger,,1209 South Street,No,No,False,
7,1008,Sherlock,Holmes,876-678-3469,98 Clue Drive,N,No,False,876-678-3469
8,1009,Gandalf,,,123 Middle Earth,Yes,,False,
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True,123-545-5421


STANDARDIZE column - Y -> Yes, N -> No

In [80]:
df['Paying Customer'] = df['Paying Customer'].apply(lambda x: 'Yes' if x.lower() == 'y' else 'No' if x.lower() == 'n' else x)

VIEW dataframe

In [81]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column,Lo
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True,No
1,1002,Abed,Nadir,123-643-9775,93 West Main Street,No,Yes,False,No
2,1003,Walter,White,706-695-0392,298 Drugs Driveway,No,,True,No
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True,No
4,1005,Jon,Snow,876-678-3469,123 Dragons Road,Yes,No,True,Yes
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True,No
6,1007,Jeff,Winger,,1209 South Street,No,No,False,No
7,1008,Sherlock,Holmes,876-678-3469,98 Clue Drive,No,No,False,No
8,1009,Gandalf,,,123 Middle Earth,Yes,,False,No
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True,No


In [83]:
df['Do_Not_Contact'] = df['Do_Not_Contact'].astype(str)

STANDARDIZE the Do_Not_Contact column

In [87]:
df['Do_Not_Contact'] = df['Do_Not_Contact'].apply(lambda x: 'No' if x.lower() == 'n' else 'Yes' if x.lower() == 'y' else x)

VIEW Dataframe

In [88]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column,Lo
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True,No
1,1002,Abed,Nadir,123-643-9775,93 West Main Street,No,Yes,False,No
2,1003,Walter,White,706-695-0392,298 Drugs Driveway,No,,True,No
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Yes,True,No
4,1005,Jon,Snow,876-678-3469,123 Dragons Road,Yes,No,True,Yes
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True,No
6,1007,Jeff,Winger,,1209 South Street,No,No,False,No
7,1008,Sherlock,Holmes,876-678-3469,98 Clue Drive,No,No,False,No
8,1009,Gandalf,,,123 Middle Earth,Yes,,False,No
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True,No


CLEAN UP the dataframe

In [91]:
df = df.replace(to_replace=r'N/a|nan', value='', regex=True)

VIEW dataframe

In [92]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column,Lo
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True,No
1,1002,Abed,Nadir,123-643-9775,93 West Main Street,No,Yes,False,No
2,1003,Walter,White,706-695-0392,298 Drugs Driveway,No,,True,No
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Yes,True,No
4,1005,Jon,Snow,876-678-3469,123 Dragons Road,Yes,No,True,Yes
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True,No
6,1007,Jeff,Winger,,1209 South Street,No,No,False,No
7,1008,Sherlock,Holmes,876-678-3469,98 Clue Drive,No,No,False,No
8,1009,Gandalf,,,123 Middle Earth,Yes,,False,No
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True,No


RETRIEVE ONLY the required data

In [107]:
finalDF = df[(df['Do_Not_Contact'] == 'No') & (df['Phone_Number'] != '')]

In [109]:
finalDF.reset_index(inplace=True)

VIEW Cleaned Table

In [110]:
finalDF

Unnamed: 0,level_0,index,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,0,0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No
1,4,4,1005,Jon,Snow,876-678-3469,123 Dragons Road,Yes,No
2,7,7,1008,Sherlock,Holmes,876-678-3469,98 Clue Drive,No,No
3,9,9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No
4,12,12,1013,Don,Draper,123-543-2345,2039 Main Street,Yes,No
5,13,13,1014,Leslie,Knope,876-678-3469,343 City Parkway,Yes,No
6,14,14,1015,Toby,Flenderson,304-762-2467,214 HR Avenue,No,No
7,15,15,1016,Ron,Weasley,123-545-5421,2395 Hogwarts Avenue,No,No
8,16,16,1017,Michael,Scott,123-643-9775,"121 Paper Avenue, Pennsylvania",Yes,No
9,19,19,1020,Anakin,Skywalker,876-678-3469,"910 Tatooine Road, Tatooine",Yes,No
