Basic Data Cleaning in Python

In [29]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np


In [30]:
# Step 2: Load the dataset
df = pd.read_csv('cleaned_dataset.csv')
#df = pd.read_excel('Customer Call List.xlsx')

# Display the first few rows of the dataset
print("Original Dataset:")
df.head()

Original Dataset:


Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,frodo,baggins,123-545-5421,"123 shire lane, shire",yes,no,True
1,1002,abed,nadir,123/643/9775,93 west main street,no,yes,False
2,1003,walter,/white,876|678|3469,298 drugs driveway,n,no,True
3,1004,dwight,schrute,123-543-2345,"980 paper avenue, pennsylvania, 18503",yes,y,True
4,1005,jon,snow,876|678|3469,123 dragons road,y,no,True


Handle missing values

In [31]:
# Fill missing values in numerical columns with the mean
numerical_columns = df.select_dtypes(include=[np.number]).columns
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

# Fill missing values in categorical columns with the mode
categorical_columns = df.select_dtypes(include=['object']).columns
df[categorical_columns] = df[categorical_columns].fillna(df[categorical_columns].mode().iloc[0])

# Check for remaining missing values
print("Missing values after cleaning:")
df.isnull().sum()

Missing values after cleaning:


CustomerID           0
First_Name           0
Last_Name            0
Phone_Number         0
Address              0
Paying Customer      0
Do_Not_Contact       0
Not_Useful_Column    0
dtype: int64

Remove Duplicates

In [32]:
# Remove duplicate rows
df.drop_duplicates()

# Check the shape of the dataset after removing duplicates
print("Shape of dataset after removing duplicates:")
df.shape

Shape of dataset after removing duplicates:


(21, 8)

Standardize Text Data

In [37]:
# Standardize text data (e.g., lowercasing, stripping whitespace)
df['First_Name'] = df[categorical_columns].apply(lambda x: x.str.strip().str.lower())

# Display a sample of the cleaned text data
print("Sample of standardized text data:")
df[categorical_columns].head()

ValueError: Columns must be same length as key

Drop Unnecessary Columns

In [None]:
# Drop columns that are not needed
df = df.drop(columns=['unnecessary_column1', 'unnecessary_column2'])

# Display the columns after dropping
print("Columns after dropping unnecessary ones:")
df.columns


Rename Columns

In [None]:
# Rename columns for better readability
df = df.rename(columns={'old_name': 'new_name'})

# Display the updated column names
print("Updated column names:")
df.columns

Save and display final cleaned dataset

In [28]:
df.to_csv('cleaned_dataset.csv', index=False)

print("Cleaned dataset saved as 'cleaned_dataset.csv'")

# Display the first few rows of the cleaned dataset
print("Final Cleaned Dataset:")
df.head()

Cleaned dataset saved as 'cleaned_dataset.csv'
Final Cleaned Dataset:


Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,frodo,baggins,123-545-5421,"123 shire lane, shire",yes,no,True
1,1002,abed,nadir,123/643/9775,93 west main street,no,yes,False
2,1003,walter,/white,876|678|3469,298 drugs driveway,n,no,True
3,1004,dwight,schrute,123-543-2345,"980 paper avenue, pennsylvania, 18503",yes,y,True
4,1005,jon,snow,876|678|3469,123 dragons road,y,no,True
