In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd

In [None]:
# Load the dealer export CSV from the dataset folder into a DataFrame
df = pd.read_csv(filepath_or_buffer="dataset/Dealer_Export.csv")

# Keep an independent deep copy of the frame exactly as it was loaded
df2 = df.copy(deep=True)

In [None]:
# Helper that normalises a raw package value into a standard package label
def package_structure(x):
    """Map a raw package value to its standardised package label.

    Rules are applied in this order and the first match wins:

    1. A missing value (as judged by ``pd.isna``) -> ``"No Package Info"``.
    2. The text contains ``"Level 0"`` .. ``"Level 4"`` (checked in ascending
       order, anywhere in the text) -> ``"CZ Level <n> Package"``.
    3. The text starts with ``"Accelerate"``, ``"Ignite"`` or ``"Rev"``
       -> ``"CZ <name> Package"``.
    4. Anything else -> ``"Other Package"``.

    Parameters
    ----------
    x : scalar
        Raw package value; non-string values are converted with ``str``.

    Returns
    -------
    str
        The standardised package label.
    """
    # Missing values get their own label before any text handling
    if pd.isna(x):
        return "No Package Info"

    # Work on the string form so substring / prefix checks are always valid
    raw_text = str(x)

    # Substring rules: the level markers take priority over the name prefixes
    level_labels = (
        ("Level 0", "CZ Level 0 Package"),
        ("Level 1", "CZ Level 1 Package"),
        ("Level 2", "CZ Level 2 Package"),
        ("Level 3", "CZ Level 3 Package"),
        ("Level 4", "CZ Level 4 Package"),
    )
    for marker, label in level_labels:
        if marker in raw_text:
            return label

    # Prefix rules: only consulted when no level marker was found
    prefix_labels = (
        ("Accelerate", "CZ Accelerate Package"),
        ("Ignite", "CZ Ignite Package"),
        ("Rev", "CZ Rev Package"),
    )
    for prefix, label in prefix_labels:
        if raw_text.startswith(prefix):
            return label

    return "Other Package"

In [None]:
# Data exploration: report the frame's shape, the dtype of every column and
# the number of missing values per column (one message per line of output)
print(
    f'The shape of the imported data set is: {df.shape}\n',
    f'The imported dataset has following data types:\n {df.dtypes}\n',
    f'The dataset has following count of null values:\n{df.isna().sum()}\n',
    sep='\n',
)

# Show the first five rows as the cell's output
df.head(5)

In [None]:
# Remove, in place, every row whose trading_name value is missing
df.dropna(axis="index", how="any", subset=["trading_name"], inplace=True)

# Re-count the missing values per column to confirm the effect of the drop
print(f'The modified dataset has following count of null values:\n{df.isna().sum()}\n')

In [None]:
# Report the number of fully duplicated rows, followed by the number of
# distinct values held in each column
print(
    f'The dataset has following duplicate values:\n {df.duplicated().sum()}\n',
    f'The dataset has following representation of unique values:\n {df.nunique()}\n',
    sep='\n',
)

In [None]:
# Store did and reg_code with the generic object dtype
df['did'] = df['did'].astype(object)
df['reg_code'] = df['reg_code'].astype(object)

# Parse created_date into datetime values
df['created_date'] = df['created_date'].pipe(pd.to_datetime)

# Show the dtypes again to confirm the conversions
print(f'The modified dataset has following data types:\n {df.dtypes}\n')

In [None]:
# Derive a "<abbreviated month>, <year>" label from the ad creation date
df['ad_created_in'] = df['created_date'].dt.strftime(date_format='%b, %Y')

# Normalise each raw package value into its standard package label
df['package_redefined'] = df['package'].map(package_structure)

In [None]:
# Dropping the columns that are no longer wanted in the data frame.
# errors='ignore' keeps this cell re-runnable: the drop happens in place, so on
# a second execution the columns are already gone and the default
# errors='raise' would stop the notebook with a KeyError.
df.drop(columns=['package', 'created_date'], inplace=True, errors='ignore')

# Displaying the final dataset head
df.head()

In [None]:
# Write the pre-processed frame to the dataset folder as CSV; index=False
# leaves the row index out of the file
df.to_csv(path_or_buf='dataset/dealer_export_cleaned.csv', index=False)