In [23]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [62]:
accountFile = os.path.join(os.getcwd(), 'Datasets',"accounts.csv")
dict_File = os.path.join(os.getcwd(), 'Datasets',"data_dictionary.csv")
productsFile = os.path.join(os.getcwd(), 'Datasets',"products.csv")
pipelineFile = os.path.join(os.getcwd(), 'Datasets',"sales_pipeline.csv")
teamsFile = os.path.join(os.getcwd(), 'Datasets',"sales_teams.csv")

df_accounts = pd.read_csv(accountFile)
df_dictData = pd.read_csv(dict_File)
df_products = pd.read_csv(productsFile)
df_pipeline = pd.read_csv(pipelineFile)
df_teams = pd.read_csv(teamsFile)

In [63]:
datasets = {
    "Accounts" : df_accounts,
    "DictData" : df_dictData,
    "Products" : df_products,
    "Pipeline" : df_pipeline,
    "Teams" : df_teams
}

In [64]:
#standerizes the column names
def standerizeColumnNames(dataDict_List):
    for names, df in dataDict_List.items():
        print(f"standerizing columns {names}")
        for col in df.columns:
            if df[col].dtype == object:
                df[col] = df[col].str.strip().str.lower().str.replace(' ','_')
        print(f"Completed: {names}")

In [65]:
def dataset_analysis(dataDict_List):
    for name, df in dataDict_List.items():
        print(f"Missing Values for {name}")
        missingValues = df.isnull().sum()
        missingValues = missingValues.sort_values(ascending=False)

        print(missingValues)

In [66]:
#Note: when I use dataDict_List I refer to a list of dictionaries for our dataframes, every functionality changed in the dataset has its corresponding functions

#performs median imputation which is the process of replacing each missing value with a numerical value that is the meadian of all the non-missing values; it also drops all duplicate values
def clean_datasets(dataDict_List):
    print("Cleaning Datasets")
    for df in dataDict_List.values():
        #duplicates
        df.drop_duplicates(inplace=True)
        #missing values
        for col in df.columns:
            if df[col].dtype in ["int64", "float64", "number"]:
                median_values = df[col].median()
                df[col] = df[col].fillna(median_values)
            else:
                mode = df[col].mode()[0]
                df[col] = df[col].fillna(mode)

    print("Dataset Cleaned")


In [67]:
#standardizes all dates in the dataset
def standardize_dates(dataDict_List):
    #standardize date
    for df in dataDict_List.values():
        for col in df.columns:
            if "date" in col.lower():
                df[col] = pd.to_datetime(df[col], errors="coerce")

In [68]:
def currency_standard(dataDict_List):
    for df in dataDict_List.values():
        for col in df.columns:
            moneyColumns = [c for c in df.columns if "amount" in c or "revenue" in c]
            for col in moneyColumns:
                df[col] = df[col].astype(float)

In [69]:
dataset_analysis(datasets)

Missing Values for Accounts
subsidiary_of       70
account              0
sector               0
year_established     0
revenue              0
employees            0
office_location      0
dtype: int64
Missing Values for DictData
Table          0
Field          0
Description    0
dtype: int64
Missing Values for Products
product        0
series         0
sales_price    0
dtype: int64
Missing Values for Pipeline
close_date        2089
close_value       2089
account           1425
engage_date        500
opportunity_id       0
sales_agent          0
product              0
deal_stage           0
dtype: int64
Missing Values for Teams
sales_agent        0
manager            0
regional_office    0
dtype: int64


In [70]:
clean_datasets(datasets)
standerizeColumnNames(datasets)
standardize_dates(datasets)
currency_standard(datasets)

Cleaning Datasets
Dataset Cleaned
standerizing columns Accounts
Completed: Accounts
standerizing columns DictData
Completed: DictData
standerizing columns Products
Completed: Products
standerizing columns Pipeline
Completed: Pipeline
standerizing columns Teams
Completed: Teams


In [71]:
def analysisDataSets(dataDict_List):
    for name, df in dataDict_List.items():
        print(f"Information of {name}:")
        df.info()
analysisDataSets(datasets)

Information of Accounts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   account           85 non-null     object 
 1   sector            85 non-null     object 
 2   year_established  85 non-null     int64  
 3   revenue           85 non-null     float64
 4   employees         85 non-null     int64  
 5   office_location   85 non-null     object 
 6   subsidiary_of     85 non-null     object 
dtypes: float64(1), int64(2), object(4)
memory usage: 4.8+ KB
Information of DictData:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Table        21 non-null     object
 1   Field        21 non-null     object
 2   Description  21 non-null     object
dtypes: object(3)
memory usage: 636.0+ bytes
Information

In [73]:
def save_csv(dataDict_List):
    os.makedirs("data_cleaned", exist_ok=True)
    for name, df in dataDict_List.items():
        try:
           df.to_csv(f"data_cleaned/{name}.csv", index=False)
        except Exception as e:
            print(f"Failed to save {name}: {e}")

In [74]:
save_csv(datasets)