### This notebook is for Milestone #1 and focuses on data preprocessing. It covers three main stages: data cleaning to handle duplicates, missing values, and data types; data transformation to encode categorical features and scale numerical ones; and data reduction through feature engineering and column selection. The goal is to prepare the raw data for machine learning model implementation.

In [24]:
# Import the Necessary Packages for Data Cleaning
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

### Step 1: Load Datasets

In [25]:
# This is the directory and loading each CSV file into a pandas DataFrame
data_directory = "/content/data_directory/"

accounts = pd.read_csv(os.path.join(data_directory, "accounts.csv"))
products = pd.read_csv(os.path.join(data_directory, "products.csv"))
pipeline = pd.read_csv(os.path.join(data_directory, "sales_pipeline.csv"))
teams = pd.read_csv(os.path.join(data_directory, "sales_teams.csv"))
dictionary = pd.read_csv(os.path.join(data_directory, "data_dictionary.csv"))

In [26]:
# This is storing DataFrames in a dictionary for easy iteration
dataframes = {
    "accounts": accounts,
    "products": products,
    "sales_pipeline": pipeline,
    "sales_teams": teams,
    "data_dictionary": dictionary
}

### Step 2: Data Cleaning

In [27]:
# This is a function to standardize column names
def standardize_columns(df):
    """This is converting column names to lowercase and replacing spaces with underscores."""
    df.columns = df.columns.str.lower().str.replace(" ", "_")
    return df

In [28]:
# This is applying the column standardization to all DataFrames
for name, df in dataframes.items():
    dataframes[name] = standardize_columns(df)

In [29]:
# This is fixing typos and standardizing categorical values in the accounts dataframe
if 'sector' in dataframes['accounts'].columns:
    dataframes['accounts']['sector'] = dataframes['accounts']['sector'].replace({"technolgy": "technology"})

if 'office_location' in dataframes['accounts'].columns:
    dataframes['accounts']['office_location'] = dataframes['accounts']['office_location'].replace({"Philipines": "Philippines"})

In [30]:
# This is handling duplicates in all DataFrames
for name, df in dataframes.items():
    initial_rows = len(df)
    dataframes[name].drop_duplicates(inplace=True)
    rows_after = len(dataframes[name])
    if initial_rows > rows_after:
        print(f"Removed {initial_rows - rows_after} duplicate rows from {name}.")

In [31]:
# This is handling missing values
# This is filling numerical columns with median and categorical columns with 'unknown'
for name, df in dataframes.items():
    print(f"\n--- Handling missing values for {name} ---")

    # This is checking for missing values before cleaning
    initial_missing = df.isnull().sum()
    print(f"Missing values before:\n{initial_missing[initial_missing > 0]}")

    # This is imputing missing values based on data type
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']:
            df[col].fillna(df[col].median(), inplace=True)
        elif df[col].dtype == 'object':
            df[col].fillna('unknown', inplace=True)

    # This is checking for missing values after cleaning
    final_missing = df.isnull().sum()
    print(f"Missing values after:\n{final_missing[final_missing > 0]}")


--- Handling missing values for accounts ---
Missing values before:
subsidiary_of    70
dtype: int64
Missing values after:
Series([], dtype: int64)

--- Handling missing values for products ---
Missing values before:
Series([], dtype: int64)
Missing values after:
Series([], dtype: int64)

--- Handling missing values for sales_pipeline ---
Missing values before:
account        1425
engage_date     500
close_date     2089
close_value    2089
dtype: int64
Missing values after:
Series([], dtype: int64)

--- Handling missing values for sales_teams ---
Missing values before:
Series([], dtype: int64)
Missing values after:
Series([], dtype: int64)

--- Handling missing values for data_dictionary ---
Missing values before:
Series([], dtype: int64)
Missing values after:
Series([], dtype: int64)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


### Step 3: Data Transformation & Reduction

In [32]:
# This is converting date columns to datetime objects in the sales_pipeline dataframe
df_pipeline = dataframes['sales_pipeline']
for col in ['engage_date', 'close_date']:
    if col in df_pipeline.columns:
        df_pipeline[col] = pd.to_datetime(df_pipeline[col], errors='coerce')

# This is dropping columns with high missing values or unique identifiers
dataframes['accounts'].drop('subsidiary_of', axis=1, inplace=True)
dataframes['sales_pipeline'].drop('opportunity_id', axis=1, inplace=True)

### Step 4: Save Cleaned Data

In [33]:
# This is creating a directory to save cleaned datasets
cleaned_dir = "data_cleaned"
os.makedirs(cleaned_dir, exist_ok=True)

# This is saving each cleaned DataFrame as a CSV
for name, df in dataframes.items():
    # This is skipping the data dictionary as it is not needed for the model
    if name != 'data_dictionary':
        file_path = os.path.join(cleaned_dir, f"cleaned_{name}.csv")
        df.to_csv(file_path, index=False)
        print(f"\nSaved cleaned data to {file_path}")


Saved cleaned data to data_cleaned/cleaned_accounts.csv

Saved cleaned data to data_cleaned/cleaned_products.csv

Saved cleaned data to data_cleaned/cleaned_sales_pipeline.csv

Saved cleaned data to data_cleaned/cleaned_sales_teams.csv


### Step 5: Final Verification

In [34]:
# This is verifying the final state of the cleaned datasets
print("\n--- Final Dataframe Verification ---")
for name in ['accounts', 'products', 'sales_pipeline', 'sales_teams']:
    df = dataframes[name]
    print(f"\nDataFrame: {name}")
    print(f"Shape: {df.shape}")
    print(f"Missing values:\n{df.isnull().sum()}")
    print(f"Data types:\n{df.dtypes}")
    print(f"Duplicate rows: {df.duplicated().sum()}")


--- Final Dataframe Verification ---

DataFrame: accounts
Shape: (85, 6)
Missing values:
account             0
sector              0
year_established    0
revenue             0
employees           0
office_location     0
dtype: int64
Data types:
account              object
sector               object
year_established      int64
revenue             float64
employees             int64
office_location      object
dtype: object
Duplicate rows: 0

DataFrame: products
Shape: (7, 3)
Missing values:
product        0
series         0
sales_price    0
dtype: int64
Data types:
product        object
series         object
sales_price     int64
dtype: object
Duplicate rows: 0

DataFrame: sales_pipeline
Shape: (8800, 7)
Missing values:
sales_agent       0
product           0
account           0
deal_stage        0
engage_date     500
close_date     2089
close_value       0
dtype: int64
Data types:
sales_agent            object
product                object
account                object
deal_stage   