In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
import os

# Show which raw files exist in the data folder before loading anything.
available_files = os.listdir("../data_directory")
print(available_files)

['products.csv', 'accounts.csv', 'data.md', 'sales_pipeline.csv', 'sales_teams.csv', 'data_dictionary.csv']


Read all data files:

In [11]:
# Load the four raw CSV tables. The order of the paths below must match the
# order of the names on the left-hand side of the unpacking assignment.
raw_csv_paths = (
    "../data_directory/accounts.csv",
    "../data_directory/products.csv",
    "../data_directory/sales_pipeline.csv",
    "../data_directory/sales_teams.csv",
)
df_accounts, df_products, df_pipeline, df_teams = map(pd.read_csv, raw_csv_paths)

In [12]:
# Preview each table: first rows, schema/dtypes, and per-column missing counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after every report.
for label, frame in (("Accounts:", df_accounts), ("Products:", df_products)):
    print(label)
    print(frame.head())
    frame.info()
    print(frame.isna().sum())


Accounts:
            account     sector  year_established  revenue  employees  \
0  Acme Corporation  technolgy              1996  1100.04       2822   
1        Betasoloin    medical              1999   251.41        495   
2          Betatech    medical              1986   647.18       1185   
3        Bioholding    medical              2012   587.34       1356   
4           Bioplex    medical              1991   326.82       1016   

  office_location subsidiary_of  
0   United States           NaN  
1   United States           NaN  
2           Kenya           NaN  
3      Philipines           NaN  
4   United States           NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   account           85 non-null     object 
 1   sector            85 non-null     object 
 2   year_established  85 non-null     int64  
 3   revenue    

For accounts, each row is a company account with features such as sector and revenue. There are 85 companies (rows) and 7 columns. However, subsidiary_of is missing for 70 companies, meaning no parent company is listed for them.
For products, there are 7 rows (one per product) and 3 columns. There are no missing values in the products data set.

In [13]:
# Normalise headers on every table: lower-case, with spaces turned into
# underscores, so later cells can refer to columns by one predictable name.
for frame in (df_accounts, df_products, df_pipeline, df_teams):
    frame.columns = frame.columns.str.lower().str.replace(' ', '_')

In [None]:
def clip_iqr(series, k=1.5):
    """Clip a numeric Series to the IQR fences [Q1 - k*IQR, Q3 + k*IQR].

    NOTE(review): this cell called ``clip_iqr`` without any definition earlier
    in the notebook, which fails with NameError on a fresh kernel. The
    implementation here assumes the conventional interquartile-range fence
    meaning of the name -- confirm that this matches the intended behaviour.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to clip.
    k : float, default 1.5
        Multiplier applied to the interquartile range to set the fences.

    Returns
    -------
    pandas.Series
        A new Series in which values outside the fences are replaced by the
        nearest fence value; the input is not modified.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1
    return series.clip(lower=q1 - k * spread, upper=q3 + k * spread)


# Fix some typos in the accounts data set:
if "sector" in df_accounts.columns:
    df_accounts["sector"] = df_accounts["sector"].replace({"technolgy": "technology"})
if "office_location" in df_accounts.columns:
    df_accounts["office_location"] = df_accounts["office_location"].replace({"Philipines": "Philippines"})

# Drop rows missing a primary key if present (use what exists)
for key in ["account_id", "customer_id"]:
    if key in df_accounts.columns:
        df_accounts.dropna(subset=[key], inplace=True)

# Fill missing values: numeric columns get the column median
for col in df_accounts.select_dtypes(include="number"):
    if df_accounts[col].isna().any():
        df_accounts[col] = df_accounts[col].fillna(df_accounts[col].median())

# Text columns get the placeholder "unknown"; whitespace is stripped only on
# the columns that had missing values, because the strip is chained onto the fill.
for col in df_accounts.select_dtypes(include="object"):
    if df_accounts[col].isna().any():
        df_accounts[col] = df_accounts[col].fillna("unknown").str.strip()

# Clip outliers on numeric columns
for col in df_accounts.select_dtypes(include="number"):
    df_accounts[col] = clip_iqr(df_accounts[col], k=1.5)

# save
df_accounts.to_csv("../data_directory/accounts_clean.csv", index=False)

# quick confirmation
df_accounts.head(), df_accounts.isna().sum()

In [None]:
# Known misspellings in the accounts table, keyed by the column they occur in.
accounts_typo_fixes = {
    "sector": {"technolgy": "technology"},
    "office_location": {"Philipines": "Philippines"},
}
for column, corrections in accounts_typo_fixes.items():
    if column in df_accounts.columns:
        df_accounts[column] = df_accounts[column].replace(corrections)

# Candidate primary-key columns; each table only has some (or none) of them.
id_candidates = ("customer_id", "account_id", "product_id", "opportunity_id", "employee_id")

for frame in (df_accounts, df_products, df_pipeline, df_teams):
    # Remove rows that lack a value in any primary-key column the table has.
    present_ids = [name for name in id_candidates if name in frame.columns]
    if present_ids:
        frame.dropna(subset=present_ids, inplace=True)

    # Numeric gaps are filled with the column median.
    for column in frame.select_dtypes(include="number").columns:
        values = frame[column]
        if values.isna().any():
            frame[column] = values.fillna(values.median())

    # Text gaps are filled with "unknown"; the strip only runs on columns
    # that actually had missing values.
    for column in frame.select_dtypes(include="object").columns:
        values = frame[column]
        if values.isna().any():
            frame[column] = values.fillna("unknown").str.strip()

# Persist every cleaned table next to its raw source file.
df_accounts.to_csv("../data_directory/accounts_clean.csv", index=False)
df_products.to_csv("../data_directory/products_clean.csv", index=False)
df_pipeline.to_csv("../data_directory/sales_pipeline_clean.csv", index=False)
df_teams.to_csv("../data_directory/sales_teams_clean.csv", index=False)

In [20]:
# Reload the cleaned accounts file written by the cleaning cells, so this
# check does not rely on a `clean_accounts` variable that is never assigned
# earlier in the notebook (it would raise NameError on a fresh kernel).
clean_accounts = pd.read_csv("../data_directory/accounts_clean.csv")

# Confirm the typo fixes took effect by listing the distinct category values.
print(clean_accounts["sector"].unique())
print(clean_accounts["office_location"].unique())



['technology' 'medical' 'retail' 'software' 'entertainment' 'marketing'
 'telecommunications' 'finance' 'employment' 'services']
['United States' 'Kenya' 'Philippines' 'Japan' 'Italy' 'Norway' 'Korea'
 'Jordan' 'Brazil' 'Germany' 'Panama' 'Belgium' 'Romania' 'Poland' 'China']


Clean sales_pipeline.csv:

In [27]:

# Remove rows without an identifier, using whichever ID columns the table has.
pipeline_id_cols = [c for c in ("opportunity_id", "deal_id") if c in df_pipeline.columns]
if pipeline_id_cols:
    df_pipeline.dropna(subset=pipeline_id_cols, inplace=True)

# Parse the date columns; values that cannot be parsed become NaT.
pipeline_date_cols = [c for c in ("engage_date", "close_date") if c in df_pipeline.columns]
for date_col in pipeline_date_cols:
    df_pipeline[date_col] = pd.to_datetime(df_pipeline[date_col], errors="coerce")

# Columns whose name contains one of these fragments are coerced to numbers;
# values that cannot be parsed become NaN.
pipeline_numeric_hints = ("amount", "value", "revenue", "probability")
pipeline_numeric_like = [
    name for name in df_pipeline.columns
    if any(hint in name for hint in pipeline_numeric_hints)
]
for name in pipeline_numeric_like:
    df_pipeline[name] = pd.to_numeric(df_pipeline[name], errors="coerce")

# Fill missing values: numeric columns get the column median.
for name in df_pipeline.select_dtypes(include="number").columns:
    values = df_pipeline[name]
    if values.isna().any():
        df_pipeline[name] = values.fillna(values.median())

# Text columns get "unknown" (and are stripped) when they contain gaps.
for name in df_pipeline.select_dtypes(include="object").columns:
    values = df_pipeline[name]
    if values.isna().any():
        df_pipeline[name] = values.fillna("unknown").str.strip()

# Write the cleaned pipeline table.
df_pipeline.to_csv("../data_directory/sales_pipeline_clean.csv", index=False)


Clean sales_teams.csv:

In [28]:
# Remove rows without an identifier, using whichever ID columns the table has.
team_id_cols = [c for c in ("employee_id", "rep_id", "user_id") if c in df_teams.columns]
if team_id_cols:
    df_teams.dropna(subset=team_id_cols, inplace=True)

# Columns whose name contains one of these fragments are coerced to numbers;
# values that cannot be parsed become NaN.
team_numeric_hints = ("quota", "target", "deals", "score")
team_numeric_like = [
    name for name in df_teams.columns
    if any(hint in name for hint in team_numeric_hints)
]
for name in team_numeric_like:
    df_teams[name] = pd.to_numeric(df_teams[name], errors="coerce")

# Fill missing values: numeric columns get the column median.
for name in df_teams.select_dtypes(include="number").columns:
    values = df_teams[name]
    if values.isna().any():
        df_teams[name] = values.fillna(values.median())

# Text columns get "unknown" (and are stripped) when they contain gaps.
for name in df_teams.select_dtypes(include="object").columns:
    values = df_teams[name]
    if values.isna().any():
        df_teams[name] = values.fillna("unknown").str.strip()

# Write the cleaned teams table.
df_teams.to_csv("../data_directory/sales_teams_clean.csv", index=False)


In [29]:
def check_before_after(name, data_dir="../data_directory"):
    """Print a before/after comparison of a raw CSV and its cleaned version.

    Reads ``{data_dir}/{name}.csv`` and ``{data_dir}/{name}_clean.csv`` and
    prints the row counts, the per-column missing-value counts of both files,
    and the column names of the cleaned file.

    Parameters
    ----------
    name : str
        File stem of the data set, without the ``.csv`` extension
        (for example ``"accounts"``).
    data_dir : str, default "../data_directory"
        Folder containing both the raw and the ``_clean`` file. The default
        keeps the location the notebook has always used.

    Returns
    -------
    None
        The report is written to standard output.
    """
    raw = pd.read_csv(f"{data_dir}/{name}.csv")
    clean = pd.read_csv(f"{data_dir}/{name}_clean.csv")
    print(f"\n=== {name.upper()} ===")
    # Equal row counts mean cleaning dropped no records.
    print("Rows:", len(raw), "→", len(clean))
    print("Missing before:\n", raw.isna().sum())
    print("Missing after:\n", clean.isna().sum())
    print("Columns after:", list(clean.columns))

# Run the before/after report for each of the four cleaned data sets.
datasets_to_check = ("accounts", "products", "sales_pipeline", "sales_teams")
for dataset_name in datasets_to_check:
    check_before_after(dataset_name)



=== ACCOUNTS ===
Rows: 85 → 85
Missing before:
 account              0
sector               0
year_established     0
revenue              0
employees            0
office_location      0
subsidiary_of       70
dtype: int64
Missing after:
 account             0
sector              0
year_established    0
revenue             0
employees           0
office_location     0
subsidiary_of       0
dtype: int64
Columns after: ['account', 'sector', 'year_established', 'revenue', 'employees', 'office_location', 'subsidiary_of']

=== PRODUCTS ===
Rows: 7 → 7
Missing before:
 product        0
series         0
sales_price    0
dtype: int64
Missing after:
 product        0
series         0
sales_price    0
dtype: int64
Columns after: ['product', 'series', 'sales_price']

=== SALES_PIPELINE ===
Rows: 8800 → 8800
Missing before:
 opportunity_id       0
sales_agent          0
product              0
account           1425
deal_stage           0
engage_date        500
close_date        2089
close_value   