In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Exploring the datasets

## accounts.csv

This dataset shows which accounts are open, the sectors or fields they’re in, the year they were established for historical context, their revenue and number of employees, their office locations, and whether they’re subsidiaries of another company.

In [2]:
filename_accounts = os.path.join(os.getcwd(), "data_directory", "accounts.csv")
df_accounts = pd.read_csv(filename_accounts, header=0)

In [3]:
print("Rows: ", df_accounts.shape[0])
print("Columns: ", df_accounts.shape[1])
print("Column names: ", list(df_accounts.columns))
print("\nFirst 5 rows:")
df_accounts.head()

Rows:  85
Columns:  7
Column names:  ['account', 'sector', 'year_established', 'revenue', 'employees', 'office_location', 'subsidiary_of']

First 5 rows:


Unnamed: 0,account,sector,year_established,revenue,employees,office_location,subsidiary_of
0,Acme Corporation,technolgy,1996,1100.04,2822,United States,
1,Betasoloin,medical,1999,251.41,495,United States,
2,Betatech,medical,1986,647.18,1185,Kenya,
3,Bioholding,medical,2012,587.34,1356,Philipines,
4,Bioplex,medical,1991,326.82,1016,United States,


In [4]:
print("\nInfo: ")
df_accounts.info()


Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   account           85 non-null     object 
 1   sector            85 non-null     object 
 2   year_established  85 non-null     int64  
 3   revenue           85 non-null     float64
 4   employees         85 non-null     int64  
 5   office_location   85 non-null     object 
 6   subsidiary_of     15 non-null     object 
dtypes: float64(1), int64(2), object(4)
memory usage: 4.8+ KB


### Missing Values in Accounts Dataframe

I noticed that the `subsidiary_of` column has mostly missing values (70 out of 85) and they are strings so they can't be replaced with a dummy variable or average. Replacing the missing values with the mean only makes sense for the numerically valued columns.

In [5]:
nan_count_accounts = df_accounts.isnull().sum()
nan_count_accounts

account              0
sector               0
year_established     0
revenue              0
employees            0
office_location      0
subsidiary_of       70
dtype: int64

In [6]:
list(df_accounts['subsidiary_of'])

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Acme Corporation',
 nan,
 nan,
 'Massive Dynamic',
 'Acme Corporation',
 nan,
 nan,
 'Bubba Gump',
 'Inity',
 nan,
 nan,
 'Acme Corporation',
 nan,
 nan,
 nan,
 'Sonron',
 nan,
 nan,
 'Golddex',
 nan,
 nan,
 nan,
 nan,
 'Sonron',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Acme Corporation',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Warephase',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Bubba Gump',
 'Inity',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Sonron',
 nan,
 'Golddex',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan]

## data_dictionary.csv

This dataset explains what all of the fields (column names) in all of the other datasets mean.

In [7]:
filename_dictionary = os.path.join(os.getcwd(), "data_directory", "data_dictionary.csv")
df_dictionary = pd.read_csv(filename_dictionary, header=0)

In [8]:
print("Rows: ", df_dictionary.shape[0])
print("Columns: ", df_dictionary.shape[1])
print("Column names: ", list(df_dictionary.columns))
print("\nAll rows:")
df_dictionary.head(21)

Rows:  21
Columns:  3
Column names:  ['Table', 'Field', 'Description']

All rows:


Unnamed: 0,Table,Field,Description
0,accounts,account,Company name
1,accounts,sector,Industry
2,accounts,year_established,Year Established
3,accounts,revenue,Annual revenue (in millions of USD)
4,accounts,employees,Number of employees
5,accounts,office_location,Headquarters
6,accounts,subsidiary_of,Parent company
7,products,product,Product name
8,products,series,Product series
9,products,sales_price,Suggested retail price


In [9]:
print("\nInfo: ")
df_dictionary.info()


Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Table        21 non-null     object
 1   Field        21 non-null     object
 2   Description  21 non-null     object
dtypes: object(3)
memory usage: 632.0+ bytes


### Missing Values in Data Dictionary Dataframe

I noticed there were no missing values in the data dictionary dataframe so it is all good to go!

In [10]:
nan_count_dictionary = df_dictionary.isnull().sum()
nan_count_dictionary

Table          0
Field          0
Description    0
dtype: int64

## products.csv

This dataset shows products and sales prices for 7 products that the sales team most likely tracks.

In [11]:
filename_products = os.path.join(os.getcwd(), "data_directory", "products.csv")
df_products = pd.read_csv(filename_products, header=0)

In [12]:
print("Rows: ", df_products.shape[0])
print("Columns: ", df_products.shape[1])
print("Column names: ", list(df_products.columns))
print("\nAll rows:")
df_products.head(7)

Rows:  7
Columns:  3
Column names:  ['product', 'series', 'sales_price']

All rows:


Unnamed: 0,product,series,sales_price
0,GTX Basic,GTX,550
1,GTX Pro,GTX,4821
2,MG Special,MG,55
3,MG Advanced,MG,3393
4,GTX Plus Pro,GTX,5482
5,GTX Plus Basic,GTX,1096
6,GTK 500,GTK,26768


In [13]:
print("\nInfo: ")
df_products.info()


Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   product      7 non-null      object
 1   series       7 non-null      object
 2   sales_price  7 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 296.0+ bytes


### Missing Values in Products Dataframe

There were also no missing values in the products dataframe so it is all good to go!

In [14]:
nan_count_products = df_products.isnull().sum()
nan_count_products

product        0
series         0
sales_price    0
dtype: int64

## sales_pipeline.csv

This dataset links sales agents to the products in `products.csv`, showing which deals were active, when they started, when they closed, the value at closing, and the account involved.

In [15]:
filename_pipeline = os.path.join(os.getcwd(), "data_directory", "sales_pipeline.csv")
df_pipeline = pd.read_csv(filename_pipeline, header=0)

In [16]:
print("Rows: ", df_pipeline.shape[0])
print("Columns: ", df_pipeline.shape[1])
print("Column names: ", list(df_pipeline.columns))
print("\nFirst 5 rows:")
df_pipeline.head()

Rows:  8800
Columns:  8
Column names:  ['opportunity_id', 'sales_agent', 'product', 'account', 'deal_stage', 'engage_date', 'close_date', 'close_value']

First 5 rows:


Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value
0,1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0
1,Z063OYW0,Darcel Schlecht,GTXPro,Isdom,Won,2016-10-25,2017-03-11,4514.0
2,EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0
3,MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0
4,PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0


In [17]:
print("\nInfo: ")
df_pipeline.info()


Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8800 entries, 0 to 8799
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   opportunity_id  8800 non-null   object 
 1   sales_agent     8800 non-null   object 
 2   product         8800 non-null   object 
 3   account         7375 non-null   object 
 4   deal_stage      8800 non-null   object 
 5   engage_date     8300 non-null   object 
 6   close_date      6711 non-null   object 
 7   close_value     6711 non-null   float64
dtypes: float64(1), object(7)
memory usage: 550.1+ KB


### Missing Values in Sales Pipeline Dataframe

In [18]:
nan_count_pipeline = df_pipeline.isnull().sum()
nan_count_pipeline

opportunity_id       0
sales_agent          0
product              0
account           1425
deal_stage           0
engage_date        500
close_date        2089
close_value       2089
dtype: int64

In [19]:
nan_list_pipeline = list(df_pipeline.columns[df_pipeline.isnull().any()])
nan_list_pipeline

['account', 'engage_date', 'close_date', 'close_value']

The Sales Pipeline dataframe had a few columns with missing values: `account`, `engage_date`, `close_date`, `close_value`. These missing values appear consistently for a stretch of rows, then stop after a certain point so they don't seem random.

Out of the 8,800 rows, there are 2,089 missing values in `close_date` and `close_value`, which makes me think that these columns stop being filled at row index 6,711. Also, it looks like  the `account` column stops being filled at row index 7,375.

In [20]:
df_pipeline[nan_list_pipeline]

Unnamed: 0,account,engage_date,close_date,close_value
0,Cancity,2016-10-20,2017-03-01,1054.0
1,Isdom,2016-10-25,2017-03-11,4514.0
2,Cancity,2016-10-25,2017-03-07,50.0
3,Codehow,2016-10-25,2017-03-09,588.0
4,Hatfan,2016-10-25,2017-03-02,517.0
...,...,...,...,...
8795,,,,
8796,,,,
8797,,,,
8798,,,,


This debunks my assumption and it seems that the `close_date` column's missing values are a bit sporatic.

In [21]:
df_pipeline['close_date'][8100:8150]

8100           NaN
8101    2017-12-08
8102    2017-11-25
8103    2017-12-08
8104    2017-12-21
8105    2017-12-10
8106    2017-11-24
8107    2017-11-25
8108    2017-12-22
8109    2017-11-27
8110    2017-12-29
8111    2017-12-01
8112    2017-11-26
8113    2017-12-07
8114    2017-12-06
8115    2017-11-27
8116    2017-12-06
8117    2017-12-05
8118           NaN
8119    2017-12-02
8120           NaN
8121    2017-12-08
8122    2017-11-25
8123    2017-11-28
8124    2017-12-07
8125           NaN
8126           NaN
8127    2017-12-05
8128    2017-12-09
8129    2017-12-04
8130    2017-12-18
8131    2017-12-11
8132    2017-12-05
8133           NaN
8134    2017-11-29
8135    2017-12-05
8136    2017-11-29
8137    2017-12-28
8138    2017-12-10
8139    2017-12-11
8140    2017-12-10
8141    2017-11-29
8142    2017-12-05
8143    2017-11-29
8144    2017-11-29
8145    2017-12-07
8146    2017-12-10
8147    2017-12-14
8148    2017-12-11
8149    2017-12-07
Name: close_date, dtype: object

I'll replace the nan values in the `close_value` column with the mean close value.

In [22]:
mean_close_value = df_pipeline['close_value'].mean()

df_pipeline['close_value'].fillna(value =mean_close_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_pipeline['close_value'].fillna(value =mean_close_value, inplace=True)


Checking to make sure there are now 0 missing values in `df_pipeline['close_value']`.

In [23]:
df_pipeline['close_value'].isnull().sum()

np.int64(0)

## sales_teams.csv

In [24]:
filename_teams = os.path.join(os.getcwd(), "data_directory", "sales_teams.csv")
df_teams = pd.read_csv(filename_teams, header=0)

In [25]:
print("Rows: ", df_teams.shape[0])
print("Columns: ", df_teams.shape[1])
print("Column names: ", list(df_teams.columns))
print("\nFirst 5 rows:")
df_teams.head()

Rows:  35
Columns:  3
Column names:  ['sales_agent', 'manager', 'regional_office']

First 5 rows:


Unnamed: 0,sales_agent,manager,regional_office
0,Anna Snelling,Dustin Brinkmann,Central
1,Cecily Lampkin,Dustin Brinkmann,Central
2,Versie Hillebrand,Dustin Brinkmann,Central
3,Lajuana Vencill,Dustin Brinkmann,Central
4,Moses Frase,Dustin Brinkmann,Central


In [26]:
print("\nInfo: ")
df_teams.info()


Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   sales_agent      35 non-null     object
 1   manager          35 non-null     object
 2   regional_office  35 non-null     object
dtypes: object(3)
memory usage: 968.0+ bytes


### Missing Values in Sales Teams Dataframe

No missing values so we're good to go!

In [27]:
nan_count_teams = df_teams.isnull().sum()
nan_count_teams

sales_agent        0
manager            0
regional_office    0
dtype: int64

# All Dataframes and their Missing/Duplicate Percentages

In [28]:

# All the dataframes in a dictionary for easy iteration
dfs_nan = {
    "dictionary": (df_dictionary, nan_count_dictionary),
    "products": (df_products, nan_count_products),
    "pipeline": (df_pipeline, nan_count_pipeline),
    "teams": (df_teams, nan_count_teams),
}

# Looping through each dataframe
for name, (df, nan_count) in dfs_nan.items():
    print(f"\n--- {name} dataframe ---")
    
    # Missing percentages
    missing_percent = (nan_count / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': nan_count,
        'Missing Percentage': missing_percent
    })
    print(missing_df[missing_df['Missing Count'] > 0])
    
    # Duplicates
    print("\nDuplicate rows:", df.duplicated().sum())



--- dictionary dataframe ---
Empty DataFrame
Columns: [Missing Count, Missing Percentage]
Index: []

Duplicate rows: 0

--- products dataframe ---
Empty DataFrame
Columns: [Missing Count, Missing Percentage]
Index: []

Duplicate rows: 0

--- pipeline dataframe ---
             Missing Count  Missing Percentage
account               1425           16.193182
engage_date            500            5.681818
close_date            2089           23.738636
close_value           2089           23.738636

Duplicate rows: 0

--- teams dataframe ---
Empty DataFrame
Columns: [Missing Count, Missing Percentage]
Index: []

Duplicate rows: 0
