Getting Started | Data Prep | **Data Exploration** | Preprocessing | Model Tuning | Final Model

In [22]:
# Import libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
#import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import pickle
from __future__ import division

%matplotlib inline
#plt.style.use('fivethirtyeight')

In [23]:
# adding the modules directory to path
import sys
sys.path.insert(0, '../modules')

# reading in functions
from helpers import read_in_dataset, get_percent

## Read in Training Data 

In [3]:
# Read the processed training data (a CSV file) into a dataframe.
# Only empty fields are parsed as missing values. pandas >= 2.0 also treats the
# literal string 'None' as NaN by default; 'Competitor Type' presumably uses
# 'None' as a real category (TODO confirm against the raw data), so the default
# NA markers are disabled to keep results stable across pandas versions.
sales_data_train = pd.read_csv(
    '../data/processed/sales_data_train.csv',
    keep_default_na=False,
    na_values=[''],
)

In [4]:
# Preview the first 5 rows of the training data
sales_data_train.head()

Unnamed: 0,Opportunity Number,Supplies Subgroup,Supplies Group,Region,Route To Market,Elapsed Days In Sales Stage,Opportunity Result,Sales Stage Change Count,Total Days Identified Through Closing,Total Days Identified Through Qualified,Opportunity Amount USD,Client Size By Revenue,Client Size By Employee Count,Revenue From Client Past Two Years,Competitor Type,Ratio Days Identified To Total Days,Ratio Days Validated To Total Days,Ratio Days Qualified To Total Days,Deal Size Category
0,6748356,Batteries & Accessories,Car Accessories,Northwest,Fields Sales,91,Loss,2,3,3,20000,4,4,0,Unknown,1.0,0.0,0.0,2
1,9608239,Exterior Accessories,Car Accessories,Northeast,Reseller,12,Won,2,5,5,8000,1,1,0,,0.0,1.0,0.0,1
2,8139132,Garage & Car Care,Car Accessories,Pacific,Reseller,35,Loss,3,7,7,9300,1,1,0,Unknown,0.0,1.0,0.0,1
3,8423006,Motorcycle Parts,Performance & Non-auto,Northwest,Reseller,7,Loss,7,20,20,26315,1,1,0,Unknown,0.0,0.172589,0.827411,3
4,8104406,Motorcycle Parts,Performance & Non-auto,Midwest,Reseller,26,Loss,2,17,17,9202,1,1,0,Unknown,0.0,1.0,0.0,1


In [5]:
# Check the shape and data types of the training data
sales_data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54617 entries, 0 to 54616
Data columns (total 19 columns):
Opportunity Number                         54617 non-null int64
Supplies Subgroup                          54617 non-null object
Supplies Group                             54617 non-null object
Region                                     54617 non-null object
Route To Market                            54617 non-null object
Elapsed Days In Sales Stage                54617 non-null int64
Opportunity Result                         54617 non-null object
Sales Stage Change Count                   54617 non-null int64
Total Days Identified Through Closing      54617 non-null int64
Total Days Identified Through Qualified    54617 non-null int64
Opportunity Amount USD                     54617 non-null int64
Client Size By Revenue                     54617 non-null int64
Client Size By Employee Count              54617 non-null int64
Revenue From Client Past Two Years         54617 non-n

## Check for Duplicates

In [6]:
# Does every row carry its own, distinct opportunity ID?
sales_data_train['Opportunity Number'].is_unique

False

In [7]:
# Flag every row whose opportunity number occurs more than once in the training data
has_repeated_opp_num = sales_data_train.duplicated(subset='Opportunity Number', keep=False)

# Split the data: opportunities with several records vs. exactly one record
multiple_records = sales_data_train[has_repeated_opp_num].sort_values('Opportunity Number')
one_record = sales_data_train[~has_repeated_opp_num]

multiple_records.head()
# Some records seem to differ by subgroup, while others seem to be duplicate records

Unnamed: 0,Opportunity Number,Supplies Subgroup,Supplies Group,Region,Route To Market,Elapsed Days In Sales Stage,Opportunity Result,Sales Stage Change Count,Total Days Identified Through Closing,Total Days Identified Through Qualified,Opportunity Amount USD,Client Size By Revenue,Client Size By Employee Count,Revenue From Client Past Two Years,Competitor Type,Ratio Days Identified To Total Days,Ratio Days Validated To Total Days,Ratio Days Qualified To Total Days,Deal Size Category
29326,1724032,Car Electronics,Car Electronics,Southwest,Fields Sales,210,Won,5,94,90,469000,1,1,0,,0.073248,0.839703,0.04034,6
37187,1724032,Car Electronics,Car Electronics,Southwest,Fields Sales,210,Won,5,94,90,469000,1,1,0,,0.073248,0.839703,0.04034,6
37087,4787647,Batteries & Accessories,Car Accessories,Southeast,Fields Sales,26,Won,6,49,46,0,1,4,0,Unknown,0.431174,0.293522,0.210526,1
25553,4787647,Shelters & RV,Performance & Non-auto,Southeast,Fields Sales,26,Won,6,49,46,0,1,4,0,Unknown,0.431174,0.293522,0.210526,1
47134,4787647,Interior Accessories,Car Accessories,Southeast,Fields Sales,26,Won,6,49,46,6000,1,4,0,Unknown,0.431174,0.293522,0.210526,1


In [8]:
# Sanity check: the two subsets together account for every training row
assert len(multiple_records) + len(one_record) == len(sales_data_train)

# Rows that are exact copies of another row (every column matches)
is_exact_copy = sales_data_train.duplicated(keep=False)
duplicates = sales_data_train[is_exact_copy]

# Opportunities that share an opportunity number but are not completely the same:
# keep the multi-record rows whose opportunity number never appears among the exact copies
in_exact_copy_group = multiple_records['Opportunity Number'].isin(duplicates['Opportunity Number'])
same_opp_nums_not_dup = multiple_records[~in_exact_copy_group]

same_opp_nums_not_dup.head()

Unnamed: 0,Opportunity Number,Supplies Subgroup,Supplies Group,Region,Route To Market,Elapsed Days In Sales Stage,Opportunity Result,Sales Stage Change Count,Total Days Identified Through Closing,Total Days Identified Through Qualified,Opportunity Amount USD,Client Size By Revenue,Client Size By Employee Count,Revenue From Client Past Two Years,Competitor Type,Ratio Days Identified To Total Days,Ratio Days Validated To Total Days,Ratio Days Qualified To Total Days,Deal Size Category
37087,4787647,Batteries & Accessories,Car Accessories,Southeast,Fields Sales,26,Won,6,49,46,0,1,4,0,Unknown,0.431174,0.293522,0.210526,1
25553,4787647,Shelters & RV,Performance & Non-auto,Southeast,Fields Sales,26,Won,6,49,46,0,1,4,0,Unknown,0.431174,0.293522,0.210526,1
47134,4787647,Interior Accessories,Car Accessories,Southeast,Fields Sales,26,Won,6,49,46,6000,1,4,0,Unknown,0.431174,0.293522,0.210526,1
54219,4947042,Towing & Hitches,Car Accessories,Midwest,Fields Sales,88,Loss,4,81,81,200000,3,5,3,Known,0.170972,0.03075,0.798278,5
27541,4947042,Exterior Accessories,Car Accessories,Midwest,Fields Sales,88,Loss,4,81,81,600000,3,5,3,Known,0.170972,0.03075,0.798278,7


It appears that some of the opportunities have multiple records in the training data. These records share the same opportunity number but differ in terms of...
* Supplies Subgroup
* Supplies Group
* Opportunity Amount USD

Therefore, we are going to start with a simple solution and take a random record for those with multiple records. 

In [9]:
# Reduce opportunities with multiple records down to 1:
# shuffle reproducibly, then keep the first row encountered for each opportunity number
shuffled_multiples = multiple_records.sample(frac=1, random_state=0)
one_per_opportunity = shuffled_multiples.drop_duplicates(subset='Opportunity Number')

# Recombine with the opportunities that only ever had a single record
reduced_sales_data_train = pd.concat([one_record, one_per_opportunity])

In [10]:
# Drop rows that are exact copies of another row (all columns equal).
# NOTE(review): after the reduction step each opportunity number should already
# appear only once, so this is presumably a safety net -- confirm.
reduced_sales_data_train = reduced_sales_data_train.drop_duplicates()

In [11]:
# Check that the reduced set covers exactly the same Opportunity Numbers as the original dataset
assert set(reduced_sales_data_train['Opportunity Number']) == set(sales_data_train['Opportunity Number'])

# Check that the reduction left exactly one record per Opportunity Number;
# the set comparison alone would still pass if repeated numbers remained.
assert reduced_sales_data_train['Opportunity Number'].is_unique

# pickle dataframe
# pd.to_pickle(reduced_sales_data_train, '../data/processed/reduced_sales_data_train.pickle')

## Check Out Customer Revenue (New vs. Returning Customers)

For simplicity, let's define the following terms:
* *new customer* - Client Revenue (past 2 years) is \$0 USD  
* *returning customer* - Client Revenue (past 2 years) greater than \$0 USD

### Number of 'New' versus 'Returning' Customers

In [12]:
# Separate new versus returning customers based on revenue over the past two years
past_revenue = reduced_sales_data_train['Revenue From Client Past Two Years']
new_customers = reduced_sales_data_train[past_revenue == 0]
returning_customers = reduced_sales_data_train[past_revenue > 0]
df_len = len(reduced_sales_data_train)

print('Number of New Customers: {}'.format(len(new_customers)))
print('Number of Returning Customers: {}'.format(len(returning_customers)))

# Check that the split is exhaustive: every record is in exactly one of the two groups
assert len(new_customers) + len(returning_customers) == df_len

Number of New Customers: 48374
Number of Returning Customers: 6145


## Close Rate (New vs. Returning Customers)
> *New Customer Close Rate* = % of deals won out of all new customers  
> *Returning Customer Close Rate* = % of deals won out of all returning customers

In [32]:
def _won_deals(frame):
    """Return only the rows of `frame` whose Opportunity Result is 'Won'."""
    return frame[frame['Opportunity Result'] == 'Won']


# get_percent comes from ../modules/helpers; presumably it expresses the size of
# the first frame as a percentage of the second -- verify against the helper.
close_rate = get_percent(_won_deals(reduced_sales_data_train), reduced_sales_data_train)
new_customer_close = get_percent(_won_deals(new_customers), new_customers)
return_customer_close = get_percent(_won_deals(returning_customers), returning_customers)

In [33]:
# Report the close rate for the whole training set and for each customer segment
for template, rate in (
    ('Close Rate for Entire Training Dataset: {}%', close_rate),
    ('New Customers Close Rate: {}%', new_customer_close),
    ('Returning Customers Close Rate: {}%', return_customer_close),
):
    print(template.format(rate))

Close Rate for Entire Training Dataset: 22.5%
New Customers Close Rate: 17.3%
Returning Customers Close Rate: 63.2%


The close rate (% of won deals) for returning customers (those who have made a purchase in the past 2 years) is much higher than the close rate for new customers (no purchase made in the past 2 years). 

We may look at how our model performs with and without segmenting this data. 

## Explore Null Values

In [36]:
# Count the missing (null) values in each column of the reduced training data
reduced_sales_data_train.isnull().sum()

Opportunity Number                         0
Supplies Subgroup                          0
Supplies Group                             0
Region                                     0
Route To Market                            0
Elapsed Days In Sales Stage                0
Opportunity Result                         0
Sales Stage Change Count                   0
Total Days Identified Through Closing      0
Total Days Identified Through Qualified    0
Opportunity Amount USD                     0
Client Size By Revenue                     0
Client Size By Employee Count              0
Revenue From Client Past Two Years         0
Competitor Type                            0
Ratio Days Identified To Total Days        0
Ratio Days Validated To Total Days         0
Ratio Days Qualified To Total Days         0
Deal Size Category                         0
dtype: int64

## Explore Unique Values

In [39]:
# Ten columns with the most distinct values (highest cardinality first)
reduced_sales_data_train.nunique().sort_values(ascending=False).head(10)

Opportunity Number                         54519
Ratio Days Validated To Total Days         10791
Ratio Days Qualified To Total Days          7697
Ratio Days Identified To Total Days         7504
Opportunity Amount USD                      7422
Total Days Identified Through Closing        150
Total Days Identified Through Qualified      149
Elapsed Days In Sales Stage                  134
Sales Stage Change Count                      20
Supplies Subgroup                             11
dtype: int64

Discrete Variables with Higher Cardinality to Consider:
* Supplies Subgroup

In [40]:
# Ten columns with the fewest distinct values (lowest cardinality first)
reduced_sales_data_train.nunique().sort_values(ascending=True).head(10)

Opportunity Result                     2
Competitor Type                        3
Supplies Group                         4
Route To Market                        5
Revenue From Client Past Two Years     5
Client Size By Employee Count          5
Client Size By Revenue                 5
Deal Size Category                     7
Region                                 7
Supplies Subgroup                     11
dtype: int64

## Inspect Target Variable - Won/Loss Deal

In [51]:
# Percent of opportunities that are Won versus Lost
won_loss_counts = reduced_sales_data_train.groupby('Opportunity Result')['Opportunity Number'].count()
won_loss_percent = won_loss_counts / len(reduced_sales_data_train) * 100

# Present the shares as a one-column table with a descriptive header
pd.DataFrame(won_loss_percent).rename(
    columns={'Opportunity Number': 'Percent of Training Data'})

Unnamed: 0_level_0,Percent of Training Data
Opportunity Result,Unnamed: 1_level_1
Loss,77.50509
Won,22.49491
