# Data Understanding & Data Preparation

### Import Data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

# Names of the input CSV files, given relative to the notebook's working directory.
client, sales, joint = (
    'TOOL_CLIENT.csv',   # client table, read into df_client
    'TOOL_SALES.csv',    # sales table, read into df_sales
    'TOOL_JOINT.csv',    # not read by the cells shown here; presumably used further down - TODO confirm
)

In [2]:
# Load the client table and the sales table from the CSV paths defined above.
df_client = pd.read_csv(client)
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The chunked default raised a warning on this
# read (presumably a mixed-type DtypeWarning) and can leave a column holding
# values of different Python types.
df_sales = pd.read_csv(sales, low_memory=False)

# Inner join (the pd.merge default, spelled out here): only rows whose
# CLIENT_ID appears in both tables are kept.
df = pd.merge(df_client, df_sales, how='inner', on='CLIENT_ID')
df.head()

  df_sales = pd.read_csv(sales)


Unnamed: 0,CLIENT_ID,CLIENT_CREATE DATE,REGION,TRADE SECTOR,N_EMPLOYEES,ECONOMIC_POT,ECO_POT_CLASS,RISK_CAT,YYYYMM,ITEM_ID,FLG_TOOL,SALES_CHANNEL,NET,UNIT,FAMILY_CODE,GROUP_CODE,CANCELLED
0,9306,2005-11-15 00:00:00,BZ,11000,6,8659.81,D,3d,201709,12585,0,C,937.94,P,XBXV2EB,XBXV2EB0102,
1,9306,2005-11-15 00:00:00,BZ,11000,6,8659.81,D,3d,201709,13431,0,C,641.09,P,XBXV2AB,XBXV2AB0102,
2,9306,2005-11-15 00:00:00,BZ,11000,6,8659.81,D,3d,201709,1380,0,C,0.0,P,XBXV2EB,XBXV2EB0102,
3,9306,2005-11-15 00:00:00,BZ,11000,6,8659.81,D,3d,202010,3493,1,C,1341.14,P,XBXV1AF,XBXV1AF0201,
4,939,2005-11-15 00:00:00,LE,15500,2,681.26,E,3d,201705,8241,0,B,64.51,P,XAES4AA,XAES4AA0101,X


In [3]:
# Normalise the column labels: lower-case them and turn spaces into
# underscores (e.g. 'TRADE SECTOR' -> 'trade_sector').
original_columns = df.columns
column_mapping = {
    label: label.lower().replace(" ", "_") for label in original_columns
}
renamed_columns = list(column_mapping.values())
df = df.rename(columns=column_mapping)

df.head()

Unnamed: 0,client_id,client_create_date,region,trade_sector,n_employees,economic_pot,eco_pot_class,risk_cat,yyyymm,item_id,flg_tool,sales_channel,net,unit,family_code,group_code,cancelled
0,9306,2005-11-15 00:00:00,BZ,11000,6,8659.81,D,3d,201709,12585,0,C,937.94,P,XBXV2EB,XBXV2EB0102,
1,9306,2005-11-15 00:00:00,BZ,11000,6,8659.81,D,3d,201709,13431,0,C,641.09,P,XBXV2AB,XBXV2AB0102,
2,9306,2005-11-15 00:00:00,BZ,11000,6,8659.81,D,3d,201709,1380,0,C,0.0,P,XBXV2EB,XBXV2EB0102,
3,9306,2005-11-15 00:00:00,BZ,11000,6,8659.81,D,3d,202010,3493,1,C,1341.14,P,XBXV1AF,XBXV1AF0201,
4,939,2005-11-15 00:00:00,LE,15500,2,681.26,E,3d,201705,8241,0,B,64.51,P,XAES4AA,XAES4AA0101,X


In [4]:
# Inspect column dtypes and memory usage of the merged frame before any type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2050449 entries, 0 to 2050448
Data columns (total 17 columns):
 #   Column              Dtype  
---  ------              -----  
 0   client_id           int64  
 1   client_create_date  object 
 2   region              object 
 3   trade_sector        int64  
 4   n_employees         int64  
 5   economic_pot        float64
 6   eco_pot_class       object 
 7   risk_cat            object 
 8   yyyymm              int64  
 9   item_id             int64  
 10  flg_tool            int64  
 11  sales_channel       object 
 12  net                 float64
 13  unit                object 
 14  family_code         object 
 15  group_code          object 
 16  cancelled           object 
dtypes: float64(2), int64(6), object(9)
memory usage: 265.9+ MB


In [5]:
# Convert columns to their working dtypes in a single pass:
#   - client_create_date : parsed to datetime
#   - yyyymm             : integer such as 201709 parsed with '%Y%m' (day defaults to the 1st)
#   - client_id          : string, since it is an identifier and not a quantity
#   - cancelled / unit   : boolean flags, True where the raw value is 'X' / 'P'
# NOTE: run this once per fresh load. On a second run the flag columns are
# already boolean, so the comparisons with 'X' / 'P' would come out all False.
df = df.assign(
    client_create_date=lambda frame: pd.to_datetime(frame['client_create_date']),
    yyyymm=lambda frame: pd.to_datetime(frame['yyyymm'].astype(str), format='%Y%m'),
    client_id=lambda frame: frame['client_id'].astype(str),
    cancelled=lambda frame: frame['cancelled'].eq('X'),
    unit=lambda frame: frame['unit'].eq('P'),
)

In [6]:
# Re-check the dtypes to confirm the conversions above took effect.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2050449 entries, 0 to 2050448
Data columns (total 17 columns):
 #   Column              Dtype         
---  ------              -----         
 0   client_id           object        
 1   client_create_date  datetime64[ns]
 2   region              object        
 3   trade_sector        int64         
 4   n_employees         int64         
 5   economic_pot        float64       
 6   eco_pot_class       object        
 7   risk_cat            object        
 8   yyyymm              datetime64[ns]
 9   item_id             int64         
 10  flg_tool            int64         
 11  sales_channel       object        
 12  net                 float64       
 13  unit                bool          
 14  family_code         object        
 15  group_code          object        
 16  cancelled           bool          
dtypes: bool(2), datetime64[ns](2), float64(2), int64(4), object(7)
memory usage: 238.6+ MB


In [7]:
# Build an identifier for each sale as '<client_id>_<YYYYMM>'.
# All rows of one client within the same month therefore share one sales_id.
purchase_month = df['yyyymm'].dt.strftime('%Y%m')
df['sales_id'] = df['client_id'].astype(str).str.cat(purchase_month, sep='_')

df.head()

Unnamed: 0,client_id,client_create_date,region,trade_sector,n_employees,economic_pot,eco_pot_class,risk_cat,yyyymm,item_id,flg_tool,sales_channel,net,unit,family_code,group_code,cancelled,sales_id
0,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,12585,0,C,937.94,True,XBXV2EB,XBXV2EB0102,False,9306_201709
1,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,13431,0,C,641.09,True,XBXV2AB,XBXV2AB0102,False,9306_201709
2,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,1380,0,C,0.0,True,XBXV2EB,XBXV2EB0102,False,9306_201709
3,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2020-10-01,3493,1,C,1341.14,True,XBXV1AF,XBXV1AF0201,False,9306_202010
4,939,2005-11-15,LE,15500,2,681.26,E,3d,2017-05-01,8241,0,B,64.51,True,XAES4AA,XAES4AA0101,True,939_201705


In [8]:
# Derive per-client and per-sale features and attach them to the main dataframe.

# Number of distinct sales (client-month combinations) for each client.
n_purchases = df.groupby('client_id')['sales_id'].nunique()
# Total net amount of each sale, summed over its line items.
sales_net = df.groupby('sales_id')['net'].sum()

# One row per sale, ordered chronologically within each client. The sort is
# required: groupby(...).diff() subtracts the *previous row* of the group, so
# without it the gap would follow file order and could be negative or wrong.
time_diff = (
    df[['client_id', 'sales_id', 'yyyymm']]
    .drop_duplicates()
    .sort_values(['client_id', 'yyyymm'])
)
# Days since the client's previous sale; NaN for each client's first sale.
time_diff['time_diff'] = time_diff.groupby('client_id')['yyyymm'].diff().dt.days

# Map the new variables back onto every line item of the main dataframe.
df['n_purchases'] = df['client_id'].map(n_purchases)   # number of sales of the client
df['sales_net'] = df['sales_id'].map(sales_net)   # net total of the sale
df['time_diff'] = df['sales_id'].map(time_diff.set_index('sales_id')['time_diff'])   # days since previous sale

df.head()

Unnamed: 0,client_id,client_create_date,region,trade_sector,n_employees,economic_pot,eco_pot_class,risk_cat,yyyymm,item_id,...,sales_channel,net,unit,family_code,group_code,cancelled,sales_id,n_purchases,sales_net,time_diff
0,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,12585,...,C,937.94,True,XBXV2EB,XBXV2EB0102,False,9306_201709,2,1579.03,
1,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,13431,...,C,641.09,True,XBXV2AB,XBXV2AB0102,False,9306_201709,2,1579.03,
2,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,1380,...,C,0.0,True,XBXV2EB,XBXV2EB0102,False,9306_201709,2,1579.03,
3,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2020-10-01,3493,...,C,1341.14,True,XBXV1AF,XBXV1AF0201,False,9306_202010,2,1341.14,1126.0
4,939,2005-11-15,LE,15500,2,681.26,E,3d,2017-05-01,8241,...,B,64.51,True,XAES4AA,XAES4AA0101,True,939_201705,3,64.51,


In [9]:
# Count how many rows have unit == 'P' (True) versus any other unit (False).
df['unit'].value_counts()

unit
True     2035647
False      14802
Name: count, dtype: int64

In [None]:
# Build the modelling dataframe as a filtered copy of the main dataframe.
# A row is kept only when all three conditions hold:
#   1. unit is True, i.e. the raw unit was 'P'
#   2. the row is not flagged as cancelled
#   3. the sale is NOT less than 730 days (~2 years) after the client's
#      previous sale; rows with a missing time_diff (first sale) are kept
# NOTE(review): n_purchases and sales_net were computed before this filtering,
# so they still count the rows removed here - confirm this is intended.
keep_rows = df['unit'] & ~df['cancelled'] & ~(df['time_diff'] < 730)

# The two flag columns are constant after filtering, so they are dropped.
df_model = df.loc[keep_rows].drop(columns=['unit', 'cancelled']).copy()

df_model.head()

Unnamed: 0,client_id,client_create_date,region,trade_sector,n_employees,economic_pot,eco_pot_class,risk_cat,yyyymm,item_id,flg_tool,sales_channel,net,family_code,group_code,sales_id,n_purchases,sales_net,time_diff
0,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,12585,0,C,937.94,XBXV2EB,XBXV2EB0102,9306_201709,2,1579.03,
1,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,13431,0,C,641.09,XBXV2AB,XBXV2AB0102,9306_201709,2,1579.03,
2,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,1380,0,C,0.0,XBXV2EB,XBXV2EB0102,9306_201709,2,1579.03,
3,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2020-10-01,3493,1,C,1341.14,XBXV1AF,XBXV1AF0201,9306_202010,2,1341.14,1126.0
7,8321,2005-11-15,LE,15500,2,681.26,E,T8,2018-04-01,6776,0,B,20.02,XBSI4AA,XBSI4AA0101,8321_201804,7,20.02,


In [14]:
# Binary target: 1 when the client has more than one sale, otherwise 0.
df_model['target'] = df_model['n_purchases'].gt(1).astype(int)
df_model.head()

Unnamed: 0,client_id,client_create_date,region,trade_sector,n_employees,economic_pot,eco_pot_class,risk_cat,yyyymm,item_id,flg_tool,sales_channel,net,family_code,group_code,sales_id,n_purchases,sales_net,time_diff,target
0,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,12585,0,C,937.94,XBXV2EB,XBXV2EB0102,9306_201709,2,1579.03,,1
1,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,13431,0,C,641.09,XBXV2AB,XBXV2AB0102,9306_201709,2,1579.03,,1
2,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2017-09-01,1380,0,C,0.0,XBXV2EB,XBXV2EB0102,9306_201709,2,1579.03,,1
3,9306,2005-11-15,BZ,11000,6,8659.81,D,3d,2020-10-01,3493,1,C,1341.14,XBXV1AF,XBXV1AF0201,9306_202010,2,1341.14,1126.0,1
7,8321,2005-11-15,LE,15500,2,681.26,E,T8,2018-04-01,6776,0,B,20.02,XBSI4AA,XBSI4AA0101,8321_201804,7,20.02,,1


In [16]:
# Persist the modelling dataframe to 'DF_Model.csv' in the working directory.
# index=False leaves the row index out of the file.
df_model.to_csv('DF_Model.csv', index=False)   # saving the model dataframe to a csv file