# Data Understanding & Data Preparation

### Import Data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

# File names of the CSV inputs used by this notebook.
client, sales, joint = (
    "TOOL_CLIENT.csv",
    "TOOL_SALES.csv",
    "TOOL_JOINT.csv",
)

In [None]:
# Load the client and sales extracts, then inner-join them on the shared CLIENT_ID key.
df_client, df_sales = pd.read_csv(client), pd.read_csv(sales)

df = df_client.merge(df_sales, on='CLIENT_ID')
df.head()

In [None]:
# Normalise column names: lower-case them and turn spaces into underscores.
original_columns = df.columns
column_mapping = {col: col.lower().replace(" ", "_") for col in original_columns}
renamed_columns = [column_mapping[col] for col in original_columns]
df.rename(columns=column_mapping, inplace=True)

df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the merged frame.
df.info()

In [None]:
# Parse the two date-like columns into datetimes.
df['client_create_date'] = pd.to_datetime(df['client_create_date'])
month_text = df['yyyymm'].astype(str)
df['yyyymm'] = pd.to_datetime(month_text, format='%Y%m')   # YYYYMM text -> timestamp

# Cast the identifier to text and turn the single-letter flags into booleans.
df['client_id'] = df['client_id'].astype(str)
df['cancelled'] = df['cancelled'].eq('X')   # True only where the raw value is 'X'
df['unit'] = df['unit'].eq('P')             # True only where the raw value is 'P'

In [None]:
# Print the frame summary again to inspect the column dtypes.
df.info()

In [None]:
# Build an identifier for each sale: "<client_id>_<YYYYMM>".
# All rows of one client within the same month therefore share one sales_id.
month_key = df['yyyymm'].dt.strftime('%Y%m')
df['sales_id'] = df['client_id'].astype(str) + '_' + month_key

In [None]:
# Per-client and per-sale aggregates, plus the gaps between a client's consecutive sales.
# NOTE(review): these are computed on every row of `df`; if rows are filtered out
# later (e.g. on `cancelled`), the excluded rows still contribute here — confirm intended.
n_purchases = df.groupby('client_id')['sales_id'].nunique()   # distinct sales per client
sales_net = df.groupby('sales_id')['net'].sum()               # total net per sale

# One row per sale, ordered chronologically within each client. The explicit sort
# matters: diff()/shift() operate on row order, so unsorted input would yield gaps
# between arbitrary rows (possibly negative) instead of consecutive purchases.
time_diff = (
    df[['client_id', 'sales_id', 'yyyymm']]
    .drop_duplicates()
    .sort_values(['client_id', 'yyyymm'])
)
purchase_dates = time_diff.groupby('client_id')['yyyymm']
days_since_prev = purchase_dates.diff().dt.days                              # NaN for a client's first sale
days_until_next = (purchase_dates.shift(-1) - time_diff['yyyymm']).dt.days   # NaN for a client's last sale
time_diff['time_diff_prec'] = days_since_prev
time_diff['time_diff_next'] = days_until_next

# Broadcast the new variables back onto every row of the main dataframe.
gaps_by_sale = time_diff.set_index('sales_id')   # index once, reuse for both lookups
df['n_purchases'] = df['client_id'].map(n_purchases)
df['sales_net'] = df['sales_id'].map(sales_net)
df['time_diff_prec'] = df['sales_id'].map(gaps_by_sale['time_diff_prec'])
df['time_diff_next'] = df['sales_id'].map(gaps_by_sale['time_diff_next'])

df.head()

In [None]:
# Count how many rows fall under each value of the `unit` column.
df['unit'].value_counts()

In [None]:
# Modelling subset: keep only rows whose `unit` flag is True and that are not
# cancelled. Both flag columns are constant after the filter, so they are dropped.
keep_rows = df['unit'] & ~df['cancelled']
df_model = df.loc[keep_rows].drop(columns=['unit', 'cancelled']).copy()   # independent copy of the main dataframe

df_model.head(10)

In [None]:
# Binary target: 1 when `time_diff_next` is greater than 730 days, otherwise 0.
# Rows with a missing `time_diff_next` compare as False and therefore get 0.
exceeds_two_years = df_model['time_diff_next'] > 730
df_model['target'] = exceeds_two_years.astype(int)
df_model.head(20)

In [None]:
# Print the column names, non-null counts and dtypes of the modelling frame.
df_model.info()

In [None]:
# Remove, in place, every row whose `time_diff_next` is 730 days or less.
# Rows with a missing `time_diff_next` fail the comparison and are kept.
# NOTE(review): if the target is defined as `time_diff_next > 730`, this removes
# every target == 0 row that has a known next purchase — confirm this is intended.
short_gap = df_model['time_diff_next'] <= 730
df_model.drop(index=df_model.index[short_gap.to_numpy()], inplace=True)
df_model.head(20)

In [None]:
# Write the modelling dataframe to 'DF_Model.csv'; index=False leaves the row index out of the file.
df_model.to_csv('DF_Model.csv', index=False)