In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys

sys.path.append('..')
sys.path.append('../src')

import pandas as pd
import numpy as np

from functools import partial
import constants as cst
from loading import Loader
from preprocessing import Preprocessor
import seaborn as sns

In [None]:
# Instantiate the project loader with its default arguments and load the dataset.
# The resulting `data` is what the preprocessing cell right below operates on.
# NOTE(review): both `data_loader` and `data` are reassigned by the sampled CSV load in the
# "Loading data" section, so nothing after that section sees this first load.
data_loader = Loader()
data = data_loader.load_data()

In [None]:
# Run the project preprocessing step on the frame loaded above.
# The call is the last expression of the cell, so its return value is displayed but not stored.
# NOTE(review): whether `preprocess` mutates `data` in place is not visible from this
# notebook — confirm against the Preprocessor implementation.
preprocessor = Preprocessor()
preprocessor.preprocess(data)

# Loading data

In [None]:
# Load the transactions file used by every check below.
# `sample` is forwarded to the project Loader; presumably it limits the number of rows
# read — confirm against the Loader implementation.
SAMPLE_SIZE = 1_000_000
TRANSACTIONS_CSV = '../data/inputs/transactions_dataset.csv'

data_loader = Loader(sample=SAMPLE_SIZE)
data = data_loader.load_data(TRANSACTIONS_CSV)

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

# Data Quality checks

## Quantity

In [None]:
# Number of rows whose quantity is zero or negative.
data['quantity'].le(0).sum()

In [None]:
# Histogram of log(quantity).
# The logarithm is undefined for zero/negative quantities (np.log yields -inf / NaN), and a
# -inf value makes the histogram range non-finite, so only strictly positive quantities are
# plotted here. The count of excluded rows is what the previous cell reports.
positive_quantity = data.loc[data['quantity'] > 0, 'quantity']
np.log(positive_quantity).plot(kind='hist', bins=100)

## Sales Net

In [None]:
# Number of rows whose net sales amount is zero or negative.
data['sales_net'].le(0).sum()

In [None]:
# Distribution of the raw net sales amounts, 100 bins.
data['sales_net'].plot.hist(bins=100)

## Dates

In [None]:
# Latest and earliest invoice dates in the loaded data.
# Series.max()/.min() are vectorised and skip NaT; the Python builtins iterate the Series
# element by element and can return NaT when one is present, because every comparison
# against NaT is False.
invoices_date = pd.to_datetime(data['date_invoice'])
print(invoices_date.max())
print(invoices_date.min())

In [None]:
# Latest and earliest order dates in the loaded data.
# Series.max()/.min() are vectorised and skip NaT; the Python builtins iterate the Series
# element by element and can return NaT when one is present, because every comparison
# against NaT is False.
orders_date = pd.to_datetime(data['date_order'])
print(orders_date.max())
print(orders_date.min())

## Order Channel

In [None]:
# Distinct order channels, in order of first appearance.
pd.unique(data['order_channel'])

# Sales Cycle Analysis

In [None]:
def avg_date_diff(x):
    """Return the mean gap between consecutive dates in ``x``, in (fractional) days.

    Parameters
    ----------
    x : sequence of datetimes (e.g. a datetime64 Series)
        Dates in the order in which consecutive differences should be taken;
        the caller is responsible for sorting them.

    Returns
    -------
    float
        Mean of the consecutive differences expressed in days. NaN when ``x``
        holds fewer than two dates, since no gap exists to average.
    """
    seconds_per_day = 60 * 60 * 24
    gaps = np.diff(x)
    if gaps.size == 0:
        # Fewer than two dates: return NaN explicitly instead of letting the mean of an
        # empty array produce it through a RuntimeWarning.
        return np.nan
    return pd.Timedelta(gaps.mean()).total_seconds() / seconds_per_day


In [None]:
# Convert the order date column to datetimes so it can be sorted and differenced.
data['date_order'] = data['date_order'].pipe(pd.to_datetime)

In [None]:
# Sort chronologically first: avg_date_diff takes consecutive differences, so each client's
# dates must already be in order when the group reaches it.
data = data.sort_values(by='date_order')

# One row per client: number of orders, first/last order date and mean gap between orders.
client_order_dates = data.groupby(by=['client_id'])['date_order']
test = client_order_dates.agg(['size', 'min', 'max', avg_date_diff]).reset_index()

In [None]:
# Bucket clients by their mean gap between orders (in days).
# Rules are applied in order, so a later rule overrides an earlier one: a gap of exactly 0
# is first labelled 'freq' and then relabelled 'infreq' by the last rule.
# NOTE(review): rows whose avg_date_diff is NaN match no rule and keep a missing
# frequency_category — confirm that is intended for clients without a computable gap.
avg_gap = test['avg_date_diff']
category_rules = [
    ('freq', avg_gap.ge(0) & avg_gap.lt(10)),
    ('med', avg_gap.ge(10) & avg_gap.lt(35)),
    ('infreq', avg_gap.ge(35) | avg_gap.eq(0)),
]
for label, mask in category_rules:
    test.loc[mask, 'frequency_category'] = label

In [None]:
# Persist the per-client table (index included) so the aggregation need not be recomputed.
test.to_csv('intermediate_step.csv', index=True)

In [None]:
# Reload the per-client table saved above.
# CSV stores the first/last order dates as text; parse them back to datetimes so the
# date comparisons in the churn cell work instead of raising TypeError (str < datetime).
test = pd.read_csv('intermediate_step.csv', index_col=0, parse_dates=['min', 'max'])

In [None]:
# Clients not labelled 'infreq' (rows with a missing category are kept as well).
test[test['frequency_category'].ne('infreq')]

In [None]:
# Distribution of the mean order gap for clients not labelled 'infreq', coloured by category.
not_infrequent = test.loc[test['frequency_category'] != 'infreq', :]
sns.histplot(data=not_infrequent, x='avg_date_diff', hue='frequency_category')

In [None]:
from datetime import datetime, timedelta

# Quick check of the date arithmetic used for the churn cut-offs:
# the day that lies 20 days before 2019-09-22.
datetime.strptime('2019-09-22', '%Y-%m-%d') - timedelta(days=20)

In [None]:
# `timedelta` is imported here as well so this cell does not depend on an earlier cell
# having been run.
from datetime import datetime, timedelta

# Date against which inactivity is measured.
# NOTE(review): presumably the last date covered by the dataset — confirm against the
# date-range checks in the "Dates" section.
REFERENCE_DATE = datetime.strptime('2019-09-22', '%Y-%m-%d')

# Days without an order after which a client of each frequency category counts as churned.
CHURN_INACTIVITY_DAYS = {'freq': 40, 'med': 100, 'infreq': 400}

# The per-client table may have been reloaded from CSV, in which case 'max' holds strings;
# coerce to datetimes so the comparison below is well-defined either way.
last_order_date = pd.to_datetime(test['max'])

test['is_churn'] = 0
for category, inactivity_days in CHURN_INACTIVITY_DAYS.items():
    cutoff = REFERENCE_DATE - timedelta(days=inactivity_days)
    churned = (test['frequency_category'] == category) & (last_order_date < cutoff)
    test.loc[churned, 'is_churn'] = 1

In [None]:
# Number of churned clients, then their share of all clients in the table.
# The share is computed from the table itself rather than a hard-coded client count, so it
# stays correct whatever sample size was loaded.
is_churned = test['is_churn'] == 1
print(is_churned.sum())
print(is_churned.mean())