In [None]:
import pandas as pd
from clickhouse_driver import Client
import numpy as np
from statistics import mean, stdev
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support, f1_score, confusion_matrix, ConfusionMatrixDisplay, SCORERS, classification_report
from xgboost import XGBClassifier
import featuretools as ft
import time
import yaml
from datetime import datetime, timedelta
import statistics

## Basic variables

In [None]:
CH_CONFIG_PATH = 'config.txt' # local path to ClickHouse DB credentials

# Read the YAML credentials inside a context manager so the file handle is
# closed as soon as parsing finishes instead of being left open for the
# lifetime of the kernel. The 'database' section is passed to the ClickHouse
# client as keyword arguments; the parsed credentials are not kept in a
# notebook-level variable.
with open(CH_CONFIG_PATH, 'r') as config_file:
    client = Client(
        **yaml.load(config_file, Loader=yaml.SafeLoader)['database']
    )

In [None]:
# Upper bound (inclusive) applied to toDate(updated) in the main-table query and
# lower bound of the date range in the orders query.
threshold_date = "2008-01-01"

# Subquery returning the distinct user ids; embedded into the orders query's
# `user_id in (...)` filter.
base_query = "select user_id from schema.table1 group by user_id"

## Data from production model

In [None]:
# Load the parquet file (relative to the notebook's working directory).
production_parquet_path = "../../shared/my_file.parquet"
df = pd.read_parquet(production_parquet_path)

In [None]:
# Pull every row that has a user_id and whose `updated` date is on or before
# threshold_date. threshold_date is a notebook constant (not external input),
# so it is interpolated directly into the query text.
main_table = client.query_dataframe("""select *
                                    from schema.table as c 
                                    where c.user_id is not null 
                                        and toDate(c.updated) <= '{}'
                                        """
                                     .format(threshold_date))

# Inspect the frame that was just loaded. The previous `comment_ids` name is
# never assigned in the visible notebook and raised NameError on a fresh kernel.
main_table.info()

In [None]:
# Left join: keep every row of df and attach the main_table columns by 'id'.
# NOTE(review): df is rebound to the merged result, so running this cell twice
# merges main_table in a second time.
df = pd.merge(df, main_table, on='id', how='left')

## Load data

In [None]:
# Query text for the orders pull. The three placeholders are filled, in order,
# with the lower date bound, the upper date bound and the user-id subquery.
orders_sql = """select *
                                    from schema.table as o 
                                    ANY LEFT JOIN schema2.dictionary as d ON d.status=o.status 
                                    where o.date between '{}' and '{}'
                                        and o.user_id in ({})"""

# NOTE(review): threshold_date2 is not assigned in any visible cell — confirm
# where it is defined before running on a fresh kernel.
orders = client.query_dataframe(
    orders_sql.format(threshold_date, threshold_date2, base_query)
)

Convert features to numeric; incorrect (non-numeric) values are coerced to `NaN`.

In [None]:
# errors='coerce' turns every value that cannot be parsed as a number into NaN
# instead of raising.
sum_as_number = pd.to_numeric(orders["sum"], errors="coerce")
orders["sum"] = sum_as_number

## Feature matrix

In [None]:
# One row per user_id with the number of order rows in a 'count' column.
orders_per_user = orders.groupby("user_id").size()
users = orders_per_user.reset_index(name="count")

In [None]:
# Start from an empty EntitySet; dataframes and relationships are added in the
# following cells. The bare name on the last line displays its summary.
es = ft.EntitySet('extenal_id')
es

In [None]:
# Register the order-level frame loaded above as the "orders" dataframe, keyed
# by its "id" column. The previous `table2` name is never assigned in the
# visible notebook, so this cell raised NameError on a fresh kernel.
es = es.add_dataframe(
    dataframe_name="orders",
    dataframe=orders,
    index="id",
)

In [None]:
# Register the per-user frame as the "users" dataframe, keyed by user_id.
es = es.add_dataframe(dataframe=users, dataframe_name="users", index="user_id")

In [None]:
# One user has many orders: users.user_id is the parent key and orders.user_id
# is the foreign key on the child side. The child column must not be "id",
# which is the index of the orders dataframe and identifies orders, not users.
es = es.add_relationship("users", "user_id", "orders", "user_id")

In [None]:
# Aggregation primitives requested from deep feature synthesis.
agg_primitives = ['sum', 'std', 'max', 'skew', 'min', 'mean', 'count']

# Wall-clock timestamps around the dfs call; the next cell prints toc - tic.
tic = time.time()
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="users",
    agg_primitives=agg_primitives,
    ignore_columns={'users': ['count']},  # exclude users.count from feature generation
    n_jobs=3,
)
toc = time.time()

feature_matrix

In [None]:
# Elapsed seconds between the two time.time() readings taken around ft.dfs.
elapsed_seconds = toc - tic
print(f'Processing time: {elapsed_seconds}')

In [None]:
# Move the feature matrix index back into a regular column, then preview the
# first five rows.
ft_df = feature_matrix.reset_index(drop=False)
ft_df.head(5)

In [None]:
# Order aggregates that are masked below.
order_feature_cols = (
    'COUNT(orders)',
    'MAX(orders.order_sum)',
    'MEAN(orders.order_sum)',
    'MIN(orders.order_sum)',
    'SKEW(orders.order_sum)',
    'STD(orders.order_sum)',
    'SUM(orders.order_sum)',
)

# The row mask does not depend on the column being rewritten, so it is built
# once: True where the first-order date is strictly earlier than the comment date.
# NOTE(review): df2 is not assigned in any visible cell — confirm its origin.
first_order_day = df2['first_order_created_date'].dt.date
comment_day = df2['comment_created'].dt.date
ordered_before_comment = first_order_day < comment_day

# Keep the aggregate where the mask holds; otherwise write the -1 sentinel.
for col in order_feature_cols:
    df2[col] = np.where(ordered_before_comment, df2[col], -1)

Replace missing values with -1

In [None]:
# Every order-derived feature added to the frame. The list is used both for the
# -1 fill and to build the "without new features" baseline (X_train_ / X_test_),
# so it must name all seven aggregates; with only the first two listed, the
# baseline still trained on the remaining five order features.
new_cols = [
 'COUNT(orders)',
 'MAX(orders.order_sum)',
 'MEAN(orders.order_sum)',
 'MIN(orders.order_sum)',
 'SKEW(orders.order_sum)',
 'STD(orders.order_sum)',
 'SUM(orders.order_sum)']

In [None]:
# Fill the gaps of every listed feature column with the -1 sentinel in one
# column-wise assignment.
df2[new_cols] = df2[new_cols].fillna(-1)

Share of missing values in each new feature (as a fraction of all rows)

In [None]:
# Print, per listed column, the number of NaN cells divided by the row count.
n_rows = df2.shape[0]
for col in new_cols:
    missing_share = df2[[col]].isna().sum() / n_rows
    print(missing_share)

In [None]:
# Cast only the order-count column to integer; other columns keep their dtype.
count_dtype = {'COUNT(orders)': 'int'}
df2 = df2.astype(count_dtype)

In [None]:
# Occurrences of each non-negative integer label in 'is_spam'. The two-name
# unpacking below succeeds only when bincount returns exactly two bins.
# NOTE(review): later cells use a 'target' column as the label — confirm that
# 'is_spam' is the intended column here.
class_counts = np.bincount(df2['is_spam'])
neg, pos = class_counts
total = neg + pos

positive_pct = 100 * pos / total
summary_template = 'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'
print(summary_template.format(total, pos, positive_pct))

## Split data

In [None]:
# Number of rows held out for testing in both experiments.
test_size = 50_000

In [None]:
# Display names passed to classification_report as target_names.
target_names = [
    "Non_spam",
    "Spam",
]

## Experiment 1 
Test on the most recent records by date.

In [None]:
# Newest rows first, so the head of the frame is the most recent data.
n = df2.sort_values(by='created_date', ascending=False)

# The first test_size rows become the test set; the remaining (older) rows
# become the training set.
n_train_rows = n.shape[0] - test_size
test, train = n.head(test_size), n.tail(n_train_rows)

In [None]:
# Separate features from the label for the time-based split.
# 'created_date' is the column the frame was sorted by for this split and is
# the column the random-split experiment removes as well, so it is dropped
# together with the label. The previous 'date' name matched neither.
non_feature_cols = ['created_date', 'target']

X_train = train.drop(non_feature_cols, axis=1)
y_train = train[['target']]
X_test = test.drop(non_feature_cols, axis=1)
y_test = test[['target']]

In [None]:
# Baseline feature matrices: the same rows with the new_cols columns removed.
X_train_ = X_train.drop(columns=new_cols)
X_test_ = X_test.drop(columns=new_cols)

Train without new features

In [None]:
# Flatten the single-column label frames into 1-D arrays.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

# Fit on the matrices that exclude new_cols and score on the held-out rows.
model = XGBClassifier(use_label_encoder=False, seed=1)
model.fit(X_train_, y_train_flat)
y_pred = model.predict(X_test_)

report = classification_report(
    y_test_flat, y_pred, digits=6, target_names=target_names
)
print(report)

Train with new features

In [None]:
# Same settings as the baseline model, fitted on the full feature matrices
# (new_cols included).
model = XGBClassifier(use_label_encoder=False, seed=1)
labels_train = np.ravel(y_train)
model.fit(X_train, labels_train)

y_pred = model.predict(X_test)
labels_test = np.ravel(y_test)
print(
    classification_report(
        labels_test,
        y_pred,
        digits=6,
        target_names=target_names,
    )
)

Feature importance in model with orders

In [None]:
from xgboost import plot_importance

# Plot the importances of the most recently fitted `model` on an explicit axes.
# NOTE(review): figsize is (width, height) in inches, so this canvas is 1 inch
# wide and 20 inches tall — confirm that is intended.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(1, 20))
plot_importance(booster=model, ax=ax)

## Experiment 2 
Test on a random sample.

In [None]:
# Random hold-out: draw test_size rows for testing and train on the rest.
# random_state pins the draw so the split, and therefore the reported metrics,
# are the same after every kernel restart (1 matches the seed given to the
# classifiers in this notebook).
test = df2.sample(n=test_size, random_state=1)

# Rows are removed by index label; assumes df2's index is unique — TODO confirm.
train = df2.drop(test.index)

In [None]:
# Columns that are not model inputs: the date column and the label.
excluded_cols = ['created_date', 'target']

# Feature matrices and single-column label frames for the random split.
X_train, X_test = (part.drop(columns=excluded_cols) for part in (train, test))
y_train, y_test = train[['target']], test[['target']]

In [None]:
# Baseline matrices for the random split: drop the new_cols columns from both parts.
X_train_, X_test_ = (
    X_train.drop(columns=new_cols),
    X_test.drop(columns=new_cols),
)

Train without new features

In [None]:
# Baseline on the random split: fit without the new_cols columns.
train_labels = np.ravel(y_train)
model = XGBClassifier(use_label_encoder=False, seed=1)
model.fit(X_train_, train_labels)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred = model.predict(X_test_)
test_labels = np.ravel(y_test)
baseline_report = classification_report(
    test_labels, y_pred, digits=6, target_names=target_names
)
print(baseline_report)

Train with new features

In [None]:
# Full model on the random split: same settings, new_cols included.
model = XGBClassifier(use_label_encoder=False, seed=1)
model.fit(X_train, np.ravel(y_train))

y_pred = model.predict(X_test)
full_report = classification_report(
    np.ravel(y_test),
    y_pred,
    digits=6,
    target_names=target_names,
)
print(full_report)