In [1]:
import sys
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy

from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from tsfresh import extract_features
import tsfresh as tf
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import feature_selection as fs

from yellowbrick.features import pca,  pca_decomposition, manifold_embedding
from yellowbrick import features as feat

In [2]:
import sys
# Make the local project checkout importable so `research_src` resolves below.
# NOTE(review): machine-specific absolute path.
sys.path.append('/Users/ivan_zorin/Documents/AIRI/code/twino/')

# Star import; `get_dataset` (used in the next cell) is not imported anywhere
# else in this notebook, so it is expected to come from here.
from research_src.data import *

In [3]:
# Input CSV. NOTE(review): machine-specific absolute path.
PATH = '/Users/ivan_zorin/Documents/AIRI/data/sgcc/data.csv'
# Later cells read a 'FLAG' column from `df` and recover 'CONS_NO' via reset_index().
df = get_dataset(PATH)

# X, labels = get_XY_data(df)

In [4]:
# Target: the FLAG column, kept both as a Series (with its index) and as an array.
labels = df['FLAG']
y = labels.to_numpy()

# Wide -> long: one row per (CONS_NO, date) with the value in 'cons';
# remaining NaN values are replaced by 0.
data = (
    df.drop(columns='FLAG')
    .reset_index()
    .melt(id_vars=['CONS_NO'], var_name='date', value_name='cons')
    .fillna(0)
)

In [10]:
features_path = '/Users/ivan_zorin/Documents/AIRI/data/sgcc/features.csv'

if os.path.exists(features_path):
    # Reuse the feature table saved by an earlier run.
    features = pd.read_csv(features_path)
else:
    # tsfresh features computed from the 2016 readings only, one row per CONS_NO.
    features = extract_features(
        data[data.date.dt.year == 2016],
        column_id='CONS_NO',
        column_sort='date',
        column_value='cons',
    )
    impute(features)  # modifies `features` in place
    features = fs.selection.select_features(features, labels, ml_task='classification')
    # Expose the consumer id as a regular 'consumer' column, whatever the index
    # was called, so that both branches produce the same layout and the later
    # `features.set_index('consumer')` calls work.
    features = features.rename_axis('consumer').reset_index()

    
    


Feature Extraction: 100%|██████████| 25/25 [08:18<00:00, 19.92s/it]


In [None]:
    if 'Unnamed: 0' in features.columns():
        
    
    features = fs.selection.select_features(features.set_index('consumer'), labels, ml_task='classification')
    # features.to_csv(features_path)


# features = pd.merge(features, labels, left_on='consumer', right_on='CONS_NO')


In [None]:
# Per-feature relevance table from tsfresh for the classification target.
# Not read again in the visible part of the notebook.
relevance_table = fs.relevance.calculate_relevance_table(features.set_index('consumer'), labels, ml_task='classification')

In [None]:
# Subset of feature columns that tsfresh selects for the classification target,
# indexed by consumer id.
important_features = fs.selection.select_features(
    features.set_index('consumer'),
    labels,
    ml_task='classification',
)

In [None]:
# Split the selected features by label: FLAG == 0 vs FLAG == 1.
normal_if = important_features.loc[labels.index[labels == 0]]
bad_if = important_features.loc[labels.index[labels == 1]]


In [None]:
# Mean of every selected feature within each class.
x1 = normal_if.describe().loc['mean']
x2 = bad_if.describe().loc['mean']

# Draw both mean profiles (the figure used to be shown empty).
# symlog keeps features of very different magnitude, and negative means, visible.
plt.figure()
plt.plot(x1.to_numpy(), 'b', alpha=0.6, label='normal')
plt.plot(x2.to_numpy(), 'r', alpha=0.6, label='anomal')
plt.yscale('symlog')
plt.xlabel('feature #')
plt.ylabel('class mean')
plt.legend()
plt.show()

In [None]:
# X = features.drop('consumer', axis=1).to_numpy()
# Plain arrays for the projections below: selected features and their labels.
X = important_features.reset_index().drop(columns='consumer').to_numpy()
y = labels.to_numpy()

In [None]:
def plot2d(x, y=None, title=''):
    """Scatter-plot a two-column array on a fresh figure.

    x     : array indexed as ``x[:, 0]`` / ``x[:, 1]``; must have exactly two columns.
    y     : optional per-row values passed to seaborn as ``hue``.
    title : figure title.

    Raises AssertionError ('not 2d data') when ``x.shape[1] != 2``.
    """
    assert x.shape[1] == 2, 'not 2d data'
    _, ax = plt.subplots()
    sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, alpha=0.3, ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
# Fixed seeds so the embeddings (t-SNE in particular) are reproducible across runs.
Xpca = PCA(2, random_state=0).fit_transform(X)
Xtsne = TSNE(2, random_state=0).fit_transform(X)

plot2d(Xpca, y, 'pca all')
plot2d(Xtsne, y, 'tsne all')

In [None]:
# Reduce to 32 principal components first, then embed in 2-D with t-SNE.
# Fixed seeds keep the result reproducible across runs.
X32 = PCA(32, random_state=0).fit_transform(X)
X2 = TSNE(2, random_state=0).fit_transform(X32)

plot2d(X2, y, 'pca(32) -> tsne ')

In [None]:
# PCA with a fractional n_components (0.95); see the scikit-learn PCA docs for how
# a float in (0, 1) is interpreted. `Xred` is not read again in the visible notebook.
Xred = PCA(0.95).fit_transform(X)

In [None]:
# Fit a 2-component PCA on the label-0 rows only, then project both classes with it.
Xnorm = X[y == 0, :]
Xbad = X[y == 1, :]
red_model = PCA(2).fit(Xnorm)
Xnorm2d = red_model.transform(Xnorm)
Xbad2d = red_model.transform(Xbad)

plt.figure()
for pts, colour, mark, name in (
    (Xnorm2d, 'b', 'o', 'normal'),
    (Xbad2d, 'r', 'x', 'anomal'),
):
    plt.scatter(pts[:, 0], pts[:, 1], c=colour, marker=mark, label=name)
plt.legend()
plt.show()


In [None]:
# Interactive inspection only: show the class of the fitted reducer.
type(red_model)

In [None]:
# check consumer-outlier with considerably huge coordinate after PCA
#
# The rows with the largest value on each of the two PCA axes are removed, and
# PCA is refitted on the rest. One positional mask is applied to both the
# feature matrix and the labels, so the colours stay aligned with the points.

consumer_ids = labels.reset_index()['CONS_NO']
keep = np.ones(len(Xpca), dtype=bool)

idx = np.argmax(Xpca[:, 0])
consumer1 = consumer_ids.iloc[idx]
print(idx, consumer1)
keep[idx] = False

idx = np.argmax(Xpca[:, 1])
consumer2 = consumer_ids.iloc[idx]
print(idx, consumer2)
keep[idx] = False  # may be the same row as above; the mask handles that

Xcheck = X[keep]
Xcheck_pca = PCA(2).fit_transform(Xcheck)
plot2d(Xcheck_pca, y[keep], 'check')

In [None]:
# Subtract the per-column mean from the 2-D PCA coordinates and plot them.
Xpca_norm = Xpca - Xpca.mean(axis=0)

plot2d(Xpca_norm, y)

In [None]:
# Log-like view of the label-0 projection. np.log() on the raw coordinates
# turned every non-positive value into NaN/-inf and dropped those points;
# symlog axes keep negative and zero coordinates on the plot.
plt.figure()
plt.scatter(Xnorm2d[:, 0], Xnorm2d[:, 1], c='b', marker='o')
# plt.scatter(Xbad2d[:, 0], Xbad2d[:, 1], c='r', marker='x')
plt.xscale('symlog')
plt.yscale('symlog')
plt.show()

In [None]:
# Overlaid histograms of the two PCA coordinates of the label-0 rows.
plt.figure()
for component, colour in enumerate(('b', 'r')):
    plt.hist(Xnorm2d[:, component], color=colour, alpha=0.3, bins=20)
plt.show()

In [None]:
# KDE joint plot of the label-1 projection. jointplot builds its own figure,
# so no plt.figure() beforehand (that only left an extra empty figure).
sns.jointplot(x=Xbad2d[:, 0], y=Xbad2d[:, 1], kind='kde')
plt.show()

In [None]:
# x = X2[y == 1, :]
x = Xnorm2d

# Scatter of the label-0 projection; the hue vector holds only label 0.
_, ax = plt.subplots()
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y[y==0], alpha=0.3, ax=ax)
plt.show()

In [None]:
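# NOTE: running this cell kills the kernel. `i` is never incremented, so the
# `i == N` early exit never fires and one figure is drawn per feature column.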

# i = 0
# N = 19
# for col in features.drop('consumer', axis=1).columns:
#     plt.figure()
#     sns.histplot(data=features, x=col, hue=y, color='b', alpha=0.3)
#     plt.title(col)
#     plt.show()
#     if i == N:
#         break

## Yellowbrick 

In [None]:
# Yellowbrick PCA visualizer on the current X / y, with the heatmap turned off.
pca_vis = pca.PCA(heatmap=False)
pca_vis.fit(X, y=y)


# `xx` and `color` are only referenced by the commented-out scatter below.
xx = pca_vis.transform(X, y)
colors = ['b', 'r']
# Indexing `colors` by label assumes the labels are the integers 0 / 1.
color = [colors[label] for label in y]
# markers = ['.', 'x']
# marker = [markers[label] for label in y]
# plt.scatter(xx[:, 0], xx[:, 1], color=color, marker='x')

pca_vis.show()

In [None]:
# Balanced sample: every label-1 row plus N label-0 rows taken from the tail.
# NOTE: the slice stops one row before the end, so the very last label-0 row is excluded.
x_bad = X[y == 1, :]
N = len(x_bad)
x_norm_subset = X[y == 0, :][-(N + 1):-1, :]

x_both = np.vstack((x_bad, x_norm_subset))
y_both = np.repeat([1.0, 0.0], N)  # N ones followed by N zeros
# y_both = np.concatenate([['bad'] * N, ['norm'] * N], axis=0)

In [None]:
# Manifold method names to try. The list is not iterated anywhere in the
# visible notebook; the method is picked by hand in the next cell.
methods = ['lle', 'ltsa', 'hessian', 'modified', 'isomap', 'mds', 'spectral', 'tsne']

In [None]:
method = 'tsne'
# Yellowbrick selects the algorithm through the `manifold` keyword (as the
# Manifold(...) call in the next cell does); `method=` did not select it.
manifold_vis = manifold_embedding(x_both, y_both, target_type='discrete', manifold=method)

In [None]:
# Spectral embedding of the balanced sample through Yellowbrick's Manifold,
# then a seaborn scatter of the embedded points coloured by class.
manifold = feat.manifold.Manifold(manifold='spectral', target_type='discrete')
xp = manifold.fit_transform(x_both)

_, ax = plt.subplots()
sns.scatterplot(x=xp[:, 0], y=xp[:, 1], hue=y_both, ax=ax)
plt.show()

## Models

In [None]:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, auc, precision_score, cohen_kappa_score
from sklearn import preprocessing


In [None]:
# `PATH` is defined above as the path of a CSV file, so joining onto it directly
# points inside a file. Resolve the containing directory first; a directory
# value for `PATH` keeps working too.
data_dir = PATH if os.path.isdir(PATH) else os.path.dirname(PATH)
chinese_tsfresh_stats = pd.read_csv(os.path.join(data_dir, 'X_w_flags.csv'))

In [None]:
# Number of missing values per column.
plt.plot(chinese_tsfresh_stats.isnull().sum())

In [None]:
# Drop, in place, every column with more than 20000 missing values.
null_counts = chinese_tsfresh_stats.isnull().sum()
sparse_cols = null_counts.index[null_counts > 20000]
chinese_tsfresh_stats.drop(columns=sparse_cols, inplace=True)

In [None]:
# Fill the remaining missing values with the mean of their column.
chinese_tsfresh_stats = chinese_tsfresh_stats.fillna(chinese_tsfresh_stats.mean())

In [None]:
# Features: every column except the first and the last; target: 'flags'.
# NOTE(review): assumes the first column is an id and 'flags' is the last column — confirm.
# .copy() makes X an independent frame: a later cell assigns scaled columns
# into it, which on a slice triggers SettingWithCopyWarning.
X = chinese_tsfresh_stats.iloc[:, 1:-1].copy()
y = chinese_tsfresh_stats['flags']

In [None]:
def perform_metrics(y_true, y_pred, y_proba, show=True):
    """Compute classification metrics and optionally print and plot them.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : hard class predictions; scored with F1, precision and Cohen's kappa.
    y_proba : continuous scores used for ROC-AUC and the ROC curve
        (callers in this notebook pass ``predict_proba(...)[:, 1]``).
    show : when true, print the four scalar metrics and draw the ROC curve.

    Returns
    -------
    list
        ``[f1, roc_auc, precision, ck, fpr, tpr, threshold, roc_auc]``.
        ``roc_auc`` appears twice; the layout is kept for existing callers.
    """
    f1 = f1_score(y_true, y_pred)
    # Computed once: the former second computation via auc(fpr, tpr) only
    # overwrote this value with the area under the same curve.
    roc_auc = roc_auc_score(y_true, y_proba)
    precision = precision_score(y_true, y_pred)
    ck = cohen_kappa_score(y_true, y_pred)
    fpr, tpr, threshold = roc_curve(y_true, y_proba)
    if show:
        print('F1 score: {}'.format(f1))
        print('Precision: {}'.format(precision))
        print('ROC-AUC: {}'.format(roc_auc))
        print('Cohen’s kappa score: {}'.format(ck))

        plt.title('Receiver Operating Characteristic')
        plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
        plt.legend(loc = 'lower right')
        plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()

    return [f1, roc_auc, precision, ck, fpr, tpr, threshold, roc_auc]

In [None]:
def kfold_train(X, y, model, n_folds=5):
    """Fit a fresh copy of `model` on each stratified fold and report metrics.

    X, y    : indexed positionally through ``.iloc``.
    model   : estimator exposing fit / predict / predict_proba; it is
              deep-copied per fold, so the passed instance is never fitted.
    n_folds : number of stratified splits.

    Prints and plots train and test metrics for every fold via
    `perform_metrics`; returns nothing.
    """
    splitter = StratifiedKFold(n_splits=n_folds)
    for fold, (train_index, test_index) in enumerate(splitter.split(X, y)):
        parts = (
            ("Train performance --->", X.iloc[train_index], y.iloc[train_index]),
            ("Test performance --->", X.iloc[test_index], y.iloc[test_index]),
        )

        print("Fold: {}".format(fold))
        fold_model = deepcopy(model)
        fold_model.fit(parts[0][1], parts[0][2])

        for banner, X_part, y_part in parts:
            print(banner)
            perform_metrics(y_part,
                            fold_model.predict(X_part),
                            fold_model.predict_proba(X_part)[:, 1],
                            show=True)
        print('\n\n')

### simple Logreg/SVM

In [None]:
# Fit a StandardScaler on the columns whose maximum exceeds 1.
# NOTE(review): the fit uses every row of X, i.e. before kfold_train splits the
# data, so the held-out folds contribute to the scaling statistics.
scaler = preprocessing.StandardScaler().fit(X[X.columns[X.max() > 1]])

In [None]:
# Transform exactly the columns the scaler was fitted on. Re-deriving them from
# `X.max() > 1` here can pick a different set once X has been modified (e.g. on
# a second run of this cell), which no longer matches the fitted scaler.
scaled_cols = getattr(scaler, 'feature_names_in_', None)
if scaled_cols is None:
    # Attribute not available: fall back to the original selection rule.
    scaled_cols = X.columns[X.max() > 1]
scaled_cols = list(scaled_cols)
# NOTE: re-running this cell still rescales the already-scaled values.
X[scaled_cols] = scaler.transform(X[scaled_cols])

In [None]:
# Fraction of samples whose label is 0.
len(y.loc[y == 0]) / len(y)

In [None]:
# Class weighting is currently disabled: `w` and the class_weight argument are commented out.
#w = {0:len(y.loc[y == 0]) / len(y), 1:(1 - len(y.loc[y == 0]) / len(y))}
lr_model = LogisticRegression(random_state=0, max_iter=10000) #class_weight=w

In [None]:
# Cross-validate the logistic regression on the current X / y.
kfold_train(X, y, lr_model)

### Catboost

In [None]:
! pip3 install catboost

In [None]:
# Rebuild X / y from the frame, replacing the X that the Logreg section assigned into.
# NOTE(review): assumes the first column is an id and 'flags' is the last column — confirm.
X = chinese_tsfresh_stats.iloc[:, 1:-1]
y = chinese_tsfresh_stats['flags']

In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
# CatBoost classifier with training output silenced; used for cross-validation below.
catboost_model = CatBoostClassifier(silent=True)

In [None]:
# Cross-validate CatBoost on the current X / y.
kfold_train(X, y, catboost_model)

In [None]:
# Wrap the full feature matrix and labels in a CatBoost Pool.
train_data = Pool(data=X, label=y)

In [None]:
# Second CatBoost classifier with default arguments (no silent=True here).
model = CatBoostClassifier()

In [None]:
# Fit on the whole Pool; no rows are held out.
model.fit(train_data)

In [None]:
# F1 on the same rows the model was fitted on (a training score, not a held-out one).
f1_score(y, model.predict(X))

In [None]:
# ROC-AUC on the same rows the model was fitted on, from the second probability column.
roc_auc_score(y, model.predict_proba(X)[:, 1])

In [None]:
# ROC curve points and the area under them, again on the rows used for fitting.
fpr, tpr, threshold = roc_curve(y, model.predict_proba(X)[:, 1])
roc_auc = auc(fpr, tpr)

In [None]:
import matplotlib.pyplot as plt

# ROC curve of the fitted model with the chance diagonal for reference.
fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()
