In [1]:
import pandas as pd
import numpy as np

import scipy.sparse
import sklearn.feature_extraction

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm
import platform

pd.set_option("display.max_rows", 10)
pd.set_option('display.max_columns', 1100)

import os

%pylab inline
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


## Train part

### Load data from logs

In [5]:
from lib.parsers.logParser import LogParser

# Parser bound to the raw log folder; the same instance is reused by later cells.
l_parser = LogParser(log_folder='Logs/')

# parse_train_sample returns three collections, unpacked as main / values / order data.
# presumably (0, 10) selects a range of log files — TODO confirm against LogParser
main_data, values_data, order_data = l_parser.parse_train_sample(0, 10)

# Distinct User-Agent values as ordered by value_counts, followed by the '0'
# placeholder that stands in for a missing (NaN) User-Agent.
ua_frequencies = pd.DataFrame(main_data)['User_Agent'].value_counts()
list_ua = ua_frequencies.index.tolist() + ['0']

100%|██████████| 10/10 [00:07<00:00,  1.38it/s]


### Prepare train data

In [6]:
from sklearn import preprocessing

# Keys handed to prepare_data as the "important" value keys.
important_keys_set = {'Accept', 'Accept-Charset', 'Accept-Encoding'}

# Two independent sparse DictVectorizers with float output:
# one is passed for the orders, the other for the values.
orders_vectorizer, values_vectorizer = (
    sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)
    for _ in range(2)
)

# fit_dict=True here; the test-preparation cell passes fit_dict=False with the same vectorizers.
full_sparce_dummy = l_parser.prepare_data(
    orders_vectorizer,
    values_vectorizer,
    important_keys_set,
    fit_dict=True,
)

# Binarize the User-Agent labels into a sparse indicator matrix; missing
# User-Agents are replaced by '0', which was appended to list_ua for that purpose.
lb = preprocessing.LabelBinarizer(sparse_output=True)
lb.fit(list_ua)
train_user_agents = pd.DataFrame(main_data)['User_Agent'].fillna('0')
y = lb.transform(train_user_agents)

100%|██████████| 289269/289269 [00:03<00:00, 85643.96it/s]
 21%|██        | 60870/289269 [00:00<00:00, 608694.78it/s]

Sparse dummy orders shape: 
(289269, 2277)


100%|██████████| 289269/289269 [00:00<00:00, 596599.99it/s]


Sparse dummy values shape: 
(289269, 361)


### Train model

In [6]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

smart_clf = OneVsRestClassifier(LogisticRegression(C=100))
smart_clf.fit(full_sparce_dummy, y)

CPU times: user 14h 11min 36s, sys: 1min 5s, total: 14h 12min 41s
Wall time: 14h 16min 56s


### Save classifier

In [9]:
import os

# sklearn.externals.joblib is no longer shipped with current scikit-learn;
# prefer the standalone joblib package and fall back for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from lib.helpers.fileSplitter import split_file

filename = 'cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl'
parts_prefix = 'parted-cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl'

# Make sure both output directories exist before anything is written.
for target in (filename, parts_prefix):
    os.makedirs(os.path.dirname(target), exist_ok=True)

# Persist the fitted classifier with the highest compression level.
_ = joblib.dump(smart_clf, filename, compress=9)

print("Model saved with size(Bytes): {}".format(os.stat(filename).st_size))

# Split the model file into parts under parts_prefix; split_file returns the part count.
files_count = split_file(filename, parts_prefix)

print('Splitted in {} files'.format(files_count))

277520394

## Test part

### Prepare data (50/50 mix of bots and humans)

In [7]:
# Build the test set: the human rows parsed earlier plus bot rows from parse_bot_sample.
main_bot_data, values_bot_data, order_bot_data = l_parser.parse_bot_sample(10, 20, 20, 30)

# Tag every row with its origin. The human half reuses main_data from the training cell.
main_human_frame = pd.DataFrame(main_data)
main_human_frame['is_human'] = True

main_bot_frame = pd.DataFrame(main_bot_data)
main_bot_frame['is_human'] = False

# ignore_index=True gives the combined frame a unique 0..N-1 index. Without it the
# index restarts at 0 for the bot rows, and the later index-aligned
# pd.concat(..., axis=1, join='inner') cannot align main_all.is_human.
main_all = pd.concat([main_human_frame, main_bot_frame], ignore_index=True)

# Same order as main_all: human rows first, bot rows second.
values_all = values_data + values_bot_data
order_all = order_data + order_bot_data

list_all_ua = main_all.User_Agent.value_counts().index.tolist()

# For NaN Useragent
list_all_ua.append('0')

l_parser.reassign_orders_values(order_all, values_all)

# fit_dict=False: reuse the vectorizers as they were fitted on the training data.
test_sparce_dummy = l_parser.prepare_data(orders_vectorizer, values_vectorizer, important_keys_set, fit_dict=False)

# NOTE(review): refitting lb on the test User-Agents changes its classes if the
# test set contains agents unseen in training — confirm this matches what
# ThresholdPredictions.bot_predict expects.
lb.fit(list_all_ua)
y_test = lb.transform(main_all.User_Agent.fillna('0'))

  0%|          | 0/10 [00:00<?, ?it/s]

Start parsing logs for distribution


100%|██████████| 10/10 [00:07<00:00,  1.32it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Start parsing logs for values


100%|██████████| 10/10 [00:07<00:00,  1.28it/s]
  0%|          | 22/877616 [00:00<1:07:46, 215.82it/s]

Bots Generation


100%|██████████| 877616/877616 [47:34<00:00, 307.50it/s]
100%|██████████| 1755232/1755232 [00:19<00:00, 91315.97it/s]
  4%|▎         | 63468/1755232 [00:00<00:02, 634676.07it/s]

Sparse dummy orders shape: 
(1755232, 2277)


100%|██████████| 1755232/1755232 [00:02<00:00, 687462.35it/s]


Sparse dummy values shape: 
(1755232, 361)


### Test predictions

### Load classifier from file if needed

Use only `dummyordr_and_3values_fulluacls.joblib.pkl`

In [20]:
import os

# sklearn.externals.joblib is no longer shipped with current scikit-learn;
# prefer the standalone joblib package and fall back for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from lib.helpers.fileSplitter import cat_files

# Single definition of the reassembled model path (used for both writing and loading).
filename = 'cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl'

# The six part files '.0' ... '.5' (presumably produced by split_file — TODO confirm).
infiles = [
    'parted-cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl.{}'.format(part)
    for part in range(6)
]

# Make sure the output directory exists before the parts are joined.
os.makedirs(os.path.dirname(filename), exist_ok=True)
cat_files(infiles, filename)

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
smart_clf = joblib.load(filename)

In [None]:
from lib.thresholdPredictions import ThresholdPredictions

# Last positional argument of bot_predict.
# presumably the decision threshold used for bot detection — TODO confirm
bot_threshold = 0.024072

pred = ThresholdPredictions(user_agent_list=list_ua, clf=smart_clf)

# bot_predict returns five results, unpacked in the order they are used below.
(y_test_names,
 y_predicted,
 compare_answers,
 is_bot,
 answers_count) = pred.bot_predict(lb, test_sparce_dummy, y_test, bot_threshold)

In [None]:
# Ground truth for the 'is_bot_real' column: a row is a bot when it is NOT human.
# (Passing is_human directly would give the column the opposite polarity of
# 'is_bot_predicted'.) The index is reset so the labels align by position with the
# prediction outputs.
# assumes the bot_predict outputs carry a default 0..N-1 index — TODO confirm
is_bot_real = (~main_all.is_human.astype(bool)).reset_index(drop=True)

# Side-by-side table of true / predicted User-Agents and bot flags, one row per request.
compare_frame = pd.concat(
    [
        y_test_names,
        y_predicted,
        pd.DataFrame(compare_answers),
        is_bot_real,
        pd.DataFrame(is_bot),
        pd.DataFrame(answers_count)
    ], keys=['test', 'predicted', 'correctness', 'is_bot_real', 'is_bot_predicted', 'count'], axis=1, join='inner')

compare_frame

In [None]:
from sklearn.metrics import auc, confusion_matrix, roc_curve

# Confusion matrix of the real bot flag (y_true) against the predicted bot flag (y_pred).
headers_cm = confusion_matrix(
    y_true=compare_frame['is_bot_real'],
    y_pred=compare_frame['is_bot_predicted'],
)

In [None]:
# Per-class counts derived from the confusion matrix (one entry per class).
# The diagonal holds the correctly classified samples of each class.
headers_TP = np.diag(headers_cm)
# Column sum minus the diagonal: samples assigned to the class that belong elsewhere.
headers_FP = headers_cm.sum(axis=0) - headers_TP
# Row sum minus the diagonal: samples of the class that were assigned elsewhere.
headers_FN = headers_cm.sum(axis=1) - headers_TP
# confusion_matrix returns a plain ndarray, so the grand total is headers_cm.sum()
# (an ndarray has no `.values` attribute).
headers_TN = headers_cm.sum() - (headers_FP + headers_FN + headers_TP)

In [None]:
# Report the per-class confusion counts and the metrics derived from them.
print('TP: {}'.format(headers_TP))
print('TN: {}'.format(headers_TN))
print("FP: {}".format(headers_FP))
print("FN: {}".format(headers_FN))
print("Accuracy (ACC): {}".format((headers_TP + headers_TN) / (headers_TP + headers_TN + headers_FP + headers_FN)))
print("Sensitivity, hit rate, recall, or true positive rate (TPR): {}".format(headers_TP / (headers_TP + headers_FN)))
print("Precision or positive predictive value (PPV): {}".format(headers_TP / (headers_TP + headers_FP)))

# Error shares relative to the total number of test rows.
# Type I error (a normal user taken for a bot) is a false positive, so it is
# computed from headers_FP; true negatives are correct decisions, not errors.
print('Ошибка первого рода (когда мы принимаем нормального пользователя за бота): {}'.format(headers_FP / y_test.shape[0]))
# Type II error (a bot taken for a normal user) is a false negative.
print('Ошибка второго рода (когда мы принимаем бота за нормального пользователя): {}'.format(headers_FN / y_test.shape[0]))

## Dimensionality reduction

In [None]:
from itertools import combinations

import scipy.sparse
from sklearn.decomposition import TruncatedSVD

# Orders first, then values — the same argument order as the call in the
# test-preparation cell.
l_parser.reassign_orders_values(order_data, values_data)

# One dict per request: for every pair of keys in the row, record which of the
# two has the smaller order value, as an '<a> < <b>' indicator.
pairs_dict_list = []
for order_row in tqdm(order_data, mininterval=2):
    pairs_dict = {}
    for first_p, second_p in combinations(order_row, 2):
        if order_row[first_p] < order_row[second_p]:
            pairs_dict['{0} < {1}'.format(first_p, second_p)] = 1
        else:
            pairs_dict['{0} < {1}'.format(second_p, first_p)] = 1
    pairs_dict_list.append(pairs_dict)

# Vectorize first: the decomposition needs a numeric matrix, not a list of dicts.
sparse_dummy = orders_vectorizer.transform(pairs_dict_list).astype(np.int8)

print('Sparse dummy orders shape: \n{0}'.format(sparse_dummy.shape))

# Reduce the order features. TruncatedSVD is used because it accepts sparse input
# directly. The target size follows the number of known User-Agents, capped so
# that it stays strictly below the number of input features.
# NOTE(review): len(list_ua) as the target dimensionality is an assumption taken
# from the original n_components=list_ua — confirm the intended size.
n_order_features = sparse_dummy.shape[1]
n_components = len(list_ua)
if n_components >= n_order_features:
    n_components = n_order_features - 1

orders_svd = TruncatedSVD(n_components=n_components, random_state=0)
reduced_orders = orders_svd.fit_transform(sparse_dummy.astype(np.float64))

# Keep only the "important" keys of every values row.
trimmed_values_data = [
    {key: values_row[key] for key in important_keys_set if key in values_row}
    for values_row in tqdm(values_data)
]

sparse_dummy_values = values_vectorizer.transform(trimmed_values_data).astype(np.int8)

print('Sparse dummy values shape: \n{0}'.format(sparse_dummy_values.shape))

# scipy.sparse.hstack (not numpy's hstack) is required to join a dense block
# with a sparse one column-wise.
full_sparce_dummy = scipy.sparse.hstack((reduced_orders, sparse_dummy_values), format='csr')
full_sparce_dummy

## Another User Agent representation

### User Agent as tuple

#### From Udger 

`UserAgent = {ua_family_code, ua_version, ua_class_code, device_class_code, os_family_code, os_code}`

### Дообучение

Идея: брать логи за последние 10 дней (на них можно нормально обучить модель) и замешивать в выборку юзерагенты из старых выборок, которые входят в топ 200 старой выборки, но не входят в топ 200 новой.

Для выборки по времени у 