In [1]:
import pandas as pd
import numpy as np

import scipy.sparse
import sklearn.feature_extraction

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm
import platform

pd.set_option("display.max_rows", 10)
pd.set_option('display.max_columns', 1100)

import os

%pylab inline
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


## Train part

### Load data from logs

In [32]:
from lib.parsers.logParser import LogParser

l_parser = LogParser(log_folder='Logs/')

main_data, values_data, order_data = l_parser.parse_train_sample(0, 1)

list_ua = pd.DataFrame(main_data).User_Agent.value_counts().index.tolist()

# For NaN Useragent
list_ua.append('0')

100%|██████████| 1/1 [00:00<00:00,  1.42it/s]


In [92]:
order_data

[{'Accept': 3,
  'Accept-Encoding': 5,
  'Accept-Language': 6,
  'Referer': 4,
  'Upgrade-Insecure-Requests': 1,
  'User-Agent': 2},
 {'Accept': 2,
  'Accept-Encoding': 3,
  'From': 4,
  'Host': 5,
  'Pragma': 1,
  'User-Agent': 6},
 {'Accept': 1,
  'Accept-Encoding': 4,
  'Accept-Language': 5,
  'Cookie': 6,
  'Upgrade-Insecure-Requests': 2,
  'User-Agent': 3},
 {'Accept': 1,
  'Accept-Encoding': 3,
  'Accept-Language': 2,
  'Connection': 6,
  'Cookie': 5,
  'Referer': 4,
  'Upgrade-Insecure-Requests': 7},
 {'Accept-Encoding': 3,
  'Connection': 6,
  'Host': 4,
  'Pragma': 5,
  'Referer': 1,
  'User-Agent': 2},
 {'Accept': 1,
  'Accept-Charset': 4,
  'Accept-Encoding': 2,
  'Accept-Language': 3},
 {'Accept': 1, 'Accept-Encoding': 2},
 {'Accept-Encoding': 4,
  'Accept-Language': 3,
  'Connection': 5,
  'Cookie': 1,
  'User-Agent': 2},
 {'Accept': 3,
  'Accept-Charset': 4,
  'Accept-Encoding': 1,
  'Accept-Language': 2},
 {'Accept': 1,
  'Accept-Encoding': 3,
  'Accept-Language': 2,
  '

### Prepare train data

In [35]:
important_keys_set = {'Accept', 'Accept-Charset', 'Accept-Encoding'}

orders_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)
values_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)

l_parser.reassign_orders_values(order_data[:20000], values_data[:20000])
full_sparce_dummy = l_parser.prepare_data(orders_vectorizer, values_vectorizer, important_keys_set, fit_dict=True)

from sklearn import preprocessing

lb = preprocessing.LabelBinarizer(sparse_output=True)
lb.fit(list_ua)
y = lb.transform(pd.DataFrame(main_data[:20000]).User_Agent.fillna('0'))

100%|██████████| 20000/20000 [00:00<00:00, 82984.45it/s]
100%|██████████| 20000/20000 [00:00<00:00, 533280.44it/s]


Sparse dummy orders shape: 
(20000, 1181)
Sparse dummy values shape: 
(20000, 128)


### Train model

In [36]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

smart_clf = OneVsRestClassifier(LogisticRegression(C=100))
smart_clf.fit(full_sparce_dummy, y)

CPU times: user 13min 49s, sys: 480 ms, total: 13min 49s
Wall time: 14min


### Save classifier

In [37]:
import os
from sklearn.externals import joblib

filename = 'cls/dummyordr_and_3values_fullua_logreg_20K_cls.joblib.pkl'
_ = joblib.dump(smart_clf, filename, compress=9)

print("Model saved with size(Bytes): {}".format(os.stat(filename).st_size))

from lib.helpers.fileSplitter import split_file

files_count = split_file(filename, 'parted-cls/dummyordr_and_3values_fullua_logreg_20K_cls.joblib.pkl')

print('Splitted in {} files'.format(files_count))

Model saved with size(Bytes): 20977477
Splitted in 0 files


## Test part

### Prepare data (50/50 bots and human mixed)

In [38]:
main_bot_data, values_bot_data, order_bot_data = l_parser.parse_bot_sample(10, 11, 20, 21)

main_human_frame = pd.DataFrame(main_data[:20000])
main_human_frame['is_human'] = True

main_bot_frame = pd.DataFrame(main_bot_data[:20000])
main_bot_frame['is_human'] = False

main_all = pd.concat([main_human_frame, main_bot_frame])

values_all = values_data[:20000] + values_bot_data[:20000]
order_all = order_data[:20000] + order_bot_data[:20000]

list_all_ua = main_all.User_Agent.value_counts().index.tolist()

# For NaN Useragent
list_all_ua.append('0')

l_parser.reassign_orders_values(order_all, values_all)

test_sparce_dummy = l_parser.prepare_data(orders_vectorizer, values_vectorizer, important_keys_set, fit_dict=False)

lb.fit(list_all_ua)
y_test = lb.transform(pd.DataFrame(main_all).User_Agent.fillna('0'))

100%|██████████| 1/1 [00:00<00:00,  1.34it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Start parsing logs for distribution


100%|██████████| 1/1 [00:00<00:00,  1.87it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Start parsing logs for values


100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
  0%|          | 131/75996 [00:00<00:57, 1308.79it/s]

Bots Generation


100%|██████████| 75996/75996 [00:48<00:00, 1575.30it/s]
100%|██████████| 151992/151992 [00:01<00:00, 89733.78it/s]
 41%|████▏     | 62982/151992 [00:00<00:00, 629811.59it/s]

Sparse dummy orders shape: 
(151992, 1181)


100%|██████████| 151992/151992 [00:00<00:00, 642704.63it/s]


Sparse dummy values shape: 
(151992, 128)


### Test predictions

### Load classifyer from file if needed

Use only `dummyordr_and_3values_fulluacls.joblib.pkl`

In [7]:
infiles = [
    'parted-cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl.0',
    'parted-cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl.1',
    'parted-cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl.2',
    'parted-cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl.3',
    'parted-cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl.4',
    'parted-cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl.5'
]

import os
from sklearn.externals import joblib
from lib.helpers.fileSplitter import cat_files

cat_files(infiles, 'cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl')

filename = 'cls/dummyordr_and_3values_fullua_logreg_cls.joblib.pkl'
smart_clf = joblib.load(filename)

In [None]:
import psutil

mem = psutil.virtual_memory()
if (test_sparce_dummy.shape[0] * list_ua * 8 > mem.free) and memory_warn:
    print("Not enought memory for predict proba calculation")
predictions_proba = smart_clf.predict_proba(test_sparce_dummy)

We can't calculate full sample. Because we hawven't enought memory.

So we try to test top 20000 from each samples

In [5]:
main_all = pd.concat([main_human_frame, main_bot_frame]).head(20000)

values_all = values_data[:20000] + values_bot_data[:20000]
order_all = order_data[:20000] + order_bot_data[:20000]

list_all_ua = main_all.User_Agent.value_counts().index.tolist()

# For NaN Useragent
list_all_ua.append('0')

l_parser.reassign_orders_values(order_all, values_all)

test_sparce_dummy = l_parser.prepare_data(orders_vectorizer, values_vectorizer, important_keys_set, fit_dict=False)

lb.fit(list_all_ua)
y_test = lb.transform(pd.DataFrame(main_all).User_Agent.fillna('0'))

test_sparce_dummy.shape

100%|██████████| 40000/40000 [00:00<00:00, 85990.17it/s]
100%|██████████| 40000/40000 [00:00<00:00, 663317.22it/s]

Sparse dummy orders shape: 
(40000, 2277)
Sparse dummy values shape: 
(40000, 361)





(40000, 2638)

### Test predictions proba

In [28]:
%%time

import psutil

mem = psutil.virtual_memory()
if (test_sparce_dummy.shape[0] * len(list_ua) * 8 > mem.free) and memory_warn:
    print("Not enought memory for predict proba calculation")
predictions_proba = smart_clf.predict_proba(test_sparce_dummy)

CPU times: user 3min 41s, sys: 9.1 s, total: 3min 50s
Wall time: 3min 50s


In [None]:
%%time

from lib.thresholdPredictions import ThresholdPredictions

pred = ThresholdPredictions(user_agent_list=list_ua, clf=smart_clf)
y_test_names, y_predicted, compare_answers, is_bot, answers_count = pred.bot_predict(lb, test_sparce_dummy, y_test, 0.024072)

In [None]:
print(compare_frame.iloc[0][0])
print()
for u_a in compare_frame.iloc[0][1]:
    print(u_a)

In [None]:
compare_frame = pd.concat(
    [
        pd.DataFrame(y_test_names),
        y_predicted, 
        pd.DataFrame(compare_answers), 
        main_all[:20000].is_human,
        pd.DataFrame(is_bot), 
        pd.DataFrame(answers_count)
    ], keys=['test', 'predicted', 'correctness', 'is_human', 'is_bot_predicted', 'count'], axis=1, join='inner')

compare_frame

In [64]:
compare_frame

Unnamed: 0_level_0,test,predicted,correctness,is_human,is_bot_predicted,count
Unnamed: 0_level_1,0,0,0,is_human,0,0
0,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,[Mozilla/5.0 (compatible; YandexBot/3.0; +http...,True,True,False,3304
1,Mozilla/5.0 (compatible; bingbot/2.0; +http://...,[Mozilla/5.0 (compatible; YandexBot/3.0; +http...,True,True,False,3304
2,Mozilla/5.0 (Linux; Android 4.4.2; Zera S Buil...,[Mozilla/5.0 (compatible; YandexBot/3.0; +http...,True,True,False,3304
3,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) G...,[Mozilla/5.0 (compatible; YandexBot/3.0; +http...,True,True,False,3304
4,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...,[Mozilla/5.0 (compatible; YandexBot/3.0; +http...,True,True,False,3304
...,...,...,...,...,...,...
19995,Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like...,[Mozilla/5.0 (compatible; YandexBot/3.0; +http...,True,True,False,3304
19996,Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like...,[Mozilla/5.0 (compatible; YandexBot/3.0; +http...,True,True,False,3304
19997,Mozilla/5.0 (compatible; MJ12bot/v1.4.7; http:...,[Mozilla/5.0 (compatible; YandexBot/3.0; +http...,True,True,False,3304
19998,Mozilla/5.0 (iPad; CPU OS 8_1 like Mac OS X) A...,[Mozilla/5.0 (compatible; YandexBot/3.0; +http...,True,True,False,3304


In [65]:
from sklearn.metrics import confusion_matrix, roc_curve, auc

headers_cm = confusion_matrix(~compare_frame.is_human, compare_frame.is_bot_predicted)

In [66]:
headers_FP = headers_cm.sum(axis=0) - np.diag(headers_cm)  
headers_FN = headers_cm.sum(axis=1) - np.diag(headers_cm)
headers_TP = np.diag(headers_cm)
headers_TN = headers_cm.sum() - (headers_FP + headers_FN + headers_TP)

In [67]:
print('TP: {}'.format(headers_TP))
print('TN: {}'.format(headers_TN))
print("FP: {}".format(headers_FP))
print("FN: {}".format(headers_FN))
print("Accuracy (ACC): {}".format((headers_TP + headers_TN) / (headers_TP + headers_TN + headers_FP + headers_FN)))
print("Sensitivity, hit rate, recall, or true positive rate (TPR): {}".format(headers_TP / (headers_TP + headers_FN)))
print("Precision or positive predictive value (PPV): {}".format(headers_TP / (headers_TP + headers_FP)))

print('Ошибка первого рода (когда мы принимаем нормального пользователя за бота): {}'.format(headers_TN / y_test.shape[0]))
print('Ошибка второго рода (когда мы принимаем бота за нормального пользователя): {}'.format(headers_FN / y_test.shape[0]))

TP: [15574     0]
TN: [    0 15574]
FP: [   0 4426]
FN: [4426    0]
Accuracy (ACC): [ 0.7787  0.7787]
Sensitivity, hit rate, recall, or true positive rate (TPR): [ 0.7787     nan]
Precision or positive predictive value (PPV): [ 1.  0.]
Ошибка первого рода (когда мы принимаем нормального пользователя за бота): [ 0.          0.10246592]
Ошибка второго рода (когда мы принимаем бота за нормального пользователя): [ 0.02911995  0.        ]


### As we see. This is wrong wrong wrong way. 

Even with 0 threshold we have only 50% True positive classification. It means that fulltext UserAgent 

### Soo we need аnother User Agent representation

In [88]:
pca.fit(orders_dummy)
#sparse_dummy = scipy.sparse.csr_matrix(pca.transform(sparse_dummy))
sparse_dummy = pca.transform(orders_dummy)

print('Sparse dummy orders shape: \n{0}'.format(sparse_dummy.shape))

trimmed_values_data = []

for row_index in tqdm(range(len(values_data))):
    tmp_row = {}
    for key in important_keys_set:
        if key in values_data[row_index]:
            tmp_row[key] = values_data[row_index][key]
    trimmed_values_data.append(tmp_row)

values_vectorizer.fit(trimmed_values_data)
sparse_dummy_values = values_vectorizer.transform(trimmed_values_data).astype(np.int8)

print('Sparse dummy values shape: \n{0}'.format(sparse_dummy_values.shape))

full_sparce_dummy = hstack((sparse_dummy, sparse_dummy_values))
full_sparce_dummy

100%|██████████| 75996/75996 [00:00<00:00, 459741.57it/s]

Sparse dummy orders shape: 
(75996, 350)
Sparse dummy values shape: 
(75996, 207)





ValueError: all the input arrays must have same number of dimensions

In [89]:
sparse_dummy

array([[-0.00488503,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00488503,  0.        , -0.01361608, ...,  0.        ,
         0.        ,  0.        ],
       [-0.00488503,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.01332112],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00488503,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [90]:
l_parser1 = LogParser(log_folder='Logs/')

main_data1, values_data1, order_data1 = l_parser1.parse_train_sample(0, 1)

from sklearn.decomposition import SparsePCA

from itertools import combinations

orders_vectorizer1 = sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)
values_vectorizer1 = sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)
pca1 = SparsePCA(n_components=350)

pairs_dict_list1 = []
for row_idx in tqdm(range(len(order_data1)), mininterval=2):
    pairs_dict = {}
    for first_p, second_p in combinations(order_data1[row_idx], 2):
        if order_data1[row_idx][first_p] < order_data1[row_idx][second_p]:
            pairs_dict['{0} < {1}'.format(first_p, second_p)] = 1
        else:
            pairs_dict['{0} < {1}'.format(second_p, first_p)] = 1
    pairs_dict_list1.append(pairs_dict)

orders_vectorizer1.fit(pairs_dict_list1)
orders_dummy1 = orders_vectorizer1.transform(pairs_dict_list1).astype(np.int8)

100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
100%|██████████| 29903/29903 [00:00<00:00, 70838.03it/s]


In [91]:
pca.fit(orders_dummy1)
#sparse_dummy = scipy.sparse.csr_matrix(pca.transform(sparse_dummy))
sparse_dummy1 = pca.transform(orders_dummy1)

print('Sparse dummy orders shape: \n{0}'.format(sparse_dummy1.shape))

trimmed_values_data1 = []

for row_index in tqdm(range(len(values_data1))):
    tmp_row = {}
    for key in important_keys_set:
        if key in values_data1[row_index]:
            tmp_row[key] = values_data1[row_index][key]
    trimmed_values_data1.append(tmp_row)

values_vectorizer1.fit(trimmed_values_data1)
sparse_dummy_values1 = values_vectorizer1.transform(trimmed_values_data1).astype(np.int8)

print('Sparse dummy values shape: \n{0}'.format(sparse_dummy_values1.shape))

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [None]:

full_sparce_dummy = hstack((sparse_dummy, sparse_dummy_values))
full_sparce_dummy