In [1]:
import pandas as pd
import numpy as np

import scipy.sparse
import sklearn.feature_extraction

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm
import platform

pd.set_option("display.max_rows", 10)
pd.set_option('display.max_columns', 1100)

import os

%pylab inline
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


## Train part

### Load data from logs

In [2]:
from lib.parsers.logParser import LogParser

# Parse the training sample from the 'Logs/' folder.
# presumably (0, 30) selects log files 0..29 (the progress bar reports 30 items) - confirm in LogParser
l_parser = LogParser(log_folder='Logs/')
main_data, values_data, order_data = l_parser.parse_train_sample(0, 30)

# Every distinct User-Agent seen in the training data, plus '0' which is the
# placeholder label used later for rows whose User-Agent is NaN.
train_user_agents = pd.DataFrame(main_data)['User_Agent'].value_counts()
list_ua = train_user_agents.index.tolist() + ['0']

100%|██████████| 30/30 [00:24<00:00,  1.31it/s]


In [3]:
# Union of every header name found in the order data; the set is seeded with 'Accept'.
order_keys = {'Accept'}

for header in tqdm(order_data):
    order_keys.update(header.keys())

# Number of distinct header names collected.
len(order_keys)

100%|██████████| 29903/29903 [00:00<00:00, 475915.41it/s]


103

### Prepare train data

In [3]:
from sklearn import preprocessing

# Header names handed to the parser as the "important" keys for the value features
# and for the order features respectively.
important_values_keys_set = {'Accept', 'Accept-Charset', 'Accept-Encoding'}
important_orders_keys_set = {
    'Upgrade-Insecure-Requests', 'Accept', 'If-Modified-Since', 'Host',
    'Connection', 'User-Agent', 'From', 'Accept-Encoding',
}

# One sparse, float-typed DictVectorizer per feature family.
orders_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)
values_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)

# Hand the parsed train data to the parser and build the training feature matrix.
# fit_dict=True here versus fit_dict=False for the test data further down;
# presumably this fits the vectorizers - confirm in LogParser.prepare_data.
l_parser.reassign_orders_values(order_data, values_data)
full_sparce_dummy = l_parser.prepare_data(
    orders_vectorizer,
    values_vectorizer,
    important_orders_keys_set,
    important_values_keys_set,
    fit_dict=True,
)

# Binarise the User-Agent labels (sparse output); NaN User-Agents become class '0'.
lb = preprocessing.LabelBinarizer(sparse_output=True).fit(list_ua)
y = lb.transform(pd.DataFrame(main_data).User_Agent.fillna('0'))

100%|██████████| 877616/877616 [00:02<00:00, 392324.47it/s]
100%|██████████| 877616/877616 [00:04<00:00, 206135.61it/s]
  6%|▋         | 56103/877616 [00:00<00:01, 561027.86it/s]

Sparse dummy orders shape: 
(877616, 54)


100%|██████████| 877616/877616 [00:01<00:00, 587345.61it/s]


Sparse dummy values shape: 
(877616, 569)


### Train model

In [None]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

smart_clf = OneVsRestClassifier(LogisticRegression(C=100))
smart_clf.fit(full_sparce_dummy, y)

### Save classifier

In [None]:
import os
from sklearn.externals import joblib
from lib.helpers.fileSplitter import split_file

# Target path of the compressed model and the path prefix for its split parts.
filename = 'cls/dummyordr_and_values_fullua_logreg_cls(one_vs_rest).joblib.pkl'
parted_filename = 'parted-cls/dummyordr_and_values_fullua_logreg_cls(one_vs_rest).joblib.pkl'

# Dump the fitted classifier with maximum compression and report the file size.
_ = joblib.dump(smart_clf, filename, compress=9)
model_size = os.stat(filename).st_size
print("Model saved with size(Bytes): {}".format(model_size))

# Split the dump into several files and report how many were written.
files_count = split_file(filename, parted_filename)
print('Splitted in {} files'.format(files_count))

## Test part

### Prepare data (50/50 bots and humans mixed)

In [36]:
# NOTE: this cell rebinds l_parser, main_data, values_data and order_data, which
# were first bound in the training section.

# -- HUMANS --
l_parser = LogParser(log_folder='Logs/')
main_data, values_data, order_data = l_parser.parse_train_sample(35, 36)

# -- BOTS --
l_parser = LogParser(log_folder='Logs/')
main_bot_data, values_bot_data, order_bot_data = l_parser.parse_bot_sample(40, 41, 50, 51)

# Stack the first 20000 human rows on top of the first 20000 bot rows and flag
# the leading block (index labels 0..19999) as human.
human_rows = main_data[:20000]
bot_rows = main_bot_data[:20000].to_dict('records')
main_all = pd.DataFrame(human_rows + bot_rows)
main_all['is_human'] = False
main_all.loc[0:19999, 'is_human'] = True

values_all = values_data[:20000] + values_bot_data[:20000]
order_all = order_data[:20000] + order_bot_data[:20000]

# Every distinct User-Agent in the mixed sample, plus '0' for NaN User-Agents.
list_all_ua = main_all['User_Agent'].value_counts().index.tolist() + ['0']

# Build the test feature matrix with the vectorizer objects created in the train section.
# fit_dict=False here; presumably this reuses the fitted dictionaries - confirm in LogParser.prepare_data.
l_parser.reassign_orders_values(order_all, values_all)
test_sparce_dummy = l_parser.prepare_data(
    orders_vectorizer,
    values_vectorizer,
    important_orders_keys_set,
    important_values_keys_set,
    fit_dict=False,
)

# A separate label binarizer fitted on the User-Agents of the test sample.
lb_test = preprocessing.LabelBinarizer(sparse_output=True).fit(list_all_ua)
y_test = lb_test.transform(main_all.User_Agent.fillna('0'))

test_sparce_dummy.shape

100%|██████████| 1/1 [00:00<00:00,  1.33it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Start parsing logs for distribution


100%|██████████| 1/1 [00:00<00:00,  2.04it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Start parsing logs for values


100%|██████████| 1/1 [00:00<00:00,  1.83it/s]
  0%|          | 164/46093 [00:00<00:28, 1636.03it/s]

Bots Generation


100%|██████████| 46093/46093 [00:29<00:00, 1587.90it/s]
100%|██████████| 40000/40000 [00:00<00:00, 411989.86it/s]
100%|██████████| 40000/40000 [00:00<00:00, 214565.18it/s]
100%|██████████| 40000/40000 [00:00<00:00, 636699.24it/s]

Sparse dummy orders shape: 
(40000, 53)





Sparse dummy values shape: 
(40000, 128)


(40000, 181)

In [7]:
# Print the shape of the binarised test labels, then display the combined
# humans + bots frame (truncated by the display.max_rows option).
print(y_test.shape)
main_all

(40000, 5048)


Unnamed: 0,User_Agent,ip,timestamp,is_human
0,HybridBot (hybrid.ru/about. If our bot caused ...,212.8.236.60,1486332013,True
1,Mozilla/5.0 (compatible; GrapeshotCrawler/2.0;...,89.145.95.78,1486332024,True
2,Mozilla/5.0 (compatible; MSIE 10.0; Windows Ph...,95.56.172.37,1486332032,True
3,Mozilla/5.0 (compatible; Yahoo! Slurp; http://...,68.180.228.117,1486332049,True
4,Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Bu...,66.249.64.184,1486332055,True
...,...,...,...,...
39995,Mozilla/5.0 (compatible; YandexBot/3.0; +http:...,178.71.80.105,1486845204,False
39996,Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like...,178.207.255.104,1486845206,False
39997,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,66.249.64.217,1486845212,False
39998,Mozilla/5.0 (compatible; AhrefsBot/5.2; +http:...,40.77.167.29,1486845217,False


### Test predictions

### Load classifier from file if needed

Use only `dummyordr_and_values_fullua_logreg_cls.joblib.pkl`

In [10]:
import os
from sklearn.externals import joblib
from lib.helpers.fileSplitter import cat_files

# NOTE(review): this file name has no "(one_vs_rest)" suffix, unlike the name the
# save cell writes - confirm that this is the intended model.
filename = 'cls/dummyordr_and_values_fullua_logreg_cls.joblib.pkl'
parts_prefix = 'parted-cls/dummyordr_and_values_fullua_logreg_cls.joblib.pkl'

# The dump is stored as six parts with suffixes .0 to .5.
infiles = ['{}.{}'.format(parts_prefix, part_no) for part_no in range(6)]

# Concatenate the parts back into a single file, then load the classifier from it.
cat_files(infiles, filename)

# NOTE(review): joblib.load unpickles the file; only load model files from a trusted source.
smart_clf = joblib.load(filename)

We can't evaluate the full sample because we don't have enough memory.

So we test only the first 20000 rows from each sample.

### Test predictions proba (Threshold 0.024)

In [37]:
from lib.thresholdPredictions import ThresholdPredictions

# Predictor built from the train label classes, the test label classes and the classifier.
pred = ThresholdPredictions(
    user_agent_list=lb.classes_.tolist(),
    full_user_agent_list=lb_test.classes_.tolist(),
    clf=smart_clf,
)

# Run the bot prediction on the test matrix with threshold 0.024072.
y_test_names, y_predicted, compare_answers, is_bot, answers_count = pred.bot_predict(
    test_sparce_dummy,
    y_test,
    0.024072,
    sparce_y=True,
)

40000it [00:03, 10474.34it/s]


In [38]:
# Column blocks of the comparison table and the top-level key of each block.
block_names = ['test', 'predicted', 'correctness', 'is_bot_predicted', 'count']
blocks = [
    pd.DataFrame(y_test_names),
    y_predicted,
    pd.DataFrame(compare_answers),
    pd.DataFrame(is_bot),
    pd.DataFrame(answers_count),
]
compare_frame = pd.concat(blocks, keys=block_names, axis=1, join='inner')

# Ground truth: index labels 0..19999 are flagged as human, the rest as not human.
compare_frame['is_human'] = False
compare_frame.loc[0:19999, 'is_human'] = True

compare_frame

Unnamed: 0_level_0,test,predicted,correctness,is_bot_predicted,count,is_human
Unnamed: 0_level_1,0,0,0,0,0,Unnamed: 6_level_1
0,HybridBot (hybrid.ru/about. If our bot caused ...,[HybridBot (hybrid.ru/about. If our bot caused...,True,False,1,True
1,Mozilla/5.0 (compatible; GrapeshotCrawler/2.0;...,[Mozilla/5.0 (compatible; GrapeshotCrawler/2.0...,True,False,1,True
2,Mozilla/5.0 (compatible; MSIE 10.0; Windows Ph...,[Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:...,False,False,8,True
3,Mozilla/5.0 (compatible; Yahoo! Slurp; http://...,[Mozilla/5.0 (compatible; Yahoo! Slurp; http:/...,True,False,1,True
4,Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Bu...,[Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X B...,True,False,2,True
...,...,...,...,...,...,...
39995,Mozilla/5.0 (compatible; YandexBot/3.0; +http:...,[Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) ...,False,True,1,False
39996,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,[Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537....,False,True,1,False
39997,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,[Mozilla/5.0 (compatible; Googlebot/2.1; +http...,False,True,1,False
39998,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,[Mozilla/5.0 (compatible; bingbot/2.0; +http:/...,False,True,2,False


In [39]:
from sklearn.metrics import confusion_matrix, roc_curve, auc

# Confusion matrix of "is a bot" (the negation of is_human) against the predicted bot flag.
headers_cm = confusion_matrix(
    y_true=~compare_frame.is_human,
    y_pred=compare_frame.is_bot_predicted,
)

In [40]:
# Per-class counts derived from the confusion matrix. Each result is an array
# with one entry per class, where that class is treated as the positive one.
headers_TP = np.diag(headers_cm)                      # diagonal of the matrix
headers_FP = headers_cm.sum(axis=0) - headers_TP      # column total minus the diagonal
headers_FN = headers_cm.sum(axis=1) - headers_TP      # row total minus the diagonal
headers_TN = headers_cm.sum() - (headers_FP + headers_FN + headers_TP)

In [41]:
# Report the raw counts followed by accuracy, recall and precision (per class).
headers_total = headers_TP + headers_TN + headers_FP + headers_FN
report_lines = [
    ('TP: {}', headers_TP),
    ('TN: {}', headers_TN),
    ("FP: {}", headers_FP),
    ("FN: {}", headers_FN),
    ("Accuracy (ACC): {}", (headers_TP + headers_TN) / headers_total),
    ("Sensitivity, hit rate, recall, or true positive rate (TPR): {}", headers_TP / (headers_TP + headers_FN)),
    ("Precision or positive predictive value (PPV): {}", headers_TP / (headers_TP + headers_FP)),
]
for template, value in report_lines:
    print(template.format(value))

TP: [15363 13641]
TN: [13641 15363]
FP: [6359 4637]
FN: [4637 6359]
Accuracy (ACC): [ 0.7251  0.7251]
Sensitivity, hit rate, recall, or true positive rate (TPR): [ 0.76815  0.68205]
Precision or positive predictive value (PPV): [ 0.70725532  0.74630704]


In [42]:
# Positions 0..19999 are the human rows, positions 20000 and up are the bot rows.
# First kind error: a human row predicted as a bot.
first_kind_error = [
    bool(bot_prediction) and i < 20000
    for i, bot_prediction in enumerate(is_bot)
]
# Second kind error: a bot row not predicted as a bot.
second_kind_error = [
    (not bot_prediction) and i > 19999
    for i, bot_prediction in enumerate(is_bot)
]

compare_frame['first_kind_error'] = first_kind_error
compare_frame['second_kind_error'] = second_kind_error
compare_frame

Unnamed: 0_level_0,test,predicted,correctness,is_bot_predicted,count,is_human,first_kind_error,second_kind_error
Unnamed: 0_level_1,0,0,0,0,0,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,HybridBot (hybrid.ru/about. If our bot caused ...,[HybridBot (hybrid.ru/about. If our bot caused...,True,False,1,True,False,False
1,Mozilla/5.0 (compatible; GrapeshotCrawler/2.0;...,[Mozilla/5.0 (compatible; GrapeshotCrawler/2.0...,True,False,1,True,False,False
2,Mozilla/5.0 (compatible; MSIE 10.0; Windows Ph...,[Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:...,False,False,8,True,False,False
3,Mozilla/5.0 (compatible; Yahoo! Slurp; http://...,[Mozilla/5.0 (compatible; Yahoo! Slurp; http:/...,True,False,1,True,False,False
4,Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Bu...,[Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X B...,True,False,2,True,False,False
...,...,...,...,...,...,...,...,...
39995,Mozilla/5.0 (compatible; YandexBot/3.0; +http:...,[Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) ...,False,True,1,False,False,False
39996,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,[Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537....,False,True,1,False,False,False
39997,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,[Mozilla/5.0 (compatible; Googlebot/2.1; +http...,False,True,1,False,False,False
39998,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,[Mozilla/5.0 (compatible; bingbot/2.0; +http:/...,False,True,2,False,False,False


### Threshold 0.024

In [43]:
# Both rates are normalised by the total number of test rows (humans and bots
# together), not by the size of the corresponding class.
total_rows = y_test.shape[0]

# Message: "First kind error (a normal user is taken for a bot)".
print('Ошибка первого рода (когда мы принимаем нормального пользователя за бота): {}'.format(sum(first_kind_error) / total_rows))
# Message: "Second kind error (a bot is taken for a normal user)".
print('Ошибка второго рода (когда мы принимаем бота за нормального пользователя): {}'.format(sum(second_kind_error) / total_rows))

Ошибка первого рода (когда мы принимаем нормального пользователя за бота): 0.115925
Ошибка второго рода (когда мы принимаем бота за нормального пользователя): 0.158975


### Threshold 0.00075

In [61]:
# Predictor built from the train label classes, the test label classes and the classifier.
pred = ThresholdPredictions(
    user_agent_list=lb.classes_.tolist(),
    full_user_agent_list=lb_test.classes_.tolist(),
    clf=smart_clf,
)

# NOTE(review): the section heading mentions 0.00075 while the value passed
# here is 0.00078 - confirm which one is intended.
y_test_names, y_predicted, compare_answers, is_bot, answers_count = pred.bot_predict(
    test_sparce_dummy,
    y_test,
    0.00078,
    sparce_y=True,
)

40000it [00:03, 10086.17it/s]


In [62]:
# Same evaluation steps as for the previous threshold, applied to the latest
# bot_predict results: comparison table, confusion-matrix metrics, error rates.

# --- Comparison table ---
block_names = ['test', 'predicted', 'correctness', 'is_bot_predicted', 'count']
blocks = [
    pd.DataFrame(y_test_names),
    y_predicted,
    pd.DataFrame(compare_answers),
    pd.DataFrame(is_bot),
    pd.DataFrame(answers_count),
]
compare_frame = pd.concat(blocks, keys=block_names, axis=1, join='inner')

# Ground truth: index labels 0..19999 are flagged as human, the rest as not human.
compare_frame['is_human'] = False
compare_frame.loc[0:19999, 'is_human'] = True

# --- Confusion matrix and per-class counts ---
headers_cm = confusion_matrix(~compare_frame.is_human, compare_frame.is_bot_predicted)

headers_TP = np.diag(headers_cm)
headers_FP = headers_cm.sum(axis=0) - headers_TP
headers_FN = headers_cm.sum(axis=1) - headers_TP
headers_TN = headers_cm.sum() - (headers_FP + headers_FN + headers_TP)

headers_total = headers_TP + headers_TN + headers_FP + headers_FN
report_lines = [
    ('TP: {}', headers_TP),
    ('TN: {}', headers_TN),
    ("FP: {}", headers_FP),
    ("FN: {}", headers_FN),
    ("Accuracy (ACC): {}", (headers_TP + headers_TN) / headers_total),
    ("Sensitivity, hit rate, recall, or true positive rate (TPR): {}", headers_TP / (headers_TP + headers_FN)),
    ("Precision or positive predictive value (PPV): {}", headers_TP / (headers_TP + headers_FP)),
]
for template, value in report_lines:
    print(template.format(value))

# --- First / second kind errors ---
# Positions 0..19999 are the human rows, positions 20000 and up are the bot rows.
first_kind_error = [
    bool(bot_prediction) and i < 20000
    for i, bot_prediction in enumerate(is_bot)
]
second_kind_error = [
    (not bot_prediction) and i > 19999
    for i, bot_prediction in enumerate(is_bot)
]

compare_frame['first_kind_error'] = first_kind_error
compare_frame['second_kind_error'] = second_kind_error

# Both rates are normalised by the total number of test rows (humans and bots together).
total_rows = y_test.shape[0]
# Message: "First kind error (a normal user is taken for a bot)".
print('Ошибка первого рода (когда мы принимаем нормального пользователя за бота): {}'.format(sum(first_kind_error) / total_rows))
# Message: "Second kind error (a bot is taken for a normal user)".
print('Ошибка второго рода (когда мы принимаем бота за нормального пользователя): {}'.format(sum(second_kind_error) / total_rows))

TP: [17963 13332]
TN: [13332 17963]
FP: [6668 2037]
FN: [2037 6668]
Accuracy (ACC): [ 0.782375  0.782375]
Sensitivity, hit rate, recall, or true positive rate (TPR): [ 0.89815  0.6666 ]
Precision or positive predictive value (PPV): [ 0.72928424  0.86746047]
Ошибка первого рода (когда мы принимаем нормального пользователя за бота): 0.050925
Ошибка второго рода (когда мы принимаем бота за нормального пользователя): 0.1667


We also tried to determine the optimal size of the training sample.

- The threshold was fixed at 0.00075
- We tried to keep the first kind error at about 0.05

**Here are results:**

| Training sample volume | Training time | First Kind Error | Second Kind Error |               Comments                | Conclusion |
|------------------------|---------------|------------------|-------------------|---------------------------------------|------------|
| 20 000 | ~12min | 0.050925 | 0.1667 | First 20 000 rows from first log file | Pretty fine and fast train |
| 877 616 | ~15hour 20min  | 0.050925 | 0.1667 | First 30 log files  | Overfitting |
| 877 616 | ~15hour 20min  | 0.050925 | 0.1667 | First 30 log files  | Overfitting |