In [1]:
import pandas as pd
import numpy as np

import scipy.sparse
import sklearn.feature_extraction

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm
import platform

pd.set_option("display.max_rows", 10)
pd.set_option('display.max_columns', 1100)

import os

%pylab inline
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


### Load data to train

In [2]:
main_data = np.load('df/main_data.npy').tolist()
values_data = np.load('df/values_data.npy').tolist()
order_data = np.load('df/order_data.npy').tolist()

main_df = pd.DataFrame(main_data)

main_df

Unnamed: 0,device_class_code,ip,os_code,os_family_code,timestamp,ua_class_code,ua_family_code,ua_string,ua_version
0,desktop,95.181.252.91,windows_7,windows,1485900038,browser,chrome,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,chrome55.0.2883.87
1,smartphone,31.148.3.19,android_4_4,android,1485900079,mobile_browser,chrome_mobile,Mozilla/5.0 (Linux; Android 4.4.2; Zera S Buil...,chrome_mobile49.0.2623.105
2,desktop,188.162.183.66,windows_7,windows,1485900103,browser,firefox,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) G...,firefox51.0
3,desktop,37.144.52.103,windows_xp,windows,1485900105,browser,ie,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...,ie8.0
4,desktop,91.201.178.51,windows_8_1,windows,1485900108,browser,opera,Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKi...,opera40.0.2308.81
...,...,...,...,...,...,...,...,...,...
571140,smartphone,89.237.103.208,android_6,android,1488585286,mobile_browser,chrome_mobile,Mozilla/5.0 (Linux; Android 6.0.1; SM-G925F Bu...,chrome_mobile56.0.2924.87
571141,desktop,141.0.12.169,windows_98,windows,1488585377,browser,opera,Mozilla/4.0 (Windows 98; US) Opera 12.16 [en],opera12.16
571142,desktop,131.253.24.147,windows_7,windows,1488585395,browser,ie,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,ie9.0
571143,desktop,176.195.122.100,windows_7,windows,1488585447,browser,ie,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...,ie8.0


In [4]:
with open("df/group_labels/list_ua_version.txt", 'r') as f:
    list_ua_version = [line.rstrip('\n') for line in f]
    
with open("df/group_labels/list_ua_family_code.txt", 'r') as f:
    list_ua_family_code = [line.rstrip('\n') for line in f]

with open("df/group_labels/list_ua_class_code.txt", 'r') as f:
    list_ua_class_code = [line.rstrip('\n') for line in f]
    
with open("df/group_labels/list_os_code.txt", 'r') as f:
    list_os_code = [line.rstrip('\n') for line in f]
    
with open("df/group_labels/list_os_family_code.txt", 'r') as f:
    list_os_family_code = [line.rstrip('\n') for line in f]
    
with open("df/group_labels/list_device_class_code.txt", 'r') as f:
    list_device_class_code = [line.rstrip('\n') for line in f]

print("Device count: {}".format(len(list_device_class_code)))
print("Device platform family count: {}".format(len(list_os_family_code)))
print("Device platform count: {}".format(len(list_os_code)))
print("Device browser class count: {}".format(len(list_ua_class_code)))
print("Device browser family count: {}".format(len(list_ua_family_code)))
print("Device browser version count: {}".format(len(list_ua_version)))

Device count: 8
Device platform family count: 37
Device platform count: 109
Device browser class count: 10
Device browser family count: 193
Device browser version count: 5790


In [5]:
important_orders_keys_set = {
    'Upgrade-Insecure-Requests',
    'Accept', 
    'If-Modified-Since',
    'Host', 
    'Connection', 
    'User-Agent', 
    'From', 
    'Accept-Encoding' 
}

important_values_keys_set = {
    'Accept', 
    'Accept-Charset', 
    'Accept-Encoding'
}

In [6]:
from lib.parsers.logParser import LogParser

orders_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)
values_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)

l_parser = LogParser(log_folder='Logs/')
l_parser.reassign_orders_values(order_data, values_data)

full_sparce_dummy = l_parser.prepare_data(orders_vectorizer, values_vectorizer, important_orders_keys_set, important_values_keys_set, fit_dict=True)

100%|██████████| 571145/571145 [00:02<00:00, 217854.16it/s]
100%|██████████| 571145/571145 [00:04<00:00, 127953.06it/s]
  3%|▎         | 18149/571145 [00:00<00:03, 181093.82it/s]

Sparse dummy orders shape: 
(571145, 50)


100%|██████████| 571145/571145 [00:01<00:00, 366313.77it/s]


Sparse dummy values shape: 
(571145, 516)


### Save vectorizers

In [34]:
import os
from sklearn.externals import joblib
from lib.helpers.fileSplitter import split_file

filename = 'cls/orders_vectorizer30.joblib.pkl'
_ = joblib.dump(orders_vectorizer, filename, compress=9)

print("Orders Vectorizer saved with size(Bytes): {}".format(os.stat(filename).st_size))

files_count = split_file(filename, 'parted-cls/orders_vectorizer30.joblib.pkl')

print('Splitted in {} files'.format(files_count))

filename = 'cls/values_vectorizer30.joblib.pkl'
_ = joblib.dump(values_vectorizer, filename, compress=9)

print("Values Vectorizer saved with size(Bytes): {}".format(os.stat(filename).st_size))

files_count = split_file(filename, 'parted-cls/values_vectorizer30.joblib.pkl')

print('Splitted in {} files'.format(files_count))

Orders Vectorizer saved with size(Bytes): 761
Splitted in 0 files
Values Vectorizer saved with size(Bytes): 10842
Splitted in 0 files


### Train and on groups

#### Device_class_code

In [8]:
%%time

from sklearn.linear_model import LogisticRegression

clf_device_class_code = LogisticRegression(random_state=42, C=100)
clf_device_class_code.fit(full_sparce_dummy, main_df.device_class_code.fillna('NaN'))

CPU times: user 4min 23s, sys: 460 ms, total: 4min 24s
Wall time: 5min 13s


In [22]:
filename = 'cls/device_class_code_logreg_cls.joblib.pkl'
_ = joblib.dump(clf_device_class_code, filename, compress=9)

print("Model saved with size(Bytes): {}".format(os.stat(filename).st_size))

files_count = split_file(filename, 'parted-cls/device_class_code_logreg_cls.joblib.pkl')

print('Splitted in {} files'.format(files_count))

Model saved with size(Bytes): 25570
Splitted in 0 files


#### OS_family_code

In [9]:
%%time

clf_os_family_code = LogisticRegression(random_state=42, C=100)
clf_os_family_code.fit(full_sparce_dummy, main_df.os_family_code.fillna('NaN'))

CPU times: user 6min 41s, sys: 556 ms, total: 6min 41s
Wall time: 10min 35s


In [23]:
filename = 'cls/os_family_code_logreg_cls.joblib.pkl'
_ = joblib.dump(clf_os_family_code, filename, compress=9)

print("Model saved with size(Bytes): {}".format(os.stat(filename).st_size))

files_count = split_file(filename, 'parted-cls/os_family_code_logreg_cls.joblib.pkl')

print('Splitted in {} files'.format(files_count))

Model saved with size(Bytes): 90780
Splitted in 0 files


#### OS_code

In [10]:
%%time

clf_os_code = LogisticRegression(random_state=42, C=100)
clf_os_code.fit(full_sparce_dummy, main_df.os_code.fillna('NaN'))

CPU times: user 28min 41s, sys: 1.56 s, total: 28min 43s
Wall time: 32min 20s


In [24]:
filename = 'cls/os_code_logreg_cls.joblib.pkl'
_ = joblib.dump(clf_os_code, filename, compress=9)

print("Model saved with size(Bytes): {}".format(os.stat(filename).st_size))

files_count = split_file(filename, 'parted-cls/os_code_logreg_cls.joblib.pkl')

print('Splitted in {} files'.format(files_count))

Model saved with size(Bytes): 289589
Splitted in 0 files


#### Browser class_code

In [14]:
%%time

clf_ua_class_code = LogisticRegression(random_state=42, C=100)
clf_ua_class_code.fit(full_sparce_dummy, main_df.ua_class_code.fillna('NaN'))

CPU times: user 3min 24s, sys: 116 ms, total: 3min 24s
Wall time: 3min 32s


In [25]:
filename = 'cls/ua_class_code_logreg_cls.joblib.pkl'
_ = joblib.dump(clf_ua_class_code, filename, compress=9)

print("Model saved with size(Bytes): {}".format(os.stat(filename).st_size))

files_count = split_file(filename, 'parted-cls/ua_class_code_logreg_cls.joblib.pkl')

print('Splitted in {} files'.format(files_count))

Model saved with size(Bytes): 30923
Splitted in 0 files


#### Browser family_code

In [12]:
%%time

clf_ua_family_code = LogisticRegression(random_state=42, C=100)
clf_ua_family_code.fit(full_sparce_dummy, main_df.ua_family_code.fillna('NaN'))

CPU times: user 26min 20s, sys: 1.42 s, total: 26min 22s
Wall time: 30min 44s


In [27]:
filename = 'cls/ua_family_code_logreg_cls.joblib.pkl'
_ = joblib.dump(clf_ua_family_code, filename, compress=9)

print("Model saved with size(Bytes): {}".format(os.stat(filename).st_size))

files_count = split_file(filename, 'parted-cls/ua_family_code_logreg_cls.joblib.pkl')

print('Splitted in {} files'.format(files_count))

Model saved with size(Bytes): 432055
Splitted in 0 files


#### Browser version

In [13]:
%%time

clf_ua_version = LogisticRegression(random_state=42, C=100)
clf_ua_version.fit(full_sparce_dummy, main_df.ua_version.fillna('NaN'))

CPU times: user 8h 39min 53s, sys: 24.8 s, total: 8h 40min 18s
Wall time: 10h 25min 39s


In [28]:
filename = 'cls/ua_version_logreg_cls.joblib.pkl'
_ = joblib.dump(clf_ua_version, filename, compress=9)

print("Model saved with size(Bytes): {}".format(os.stat(filename).st_size))

files_count = split_file(filename, 'parted-cls/ua_version_logreg_cls.joblib.pkl')

print('Splitted in {} files'.format(files_count))

Model saved with size(Bytes): 12366246
Splitted in 0 files


### Load test data and free train data

In [35]:
del(main_df)
del(main_data)
del(values_data)
del(order_data)
del(full_sparce_dummy)

In [2]:
main2_data = np.load('df/main2_data.npy').tolist()
values2_data = np.load('df/values2_data.npy').tolist()
order2_data = np.load('df/order2_data.npy').tolist()

main2_df = pd.DataFrame(main2_data)

main2_df

Unnamed: 0,User_Agent,device_class_code,ip,os_code,os_family_code,timestamp,ua_class_code,ua_family_code,ua_version
0,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,desktop,62.89.6.193,windows_7,windows,1488578454,browser,chrome,chrome56.0.2924.87
1,Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) G...,desktop,62.210.80.74,windows_8_1,windows,1488578468,browser,firefox,firefox34.0
2,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,desktop,77.87.101.132,windows_7,windows,1488578472,browser,yandex-browser,yandex-browser17.1.1.1003
3,Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/2...,desktop,216.239.90.19,windows_xp,windows,1488578508,browser,firefox,firefox7.0.1
4,Mozilla/5.0 (Linux; Android 6.0.1; SAMSUNG SM-...,smartphone,37.19.72.34,android_6,android,1488578544,mobile_browser,mobile_samsung_browser,mobile_samsung_browser4.0
...,...,...,...,...,...,...,...,...,...
151834,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,desktop,163.172.52.168,windows_7,windows,1489533902,browser,chrome,chrome42.0.2311.152
151835,Mozilla/4.0 (Windows 98; US) Opera 12.16 [en],desktop,141.0.13.187,windows_98,windows,1489534824,browser,opera,opera12.16
151836,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,desktop,85.26.241.169,windows_xp,windows,1489535241,browser,yandex-browser,yandex-browser17.1.1.1005
151837,Opera/9.80 (Windows NT 5.1; Edition Yx) Presto...,desktop,188.255.59.142,windows_xp,windows,1489535339,browser,opera,opera12.16


### Load vectorizers if needed

In [6]:
infiles = ['parted-cls/orders_vectorizer30.joblib.pkl.0']

import os
from sklearn.externals import joblib
from lib.helpers.fileSplitter import cat_files

cat_files(infiles, 'cls/orders_vectorizer30.joblib.pkl')

filename = 'cls/orders_vectorizer30.joblib.pkl'
orders_vectorizer = joblib.load(filename)

infiles = ['parted-cls/values_vectorizer30.joblib.pkl.0']

cat_files(infiles, 'cls/values_vectorizer30.joblib.pkl')

filename = 'cls/values_vectorizer30.joblib.pkl'
values_vectorizer = joblib.load(filename)

In [7]:
from lib.parsers.logParser import LogParser

l_parser = LogParser(log_folder='Logs/')
l_parser.reassign_orders_values(order2_data, values2_data)

full2_sparce_dummy = l_parser.prepare_data(orders_vectorizer, values_vectorizer, important_orders_keys_set, important_values_keys_set, fit_dict=False)

100%|██████████| 151839/151839 [00:00<00:00, 380113.30it/s]
100%|██████████| 151839/151839 [00:00<00:00, 220891.43it/s]
 37%|███▋      | 56444/151839 [00:00<00:00, 564432.46it/s]

Sparse dummy orders shape: 
(151839, 50)


100%|██████████| 151839/151839 [00:00<00:00, 551718.31it/s]


Sparse dummy values shape: 
(151839, 516)


### Get prediction probabilities by groups

### Load classifyers models if needed

In [8]:
infiles = ['parted-cls/device_class_code_logreg_cls.joblib.pkl.0']

cat_files(infiles, 'cls/device_class_code_logreg_cls.joblib.pkl')
clf_device_class_code = joblib.load('cls/device_class_code_logreg_cls.joblib.pkl')

infiles = ['parted-cls/os_family_code_logreg_cls.joblib.pkl.0']

cat_files(infiles, 'cls/os_family_code_logreg_cls.joblib.pkl')
clf_os_family_code = joblib.load('cls/os_family_code_logreg_cls.joblib.pkl')

infiles = ['parted-cls/os_code_logreg_cls.joblib.pkl.0']

cat_files(infiles, 'cls/os_code_logreg_cls.joblib.pkl')
clf_os_code = joblib.load('cls/os_code_logreg_cls.joblib.pkl')

infiles = ['parted-cls/ua_class_code_logreg_cls.joblib.pkl.0']

cat_files(infiles, 'cls/ua_class_code_logreg_cls.joblib.pkl')
clf_ua_class_code = joblib.load('cls/ua_class_code_logreg_cls.joblib.pkl')

infiles = ['parted-cls/ua_family_code_logreg_cls.joblib.pkl.0']

cat_files(infiles, 'cls/ua_family_code_logreg_cls.joblib.pkl')
clf_ua_family_code = joblib.load('cls/ua_family_code_logreg_cls.joblib.pkl')

infiles = ['parted-cls/ua_version_logreg_cls.joblib.pkl.0']

cat_files(infiles, 'cls/ua_version_logreg_cls.joblib.pkl')
clf_ua_version = joblib.load('cls/ua_version_logreg_cls.joblib.pkl')

In [10]:
pp_device_class_code = pd.DataFrame(pp_dc, columns=clf_device_class_code.classes_)
pp_device_class_code

Unnamed: 0,NaN,desktop,game_console,pda,smart_tv,smartphone,tablet,unrecognized,wearable_computer
0,2.343748e-10,0.708663,3.013726e-06,3.485782e-09,1.274577e-11,0.258441,0.032893,6.928042e-09,4.643217e-09
1,6.446619e-03,0.770092,5.819713e-08,1.058579e-08,9.085490e-05,0.167420,0.054458,1.492493e-03,5.505779e-07
2,2.343748e-10,0.708663,3.013726e-06,3.485782e-09,1.274577e-11,0.258441,0.032893,6.928042e-09,4.643217e-09
3,4.017557e-03,0.986732,1.078416e-06,3.620933e-07,2.788864e-06,0.009213,0.000032,7.610044e-07,1.414456e-07
4,3.707530e-13,0.525825,1.320004e-06,3.474406e-08,6.076776e-09,0.435362,0.038812,1.037036e-09,2.361678e-08
...,...,...,...,...,...,...,...,...,...
151834,1.359406e-03,0.832201,6.276515e-07,5.783707e-08,3.973616e-06,0.064116,0.102319,1.837878e-09,5.767337e-08
151835,8.237012e-06,0.630321,2.373205e-12,7.253569e-08,3.556390e-04,0.369306,0.000008,1.339806e-08,6.656212e-07
151836,2.343748e-10,0.708663,3.013726e-06,3.485782e-09,1.274577e-11,0.258441,0.032893,6.928042e-09,4.643217e-09
151837,7.568354e-06,0.724429,2.405617e-09,5.363987e-08,4.860457e-08,0.259576,0.015985,1.505922e-06,3.070709e-07


### Generate new human features based on probability of real class

In [9]:
def single_class_proba(probas, labels, expected_class):
    """
    :param probas: prediction vector
    :param labels: labels of classification
    :param expected_class: label of real truth class answer
    :return: probability of truth class answer
    """
    for i, label in enumerate(labels):
        if label == expected_class:
            return probas[i]
    return 0 # if label doesn't excist

In [12]:
print(main2_df.device_class_code[0])
single_class_proba(pp_dc[0], clf_device_class_code.classes_, main2_df.device_class_code[0])

desktop


0.70866308887369223

In [10]:
def classes_proba(clf, X_test, y_test):
    """
    :param clf: classifyer
    :param X_test: test features
    :param y_test: test labels
    :return: vector of probability of truth class answer
    """
    probas = []
    for i, p_proba in enumerate(clf.predict_proba(X_test)):
        probas.append(single_class_proba(p_proba, clf.classes_, y_test[i]))
    return probas

In [24]:
dcc_feature = classes_proba(clf_device_class_code, full2_sparce_dummy, main2_df.device_class_code.fillna('NaN'))
print("Length: {}".format(len(dcc_feature)))
del(dcc_feature)

Length: 151839


In [11]:
def generate_features(clfs, X_test, y_tests):
    """
    :param clfs: list of classifyer
    :param X_test: test features
    :param y_test: list of test labels
    :return: matrix with new features for Groups Classification
    """
    features = []
    for i, clf in enumerate(clfs):
        feature_vec = classes_proba(clf, X_test, y_tests[i])
        print("Feature was generated. Length: {}".format(len(feature_vec)))
        features.append(feature_vec)
    return features

In [12]:
y_array = [
    main2_df.device_class_code.fillna('NaN'),
    main2_df.os_family_code.fillna('NaN'),
    main2_df.os_code.fillna('NaN'),
    main2_df.ua_class_code.fillna('NaN'),
    main2_df.ua_family_code.fillna('NaN'),
    main2_df.ua_version.fillna('NaN')
]

cls_array = [
    clf_device_class_code,
    clf_os_family_code,
    clf_os_code,
    clf_ua_class_code,
    clf_ua_family_code,
    clf_ua_version
]

human_features = list(map(list, zip(*generate_features(cls_array, full2_sparce_dummy, y_array))))
pd.DataFrame(human_features)

Feature was generated. Length: 151839
Feature was generated. Length: 151839
Feature was generated. Length: 151839
Feature was generated. Length: 151839
Feature was generated. Length: 151839
Feature was generated. Length: 151839


Unnamed: 0,0,1,2,3,4,5
0,0.708663,0.702349,0.436786,0.709285,0.424216,0.307556
1,0.770092,0.694055,0.142846,0.777389,0.357793,0.115639
2,0.708663,0.702349,0.436786,0.709285,0.209185,0.044215
3,0.986732,0.987341,0.982745,0.988570,0.990565,0.992701
4,0.435362,0.477366,0.179448,0.475272,0.131872,0.125350
...,...,...,...,...,...,...
151834,0.832201,0.720730,0.429976,0.876917,0.012524,0.723098
151835,0.630321,0.604475,0.339667,0.629281,0.619640,0.405815
151836,0.708663,0.702349,0.011556,0.709285,0.209185,0.005653
151837,0.724429,0.724433,0.171277,0.724421,0.158602,0.063178


**ua_version predict proba use 12GB memory on 10 LOG !!!**

In [13]:
np.save('df/human_features.npy', np.array(human_features))
print("Human_features saved with size(Bytes): {}".format(os.stat('df/human_features.npy').st_size))

Human_features saved with size(Bytes): 7288352


### Generate new human features based on probability of real class

In [14]:
del(main2_df)
del(main2_data)
del(values2_data)
del(order2_data)
del(full2_sparce_dummy)

main2_bot_data = np.load('df/main2_bot_data.npy').tolist()
values2_bot_data = np.load('df/values2_bot_data.npy').tolist()
order2_bot_data = np.load('df/order2_bot_data.npy').tolist()

main2_bot_df = pd.DataFrame(main2_bot_data)

main2_bot_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...,,144.76.67.101,,,1490392820,,,
1,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,desktop,136.243.95.186,windows_7,windows,1490392830,browser,firefox,firefox38.0
2,Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20...,desktop,78.129.185.198,windows_10,windows,1490392870,browser,chrome,chrome57.0.2987.110
3,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,,176.9.47.170,,,1490392892,,,
4,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,desktop,163.172.67.180,windows_xp,windows,1490392931,browser,firefox,firefox7.0.1
...,...,...,...,...,...,...,...,...,...
202326,Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7...,desktop,185.84.189.106,windows_10,windows,1490398773,browser,opera,opera43.0.2442.1144
202327,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,desktop,65.55.212.90,windows_7,windows,1490399136,browser,ie,ie9.0
202328,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,desktop,82.162.142.114,windows_7,windows,1490399387,browser,chrome,chrome56.0.2924.87
202329,Mozilla/4.0 (compatible; MSIE 5.5; Windows NT ...,,176.9.137.118,,,1490399490,,,


In [15]:
l_parser = LogParser(log_folder='Logs/')
l_parser.reassign_orders_values(order2_bot_data, values2_bot_data)

full2_sparce_dummy = l_parser.prepare_data(orders_vectorizer, values_vectorizer, important_orders_keys_set, important_values_keys_set, fit_dict=False)

100%|██████████| 202331/202331 [00:00<00:00, 464312.92it/s]
100%|██████████| 202331/202331 [00:00<00:00, 244631.02it/s]
 30%|███       | 61412/202331 [00:00<00:00, 614117.66it/s]

Sparse dummy orders shape: 
(202331, 50)


100%|██████████| 202331/202331 [00:00<00:00, 636601.16it/s]


Sparse dummy values shape: 
(202331, 516)


In [16]:
y_array = [
    main2_bot_df[1].fillna('NaN'),
    main2_bot_df[4].fillna('NaN'),
    main2_bot_df[3].fillna('NaN'),
    main2_bot_df[6].fillna('NaN'),
    main2_bot_df[7].fillna('NaN'),
    main2_bot_df[8].fillna('NaN')
]

cls_array = [
    clf_device_class_code,
    clf_os_family_code,
    clf_os_code,
    clf_ua_class_code,
    clf_ua_family_code,
    clf_ua_version
]

bot_features = list(map(list, zip(*generate_features(cls_array, full2_sparce_dummy, y_array))))
pd.DataFrame(bot_features)

Feature was generated. Length: 202331
Feature was generated. Length: 202331
Feature was generated. Length: 202331
Feature was generated. Length: 202331
Feature was generated. Length: 202331
Feature was generated. Length: 202331


Unnamed: 0,0,1,2,3,4,5
0,0.179443,0.183773,0.173878,0.177878,0.168663,0.180102
1,0.986897,0.987563,0.960782,0.987100,0.924400,0.981205
2,0.945595,0.931935,0.242893,0.974940,0.207246,0.000000
3,0.179443,0.183773,0.173878,0.177878,0.168663,0.180102
4,0.986732,0.987341,0.982745,0.988570,0.990565,0.992701
...,...,...,...,...,...,...
202326,0.708663,0.702349,0.156352,0.709285,0.059874,0.004188
202327,0.981631,0.979575,0.403425,0.986004,0.920677,0.904861
202328,0.708663,0.702349,0.436786,0.709285,0.424216,0.307556
202329,0.179443,0.183773,0.173878,0.177878,0.168663,0.180102


In [17]:
bf_df = pd.DataFrame(bot_features)

In [22]:
bf_df.loc[np.isnan(bf_df[5])]

Unnamed: 0,0,1,2,3,4,5


In [23]:
del(bf_df)
np.save('df/bot_features.npy', np.array(bot_features))
print("Bot_features saved with size(Bytes): {}".format(os.stat('df/bot_features.npy').st_size))

Bot_features saved with size(Bytes): 9711968


## Superposition

In [24]:
full_sp_features = pd.DataFrame(human_features + bot_features)
full_sp_features

Unnamed: 0,0,1,2,3,4,5
0,0.708663,0.702349,0.436786,0.709285,0.424216,0.307556
1,0.770092,0.694055,0.142846,0.777389,0.357793,0.115639
2,0.708663,0.702349,0.436786,0.709285,0.209185,0.044215
3,0.986732,0.987341,0.982745,0.988570,0.990565,0.992701
4,0.435362,0.477366,0.179448,0.475272,0.131872,0.125350
...,...,...,...,...,...,...
354165,0.708663,0.702349,0.156352,0.709285,0.059874,0.004188
354166,0.981631,0.979575,0.403425,0.986004,0.920677,0.904861
354167,0.708663,0.702349,0.436786,0.709285,0.424216,0.307556
354168,0.179443,0.183773,0.173878,0.177878,0.168663,0.180102


In [26]:
is_bot = pd.DataFrame(index=full_sp_features.index)
is_bot['is_bot'] = False
is_bot.loc[0:len(human_features) - 1,'is_bot'] = True
is_bot.loc[is_bot['is_bot'] == False]

Unnamed: 0,is_bot
151839,False
151840,False
151841,False
151842,False
151843,False
...,...
354165,False
354166,False
354167,False
354168,False


In [27]:
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score, train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, make_scorer
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(full_sp_features, is_bot.is_bot, test_size=0.33, random_state=42)

In [29]:
%%time

smart_clf = LogisticRegression(C=100)
smart_clf.fit(X_train, y_train)
answer = smart_clf.predict(X_test)

CPU times: user 688 ms, sys: 0 ns, total: 688 ms
Wall time: 696 ms


In [31]:
import os
from sklearn.externals import joblib
from lib.helpers.fileSplitter import split_file

filename = 'cls/groups_logreg_cls.joblib.pkl'
_ = joblib.dump(clf_device_class_code, filename, compress=9)

print("Model saved with size(Bytes): {}".format(os.stat(filename).st_size))

files_count = split_file(filename, 'parted-cls/groups_logreg_cls.joblib.pkl')

print('Splitted in {} files'.format(files_count))

Model saved with size(Bytes): 25570
Splitted in 0 files


In [32]:
compare_frame = pd.DataFrame(y_test)
compare_frame['Predicted'] = answer
compare_frame['Success'] = compare_frame.Predicted == compare_frame.is_bot
compare_frame

Unnamed: 0,is_bot,Predicted,Success
119722,True,False,False
185457,False,False,True
126529,True,False,False
78609,True,False,False
32838,True,False,False
...,...,...,...
294591,False,False,True
218428,False,True,False
295175,False,False,True
288624,False,False,True


In [51]:
compare_frame = compare_frame.reset_index(drop=True)
compare_frame.Success.value_counts()

True     67174
False    49703
Name: Success, dtype: int64

In [52]:
first_kind_error = []
second_kind_error = []
for i, bot_prediction in enumerate(compare_frame.is_bot):
    cur_first_kind_error = False
    cur_second_kind_error = False
    if compare_frame.Predicted[i] and not compare_frame.is_bot[i]:
        cur_first_kind_error = True
    if not compare_frame.Predicted[i] and compare_frame.is_bot[i]:
        cur_second_kind_error = True
    first_kind_error.append(cur_first_kind_error)
    second_kind_error.append(cur_second_kind_error)    

In [54]:
compare_frame['first_kind_error'] = first_kind_error
compare_frame['second_kind_error'] = second_kind_error

print('Ошибка первого рода (когда мы принимаем нормального пользователя за бота): {}'.format(sum(first_kind_error) / y_test.shape[0]))
print('Ошибка второго рода (когда мы принимаем бота за нормального пользователя): {}'.format(sum(second_kind_error) / y_test.shape[0]))

Ошибка первого рода (когда мы принимаем нормального пользователя за бота): 0.020842424086860545
Ошибка второго рода (когда мы принимаем бота за нормального пользователя): 0.4044166089136443


# Not so good. Try Decision Tree over probas

In [56]:
%%time

from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
answer = clf.predict(X_test)

CPU times: user 984 ms, sys: 0 ns, total: 984 ms
Wall time: 1.06 s


In [57]:
compare_frame = pd.DataFrame(y_test)
compare_frame['Predicted'] = answer
compare_frame['Success'] = compare_frame.Predicted == compare_frame.is_bot

compare_frame = compare_frame.reset_index(drop=True)
compare_frame.Success.value_counts()

True     73434
False    43443
Name: Success, dtype: int64

In [58]:
first_kind_error = []
second_kind_error = []
for i, bot_prediction in enumerate(compare_frame.is_bot):
    cur_first_kind_error = False
    cur_second_kind_error = False
    if compare_frame.Predicted[i] and not compare_frame.is_bot[i]:
        cur_first_kind_error = True
    if not compare_frame.Predicted[i] and compare_frame.is_bot[i]:
        cur_second_kind_error = True
    first_kind_error.append(cur_first_kind_error)
    second_kind_error.append(cur_second_kind_error)  
    
compare_frame['first_kind_error'] = first_kind_error
compare_frame['second_kind_error'] = second_kind_error

print('Ошибка первого рода (когда мы принимаем нормального пользователя за бота): {}'.format(sum(first_kind_error) / y_test.shape[0]))
print('Ошибка второго рода (когда мы принимаем бота за нормального пользователя): {}'.format(sum(second_kind_error) / y_test.shape[0]))

Ошибка первого рода (когда мы принимаем нормального пользователя за бота): 0.09034283905302155
Ошибка второго рода (когда мы принимаем бота за нормального пользователя): 0.2813556131659779


## This is better. Try SGD

In [59]:
%%time

from sklearn import linear_model

clf = linear_model.SGDClassifier()
clf.fit(X_train, y_train)
answer = clf.predict(X_test)

CPU times: user 224 ms, sys: 0 ns, total: 224 ms
Wall time: 220 ms


In [60]:
compare_frame = pd.DataFrame(y_test)
compare_frame['Predicted'] = answer
compare_frame['Success'] = compare_frame.Predicted == compare_frame.is_bot

compare_frame = compare_frame.reset_index(drop=True)
compare_frame.Success.value_counts()

True     66887
False    49990
Name: Success, dtype: int64

In [61]:
first_kind_error = []
second_kind_error = []
for i, bot_prediction in enumerate(compare_frame.is_bot):
    cur_first_kind_error = False
    cur_second_kind_error = False
    if compare_frame.Predicted[i] and not compare_frame.is_bot[i]:
        cur_first_kind_error = True
    if not compare_frame.Predicted[i] and compare_frame.is_bot[i]:
        cur_second_kind_error = True
    first_kind_error.append(cur_first_kind_error)
    second_kind_error.append(cur_second_kind_error)  
    
compare_frame['first_kind_error'] = first_kind_error
compare_frame['second_kind_error'] = second_kind_error

print('Ошибка первого рода (когда мы принимаем нормального пользователя за бота): {}'.format(sum(first_kind_error) / y_test.shape[0]))
print('Ошибка второго рода (когда мы принимаем бота за нормального пользователя): {}'.format(sum(second_kind_error) / y_test.shape[0]))

Ошибка первого рода (когда мы принимаем нормального пользователя за бота): 0.0
Ошибка второго рода (когда мы принимаем бота за нормального пользователя): 0.4277146059532671


## Trying Boosted tree

In [62]:
%%time

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10)
clf.fit(X_train, y_train)
answer = clf.predict(X_test)

CPU times: user 2.36 s, sys: 8 ms, total: 2.37 s
Wall time: 2.47 s


In [63]:
compare_frame = pd.DataFrame(y_test)
compare_frame['Predicted'] = answer
compare_frame['Success'] = compare_frame.Predicted == compare_frame.is_bot

compare_frame = compare_frame.reset_index(drop=True)
compare_frame.Success.value_counts()

True     73413
False    43464
Name: Success, dtype: int64

In [64]:
first_kind_error = []
second_kind_error = []
for i, bot_prediction in enumerate(compare_frame.is_bot):
    cur_first_kind_error = False
    cur_second_kind_error = False
    if compare_frame.Predicted[i] and not compare_frame.is_bot[i]:
        cur_first_kind_error = True
    if not compare_frame.Predicted[i] and compare_frame.is_bot[i]:
        cur_second_kind_error = True
    first_kind_error.append(cur_first_kind_error)
    second_kind_error.append(cur_second_kind_error)  
    
compare_frame['first_kind_error'] = first_kind_error
compare_frame['second_kind_error'] = second_kind_error

print('Ошибка первого рода (когда мы принимаем нормального пользователя за бота): {}'.format(sum(first_kind_error) / y_test.shape[0]))
print('Ошибка второго рода (когда мы принимаем бота за нормального пользователя): {}'.format(sum(second_kind_error) / y_test.shape[0]))

Ошибка первого рода (когда мы принимаем нормального пользователя за бота): 0.09390213643402893
Ошибка второго рода (когда мы принимаем бота за нормального пользователя): 0.27797599185468486


In [65]:
%%time

from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, y_train)
clf.fit(X_train, y_train)
answer = clf.predict(X_test)

CPU times: user 15 s, sys: 52 ms, total: 15 s
Wall time: 15.3 s


In [66]:
compare_frame = pd.DataFrame(y_test)
compare_frame['Predicted'] = answer
compare_frame['Success'] = compare_frame.Predicted == compare_frame.is_bot

compare_frame = compare_frame.reset_index(drop=True)
compare_frame.Success.value_counts()

True     70488
False    46389
Name: Success, dtype: int64

In [67]:
first_kind_error = []
second_kind_error = []
for i, bot_prediction in enumerate(compare_frame.is_bot):
    cur_first_kind_error = False
    cur_second_kind_error = False
    if compare_frame.Predicted[i] and not compare_frame.is_bot[i]:
        cur_first_kind_error = True
    if not compare_frame.Predicted[i] and compare_frame.is_bot[i]:
        cur_second_kind_error = True
    first_kind_error.append(cur_first_kind_error)
    second_kind_error.append(cur_second_kind_error)  
    
compare_frame['first_kind_error'] = first_kind_error
compare_frame['second_kind_error'] = second_kind_error

print('Ошибка первого рода (когда мы принимаем нормального пользователя за бота): {}'.format(sum(first_kind_error) / y_test.shape[0]))
print('Ошибка второго рода (когда мы принимаем бота за нормального пользователя): {}'.format(sum(second_kind_error) / y_test.shape[0]))

Ошибка первого рода (когда мы принимаем нормального пользователя за бота): 0.06988543511554883
Ошибка второго рода (когда мы принимаем бота за нормального пользователя): 0.3270190028833731
