In [2]:
import pandas as pd
import numpy as np

import scipy.sparse
import sklearn.feature_extraction

# import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm
import platform

pd.set_option("display.max_rows", 10)
pd.set_option('display.max_columns', 1100)

import os

%pylab inline
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


### Loading data from logs

In [3]:
# Project-local parser; its implementation is not visible from this notebook.
from logParser import ParseLogsFromFolder

# Three collections are returned and unpacked here. Later cells treat `values_data`
# as an iterable of dicts and feed `order_data` to a DictVectorizer.
# NOTE(review): the positional `0, 10` presumably select a range of log files
# (the progress bar stops at 10 of 127) - confirm against logParser.
main_data, values_data, order_data = ParseLogsFromFolder('Logs/', 0, 10, only_order=False)

# One row per parsed record; the raw container is dropped right after conversion.
main = pd.DataFrame(main_data)
del(main_data)
# `values_data` is kept alive on purpose: the value-dummy cell still consumes it.
print('Shape: ' + str(main.shape))
main.head()

  8%|▊         | 10/127 [00:18<03:51,  1.98s/it]

Shape: (289269, 3)


Unnamed: 0,User_Agent,id,ip
0,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,1485900038,95.181.252.91
1,Mozilla/5.0 (compatible; bingbot/2.0; +http://...,1485900047,40.77.167.74
2,Mozilla/5.0 (Linux; Android 4.4.2; Zera S Buil...,1485900079,31.148.3.19
3,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) G...,1485900103,188.162.183.66
4,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...,1485900105,37.144.52.103


### Choosing the Top User Agents

In [4]:
# Number of most frequent User-Agent strings that are kept as target classes.
N_TOP_USER_AGENTS = 100

# Frequencies are counted once and reused for both the class list and the row filter.
# value_counts() sorts by descending frequency, so the first entries are the most common.
top_ua = main.User_Agent.value_counts().index[:N_TOP_USER_AGENTS].tolist()

# Rows of `main` whose User-Agent is one of the kept classes. The original row labels
# are preserved; later cells use `main_top_100.index` to select the matching rows
# of the feature matrices.
main_top_100 = main[main.User_Agent.isin(top_ua)]

### Value dummy preparation

In [5]:
# Only these request headers contribute value features.
good_value_keys = {'Accept', 'Accept-Encoding', 'Accept-Charset'}

# One dict per log record, restricted to the whitelisted headers.
good_values_data = [
    {key: value for key, value in record.items() if key in good_value_keys}
    for record in tqdm(values_data, mininterval=1.2)
]
# Release the unfiltered records once the filtered copy exists.
del values_data

# Vectorize the filtered dicts, store the result as int16, then keep only the rows
# that belong to the top User-Agent subset.
values_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)
sparse_values = (
    values_vectorizer
    .fit_transform(good_values_data)
    .astype(np.int16)
)[main_top_100.index]
print(sparse_values[:3])


  0%|          | 0/289269 [00:00<?, ?it/s][A
 52%|█████▏    | 151019/289269 [00:01<00:01, 125848.54it/s][A
100%|██████████| 289269/289269 [00:02<00:00, 142854.97it/s][A

  (0, 81)	1
  (0, 337)	1
  (1, 72)	1
  (1, 107)	1
  (2, 72)	1
  (2, 271)	1


In [5]:
# Dimensions of the value-feature matrix after restricting it to the top User-Agent rows.
sparse_values.shape

(181851, 361)

In [6]:
# The same filtered header records as a plain DataFrame (one column per kept header),
# restricted to the top User-Agent rows by position.
categorical_values = pd.DataFrame(good_values_data).iloc[main_top_100.index]
categorical_values.shape

(181851, 3)

### Order dummy Preparation

In [7]:
# Vectorize the per-record order dicts into a sparse int8 matrix with one column per key.
# NOTE(review): the stored value is presumably the header's position in the request
# (the next cell compares these values to derive "A < B" features) - confirm in logParser.
orders_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)
# NOTE: `sparse_orders` is rebound to the pairwise-order matrix in a later cell, so
# re-running the row-selection cell after that point operates on a different matrix.
sparse_orders = orders_vectorizer.fit_transform(order_data).astype(np.int8)
# The raw order dicts are no longer needed once they are vectorized.
del(order_data)

In [8]:
from itertools import combinations

# Order matrix restricted to the top User-Agent rows. CSR layout is enforced because
# the loop below reads the raw indptr / indices / data arrays.
sparse_orders_top_100 = sparse_orders[main_top_100.index].tocsr()

# Hoisted out of the loop: the feature-name lookup table and the three CSR arrays.
order_feature_names = orders_vectorizer.feature_names_
row_pointers = sparse_orders_top_100.indptr
column_indices = sparse_orders_top_100.indices
stored_values = sparse_orders_top_100.data

# For every record, build a dict with one "A < B" key per pair of stored features,
# oriented so that the feature with the smaller stored value comes first
# (ties fall into the second orientation).
# Reading each row's slice of the CSR arrays avoids constructing a one-row sparse
# matrix per record and a scalar sparse lookup per comparison, which dominate the
# run time of the loop.
pairs_dict_list = []
for row_index in tqdm(range(sparse_orders_top_100.shape[0]), mininterval=3):
    row_start, row_end = row_pointers[row_index], row_pointers[row_index + 1]
    row_entries = zip(column_indices[row_start:row_end], stored_values[row_start:row_end])
    pairs_dict = {}
    for (pair_first, value_first), (pair_second, value_second) in combinations(row_entries, 2):
        name_first = order_feature_names[pair_first]
        name_second = order_feature_names[pair_second]
        if value_first < value_second:
            pairs_dict['{0} < {1}'.format(name_first, name_second)] = 1
        else:
            pairs_dict['{0} < {1}'.format(name_second, name_first)] = 1
    pairs_dict_list.append(pairs_dict)


  0%|          | 0/181851 [00:00<?, ?it/s][A
  1%|          | 1109/181851 [00:03<08:09, 369.21it/s][A
  1%|          | 2129/181851 [00:06<08:19, 359.87it/s][A
100%|██████████| 181851/181851 [09:19<00:00, 324.82it/s]


In [9]:
# One-hot encode the per-record "A < B" dicts: one column per distinct pair key.
dummy_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=True, dtype=float)
# NOTE: this rebinds `sparse_orders`. From here on it is the pairwise-order matrix
# restricted to the top User-Agent rows, not the per-header matrix built earlier.
sparse_orders = dummy_vectorizer.fit_transform(pairs_dict_list).astype(np.int8)
# Sanity check: the row count must match the number of target labels.
print(type(sparse_orders))
print('Sparse dummy shape: \n{0}'.format(sparse_orders.shape))
print('User Agent shape: \n{0}'.format(main_top_100.User_Agent.shape))

<class 'scipy.sparse.csr.csr_matrix'>
Sparse dummy shape: 
(181851, 1042)
User Agent shape: 
(181851,)


### TF-IDF Features Preparation

In [10]:
# Re-weight the pairwise-order indicator matrix with TF-IDF. The transformer is applied
# to an already vectorized matrix, so no raw text is involved here.
tf_idf_vectorizer = sklearn.feature_extraction.text.TfidfTransformer()
sparse_tf_idf = tf_idf_vectorizer.fit_transform(sparse_orders)
# The shape must match `sparse_orders`; only the stored values change.
print(sparse_tf_idf.shape)
print(type(sparse_tf_idf))

(181851, 1042)
<class 'scipy.sparse.csr.csr_matrix'>


### Principal Components Analysis

In [11]:
from sklearn.decomposition import PCA

# Fixed seed so the projections are reproducible across runs: for inputs of this size
# PCA may select a randomized SVD solver.
PCA_RANDOM_STATE = 0
pca = PCA(n_components=20, random_state=PCA_RANDOM_STATE)

# PCA needs dense input. `.toarray()` yields plain ndarrays, whereas `.todense()`
# returns np.matrix, which recent scikit-learn releases refuse as estimator input.
# NOTE: each call materializes the full rows x columns matrix in memory.
dense_values = sparse_values.toarray()
dense_orders = sparse_orders.toarray()
dense_tf_idf = sparse_tf_idf.toarray()

In [12]:
%%time
# Project each dense view onto its own principal components.
# NOTE: the single `pca` object is refitted by every call, so after this cell it only
# holds the components fitted on `dense_tf_idf`; the earlier fits are not kept.
pca_orders = pca.fit_transform(dense_orders)
pca_values = pca.fit_transform(dense_values)
pca_tf_idf = pca.fit_transform(dense_tf_idf)

CPU times: user 4min 22s, sys: 26.7 s, total: 4min 49s
Wall time: 45.6 s


Итого у нас есть 7 разных представлений данных:

- sparse_orders - dummy
- sparse_values - dummy
- sparse_tf_idf - real
- pca_orders - real
- pca_values - real
- pca_tf_idf - real
- categorical_values - categorical

На этих данных и их комбинациях хочу попробовать несколько принципиально разных классификаторов:

- **LogisticRegression** (or another linear model such as Lasso or Ridge) - for all
- **MultinomialNaiveBayes** - for dummy - very fast
- **XGBClassifier** - for real
- **CatBoostClassifier** - for pca - very slow
- **Blending**

**Порог** положительного класса - 0.024

**Меры качества** - f1-score, accuracy and mean_answers 

### Merging

In [13]:
%%time

from scipy.sparse import hstack

sparse_orders_values = hstack((sparse_orders, sparse_values)).tocsr()
sparse_tfidf_values = hstack((sparse_tf_idf, sparse_values)).tocsr()
sparse_all = hstack((sparse_orders, sparse_tf_idf, sparse_values)).tocsr()



CPU times: user 3.12 s, sys: 71.9 ms, total: 3.19 s
Wall time: 3.19 s


# LogisticRegression

In [16]:
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score, train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, make_scorer
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing

# One indicator column per kept User-Agent string; every row has its true class set to 1.
lb = preprocessing.LabelBinarizer().fit(top_ua)
y = lb.transform(main_top_100.User_Agent)
y.shape

(181851, 100)

In [19]:
# Every feature view under the name used to report its scores.
all_data = dict(
    sparse_orders=sparse_orders,
    sparse_values=sparse_values,
    sparse_tf_idf=sparse_tf_idf,
    pca_orders=pca_orders,
    pca_values=pca_values,
    pca_tf_idf=pca_tf_idf,
    sparse_orders_values=sparse_orders_values,
    sparse_tfidf_values=sparse_tfidf_values,
    sparse_all=sparse_all,
)

In [20]:
# Out-of-fold class scores for every feature view, keyed by view name.
predicted_cross_val_labels = {}
# One independent logistic regression per User-Agent class (one-vs-rest).
clf = OneVsRestClassifier(LogisticRegression(C=100))
for data_name, data_view in all_data.items():
    print(data_name)
    # No `cv` argument is passed, so the library's default splitter is used.
    # NOTE(review): that default differs between scikit-learn releases - pin `cv`
    # explicitly if the scores must be comparable across environments.
    %time predicted_cross_val_labels[data_name] = cross_val_predict(clf, data_view, y, method='predict_proba', n_jobs=10)

sparse_orders
CPU times: user 3.25 s, sys: 697 ms, total: 3.95 s
Wall time: 6min 8s
sparse_values
CPU times: user 2.74 s, sys: 703 ms, total: 3.44 s
Wall time: 1min 28s
sparse_tf_idf
CPU times: user 2.87 s, sys: 737 ms, total: 3.6 s
Wall time: 3min 45s
pca_orders
CPU times: user 3.1 s, sys: 717 ms, total: 3.82 s
Wall time: 5min 36s
pca_values
CPU times: user 2.99 s, sys: 695 ms, total: 3.69 s
Wall time: 3min 27s
pca_tf_idf
CPU times: user 3.03 s, sys: 708 ms, total: 3.73 s
Wall time: 4min 15s
sparse_orders_values
CPU times: user 2.97 s, sys: 654 ms, total: 3.63 s
Wall time: 6min 5s
sparse_tfidf_values
CPU times: user 2.68 s, sys: 532 ms, total: 3.22 s
Wall time: 4min 1s
sparse_all
CPU times: user 3.58 s, sys: 662 ms, total: 4.24 s
Wall time: 9min 33s


In [14]:
def get_f1_score(alpha, y, y_hat):
    """
    Sample-averaged F1 score of the thresholded predictions.

    :param alpha: Threshold; a class counts as predicted when its score is strictly above it
    :param y: true indicator matrix
    :param y_hat: predicted score matrix of the same shape as y
    :return: value of f1_score computed with average='samples'
    """
    binarized_predictions = (y_hat > alpha).astype('int')
    return f1_score(y, binarized_predictions, average='samples')

def mean_answers(alpha, y_cross_val):
    """
    Average number of classes per sample whose score is strictly above the threshold.

    :param alpha: Threshold
    :param y_cross_val: per-sample class scores (2-D array, or any iterable of score sequences)
    :return: mean count of scores above alpha per sample
    """
    # Fast path: a 2-D array is thresholded and counted in one vectorized pass instead
    # of filtering every score in a Python-level loop.
    if isinstance(y_cross_val, np.ndarray) and y_cross_val.ndim == 2:
        return np.mean(np.count_nonzero(y_cross_val > alpha, axis=1))
    # Fallback keeps support for lists and rows of unequal length.
    # len() of a list is used rather than sum(): `%pylab` can shadow the builtin sum.
    return np.mean([len([score for score in y_obs if score > alpha]) for y_obs in y_cross_val])

def thresholded_score(alpha, y, y_cross_val):
    """
    Share of samples whose true class is among the classes scored above the threshold.

    :param alpha: Threshold; a class counts as predicted when its score is strictly above it
    :param y: true indicator matrix, the true class of a row is its argmax
    :param y_cross_val: predicted score matrix from cross_val_predict, same shape as y
    :return: fraction (float between 0 and 1) of samples whose true class passed the threshold
    :raises ZeroDivisionError: if y has zero rows
    """
    y = np.asarray(y)
    y_cross_val = np.asarray(y_cross_val)
    n_samples = y.shape[0]

    # Column of the true class for every row.
    true_labels = np.argmax(y, axis=1)
    # Predicted score of the true class, picked for all rows at once. A sample is
    # "correct" exactly when this score exceeds alpha, so no per-row search is needed.
    true_class_scores = y_cross_val[np.arange(n_samples), true_labels]
    correct_answers = int(np.count_nonzero(true_class_scores > alpha))

    return correct_answers / n_samples

In [23]:
# Score threshold above which a class counts as predicted.
threshold = 0.024

# One result dict per metric, each keyed by feature-view name.
logistic_f1_score, logistic_mean_answers, logistic_threshold_score = {}, {}, {}
for name, predicted_labels in tqdm(predicted_cross_val_labels.items()):
    logistic_f1_score[name] = get_f1_score(alpha=threshold, y=y, y_hat=predicted_labels)
    logistic_mean_answers[name] = mean_answers(alpha=threshold, y_cross_val=predicted_labels)
    logistic_threshold_score[name] = thresholded_score(
        alpha=threshold, y=y, y_cross_val=predicted_labels)

100%|██████████| 9/9 [02:00<00:00, 13.31s/it]


In [24]:
# Sample-averaged F1 per feature view at the chosen threshold.
logistic_f1_score

{'pca_orders': 0.54281028891275374,
 'pca_tf_idf': 0.52263168655588799,
 'pca_values': 0.41769524752285597,
 'sparse_all': 0.61512667257857145,
 'sparse_orders': 0.57264899723701346,
 'sparse_orders_values': 0.61449762404110297,
 'sparse_tf_idf': 0.57175708390988234,
 'sparse_tfidf_values': 0.61501543657590452,
 'sparse_values': 0.42241808004754056}

In [25]:
# Average number of classes scored above the threshold per sample, per feature view.
logistic_mean_answers

{'pca_orders': 5.1480222819781032,
 'pca_tf_idf': 5.2009447294763298,
 'pca_values': 6.6392981066917418,
 'sparse_all': 4.6049238112520694,
 'sparse_orders': 5.033263495938983,
 'sparse_orders_values': 4.6126884097420415,
 'sparse_tf_idf': 5.0067142880710032,
 'sparse_tfidf_values': 4.6131283303363739,
 'sparse_values': 6.6436038295087734}

In [26]:
# Share of samples whose true class is scored above the threshold, per feature view.
logistic_threshold_score

{'pca_orders': 0.8951009342813622,
 'pca_tf_idf': 0.877493112493195,
 'pca_values': 0.883063607018933,
 'sparse_all': 0.9288098498221071,
 'sparse_orders': 0.9053180900847397,
 'sparse_orders_values': 0.9289638220301235,
 'sparse_tf_idf': 0.9059009848722306,
 'sparse_tfidf_values': 0.9302945818279801,
 'sparse_values': 0.8861210551495455}

На данном пороге лучшие результаты, когда в выборке есть sparse_values

In [44]:
# Shape check for the dense input of the XGBoost cells: both PCA projections side by side.
np.hstack((pca_orders, pca_values)).shape

(181851, 40)

In [19]:
from xgboost import XGBClassifier

# Small, fast configuration (10 shallow trees).
xgb_params = {
    'learning_rate': 1,
    'max_depth': 3,
    # Every one-vs-rest sub-problem is binary and its scores are read through
    # predict_proba, so the booster must use the logistic objective. A regression
    # objective produces unbounded outputs that are not probabilities.
    'objective': 'binary:logistic',
}

xgb_clf = OneVsRestClassifier(XGBClassifier(**xgb_params, n_estimators=10))

# Out-of-fold class scores on the concatenated PCA projections.
xgb_labels = cross_val_predict(
    xgb_clf, 
    np.hstack((pca_orders, pca_values)),
    y, method='predict_proba')

In [None]:
from xgboost import XGBClassifier

# Slower configuration: 100 deeper trees with row and column subsampling.
xgb_params = {
    'learning_rate': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    # Every one-vs-rest sub-problem is binary and its scores are read through
    # predict_proba, so the booster must use the logistic objective. A regression
    # objective produces unbounded outputs that are not probabilities.
    'objective': 'binary:logistic',
    # NOTE(review): later XGBoost releases replace `silent` with `verbosity` -
    # confirm against the installed version.
    'silent': 1
}

xgb_clf = OneVsRestClassifier(XGBClassifier(**xgb_params, n_estimators=100))
# Out-of-fold class scores on the concatenated PCA projections.
xgb_labels = cross_val_predict(
    xgb_clf, 
    np.hstack((pca_orders, pca_values)),
    y, method='predict_proba', n_jobs=10)

