In [321]:
import numpy as np
import pandas as pd
import modutils
import time, datetime
import sklearn, sklearn.metrics, sklearn.decomposition, sklearn.ensemble, sklearn.linear_model
import collections
import matplotlib.pyplot as plt
import seaborn
from collections import Counter

# Path template for the saved .npy batches; formatted with the integer batch
# number, zero-padded to three digits.
# NOTE(review): absolute local path - the notebook only runs on a machine with this layout.
dev_batch_format = 'D:/Jupyter/DataSets/prv/mobnet_batch{0:03d}.npy'
# Batch numbers 0..100 that make up the development set.
dev_batches = range(101)

def read_batches(batches, batch_format=None):
    """Load the numbered ``.npy`` batches and stack them row-wise.

    Parameters
    ----------
    batches : iterable of int
        Batch numbers substituted into the path template.
    batch_format : str, optional
        Path template taking one positional integer. When omitted, the
        module-level ``dev_batch_format`` is used.

    Returns
    -------
    numpy.ndarray
        ``np.vstack`` of every batch that loaded, in the order requested.

    Raises
    ------
    ValueError
        If none of the requested batches could be loaded.
    """
    if batch_format is None:
        batch_format = dev_batch_format
    res = []
    for x in batches:
        path = batch_format.format(x)
        try:
            res.append(np.load(path))
        except (OSError, ValueError, EOFError) as err:
            # Loading stays best-effort (a missing or corrupt batch is skipped),
            # but only I/O and format errors are tolerated, so KeyboardInterrupt
            # and programming errors are no longer swallowed. The message names
            # the offending file.
            print('failed load', path, err)
    if not res:
        raise ValueError('no batches could be loaded')
    return np.vstack(res)

def split_full(full):
    """Split a 2-D array into ``(columns 2.., column 0, column 1)``.

    Callers in this notebook unpack the result as ``X, YM, YF``.
    """
    col0 = full[:, 0]
    col1 = full[:, 1]
    remaining = full[:, 2:]
    return remaining, col0, col1

def transform_models(X, models):
    """Build a logit feature matrix from the positive-class probabilities of ``models``.

    Parameters
    ----------
    X : array-like
        Passed unchanged to each model's ``predict_proba``.
    models : sequence
        Objects exposing ``predict_proba(X)``; column 1 of the result is used.

    Returns
    -------
    numpy.ndarray
        Array with one row per sample and one column per model, holding
        ``log(p / (1 - p))``.
    """
    # Probabilities saturated at exactly 0 or 1 would give -inf/+inf (plus a
    # divide-by-zero warning), and downstream estimators cannot consume them.
    # Clamp to the nearest representable interior values; every other
    # probability passes through unchanged.
    lowest = np.finfo(float).tiny
    highest = np.nextafter(1.0, 0.0)
    logits = []
    for m in models:
        p = np.asarray(m.predict_proba(X)[:, 1], dtype=float)
        p = np.clip(p, lowest, highest)
        logits.append(np.log(p / (1 - p)))
    return np.vstack(logits).T

In [280]:
# Load every development batch; dev0_* keep all rows, including those with no YF label.
dev0_X, dev0_YM, dev0_YF = split_full(read_batches(dev_batches))

# Keep only rows whose YF target is present (mask computed once and reused).
has_yf = ~np.isnan(dev0_YF)
dev_X = dev0_X[has_yf]
dev_YM = dev0_YM[has_yf]
dev_YF = dev0_YF[has_yf]

# Random ~90/10 train/test split. A dedicated, seeded generator makes the split
# (and every metric derived from it) reproducible across kernel restarts.
rndval = np.random.RandomState(0).uniform(size=len(dev_X))
rndcut = 0.9
is_train = rndval < rndcut
train_X = dev_X[is_train]
train_YF = dev_YF[is_train]
train_YM = dev_YM[is_train]
test_X = dev_X[~is_train]
test_YF = dev_YF[~is_train]
test_YM = dev_YM[~is_train]
assert len(train_X) + len(test_X) == len(dev_X)

In [281]:
%%time
# L2-regularised logistic regression on the YF target.
m_l2r_f = sklearn.linear_model.LogisticRegression(penalty='l2', C=0.001)
m_l2r_f.fit(train_X, train_YF)
# Gini = 2 * AUC - 1, printed for the train split and then the test split.
print(
    2 * sklearn.metrics.roc_auc_score(train_YF, m_l2r_f.predict_proba(train_X)[:, 1]) - 1,
    2 * sklearn.metrics.roc_auc_score(test_YF, m_l2r_f.predict_proba(test_X)[:, 1]) - 1,
)

0.823859486835 0.798403255299
Wall time: 19 s


In [282]:
%%time
# L1-regularised logistic regression on the YF target.
# The solver is pinned to 'liblinear': the L1 penalty is not supported by the
# 'lbfgs' solver that newer scikit-learn releases use by default, so relying on
# the default makes this cell raise after an upgrade.
m_l1r_f = sklearn.linear_model.LogisticRegression(
    penalty='l1', C=0.05, solver='liblinear'
).fit(train_X, train_YF)
# Gini = 2 * AUC - 1, printed for the train split and then the test split.
print(sklearn.metrics.roc_auc_score(train_YF, m_l1r_f.predict_proba(train_X)[:,1]) * 2 - 1,\
    sklearn.metrics.roc_auc_score(test_YF, m_l1r_f.predict_proba(test_X)[:, 1]) * 2 - 1)

0.825619559525 0.795129070197
Wall time: 47.2 s


In [283]:
# Stack the two base models: their logit outputs become the features of a
# second-level logistic regression.
models0 = [m_l1r_f, m_l2r_f]
m_lr_f = sklearn.linear_model.LogisticRegression(C=1.0)
m_lr_f.fit(transform_models(train_X, models0), train_YF)

In [284]:
# (train, test) Gini of the stacked model; Gini = 2 * AUC - 1.
(
    2 * sklearn.metrics.roc_auc_score(train_YF, m_lr_f.predict_proba(transform_models(train_X, models0))[:, 1]) - 1,
    2 * sklearn.metrics.roc_auc_score(test_YF, m_lr_f.predict_proba(transform_models(test_X, models0))[:, 1]) - 1,
)

(0.82544560685035706, 0.79327306909783935)

In [285]:
# Positive-class probability from the L2 logistic model for every row of the
# unfiltered development set (dev0_X still contains rows whose YF label is NaN).
dev0_PF = m_l2r_f.predict_proba(dev0_X)[:,1]

In [243]:
# Mean of test_YF over the test rows whose predicted probability exceeds 0.3.
# NOTE(review): test_PF is not assigned in any visible cell, so this fails on a
# fresh kernel - TODO confirm which model's test-set probabilities it should hold.
test_YF[test_PF>0.3].mean()

0.36307053941908712

In [213]:
%%time
# Gradient-boosted trees on the YF target (the recorded run took about 21 minutes).
m_gb_f = sklearn.ensemble.GradientBoostingClassifier(
    n_estimators=100,
    max_depth=3,
    min_samples_leaf=3,
)
m_gb_f.fit(train_X, train_YF)

Wall time: 21min 8s


In [214]:
# (train, test) Gini of the gradient-boosting model; Gini = 2 * AUC - 1.
(
    2 * sklearn.metrics.roc_auc_score(train_YF, m_gb_f.predict_proba(train_X)[:, 1]) - 1,
    2 * sklearn.metrics.roc_auc_score(test_YF, m_gb_f.predict_proba(test_X)[:, 1]) - 1,
)

(0.86652417954866623, 0.75719303596030363)

In [81]:
%%time
# Random forest on the YF target. The forest is stochastic (bootstrap samples and
# random feature subsets), so random_state is fixed to make the fitted model and
# the Gini values reported for it reproducible across runs.
m_rf_f = sklearn.ensemble.RandomForestClassifier(
    max_depth=3, min_samples_leaf=3, n_estimators=500, random_state=0
)
m_rf_f.fit(train_X, train_YF)

Wall time: 1min 7s


In [82]:
# (train, test) Gini of the random-forest model; Gini = 2 * AUC - 1.
(
    2 * sklearn.metrics.roc_auc_score(train_YF, m_rf_f.predict_proba(train_X)[:, 1]) - 1,
    2 * sklearn.metrics.roc_auc_score(test_YF, m_rf_f.predict_proba(test_X)[:, 1]) - 1,
)

(0.78752977891760789, 0.57370618277697094)

In [165]:
# Positions (within the test split) of rows predicted above 0.4 whose YF label is 0.
# np.flatnonzero returns the indices of the True entries directly, without first
# materialising a full index array.
# NOTE(review): test_PF is not assigned in any visible cell - TODO confirm its source.
np.flatnonzero((test_PF > 0.4) & (test_YF == 0))

array([   0,    7,   28,  100,  245,  249,  257,  287,  326,  331,  358,
        383,  400,  458,  471,  495,  503,  515,  520,  562,  629,  709,
        717,  719,  722,  776,  777,  780,  842,  899,  902,  940,  964,
        968,  980, 1034, 1041, 1060, 1087, 1143, 1185, 1212, 1217, 1294,
       1317, 1322, 1364, 1378, 1452, 1460, 1499, 1525, 1535, 1585, 1605,
       1728, 1782, 1795, 1830, 1842, 1862, 1928, 1957, 1980, 2008, 2016,
       2053, 2077, 2104, 2123, 2164, 2173, 2196, 2201, 2204, 2230, 2290,
       2301, 2336, 2338, 2352, 2398, 2495, 2502, 2546])

In [166]:
# Reload batches 0..39 and split them into features and the two target columns.
# NOTE(review): these batch numbers are a subset of dev_batches (0..100), so the
# rows overlap with the data the models were trained on - metrics computed on
# all_* are not out-of-sample. Rows with a NaN YF label are not filtered here.
all_X,all_YM,all_YF = split_full(read_batches(range(40)))

In [167]:
# Positive-class probability of the stacked model (m_lr_f over the logits of
# models0) for every row of all_X.
all_PF = m_lr_f.predict_proba(transform_models(all_X, models0))[:,1]

In [168]:
# Gini (2 * AUC - 1) of the stacked model's probabilities against all_YF.
sklearn.metrics.roc_auc_score(all_YF, all_PF)*2-1

0.78821123786037717

In [287]:
# Raw per-image metadata: semicolon-delimited, cp1251-encoded CSV.
# NOTE(review): a later cell assigns dev0_PF to this frame as a column, which
# presumes one CSV row per batch row in the same order - TODO confirm.
# NOTE(review): absolute local path - not portable to other machines.
src = pd.read_csv('D:/Jupyter/DataSets/prv/raw_image.csv', delimiter=';', encoding='cp1251', quotechar='"')

In [310]:
# Attach the model probability to the metadata frame.
src['p_f'] = dev0_PF
# Linear score on the log-odds scale: 533 - 36 * ln(p / (1 - p)); a higher
# probability gives a lower score.
src['score_f'] = 533 - 36 * np.log(dev0_PF / (1 - dev0_PF))
# Score rounded down to multiples of 10 and of 50, then limited to [500, 800].
src['score_f10'] = np.clip(10 * np.floor(src['score_f'] / 10).astype(np.int32), 500, 800)
src['score_f50'] = np.clip(50 * np.floor(src['score_f'] / 50).astype(np.int32), 500, 800)

In [366]:
# Per 10-point score band: target rates and counts, band size and mean predicted probability.
src.groupby('score_f10').agg({
    'trg_f': ['mean', 'sum', 'count'],
    'trg_m': ['mean', 'sum', 'count'],
    'score_f': 'count',
    'p_f': 'mean',
})

Unnamed: 0_level_0,trg_f,trg_f,trg_f,trg_m,trg_m,trg_m,score_f,p_f
Unnamed: 0_level_1,mean,sum,count,mean,sum,count,count,mean
score_f10,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
500,0.60514,259.0,428,0.081776,35.0,428,541,0.76187
510,0.657407,142.0,216,0.125,27.0,216,263,0.618652
520,0.545139,157.0,288,0.170139,49.0,288,353,0.553265
530,0.504762,212.0,420,0.202381,85.0,420,513,0.485465
540,0.440162,217.0,493,0.249493,123.0,493,605,0.416359
550,0.42576,238.0,559,0.293381,164.0,559,724,0.352597
560,0.35743,267.0,747,0.331995,248.0,747,904,0.291256
570,0.286349,258.0,901,0.407325,367.0,901,1085,0.237214
580,0.238647,247.0,1035,0.44058,456.0,1035,1281,0.190281
590,0.180382,217.0,1203,0.525353,632.0,1203,1496,0.151929


In [335]:
def gather_stat(df, cutoff=0.01):
    """Relative frequency of each tag in ``df.tags``, most frequent first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tags`` column of ``';'``-separated strings; non-string
        cells (e.g. NaN) contribute no tags.
    cutoff : float, default 0.01
        Only tags whose count is strictly greater than ``cutoff`` times the
        total number of tag occurrences are kept.

    Returns
    -------
    list of (str, float)
        ``(tag, share of all tag occurrences)`` pairs sorted by share, descending.
        Empty when the frame holds no tags.
    """
    # Count in a single pass. The previous ``sum(list_of_lists, [])`` flattening
    # re-copied the accumulated list for every row, i.e. quadratic time.
    counts = Counter(
        tag
        for cell in df.tags
        if isinstance(cell, str)
        for tag in cell.split(';')
    )
    tsum = sum(counts.values())
    shares = [(tag, n / tsum) for tag, n in counts.items() if n > cutoff * tsum]
    shares.sort(key=lambda pair: pair[1], reverse=True)
    return shares

In [344]:
%%time
# Tag frequency statistics for each 50-point score band (500, 550, ..., 800),
# stored as (band, gather_stat result) pairs.
tmp = [(x, gather_stat(src[src.score_f50==x])) for x in range(500, 801, 50)]

Wall time: 7.45 s


In [365]:
# First rows that have a score below 400 and no string value in the tags column.
src[(src.score_f < 400) & src.tags.map(lambda tags: type(tags) is not str)].head()

Unnamed: 0,rid,rdt,req_rid,img_id,img_url,user_id,user_name,user_url,upload_dt,tags,views,rating,cmts,local_url,trg_m,trg_f,p_f,score_f,score_f10,score_f50
44729,44730,2018-03-28 23:40:40.907000000,70507,293009,http://porevo.win/pics.php?q=gioQhSbCgKFKCD5mV...,152049,Riser100,http://porevo.win/index.php?action=user&id=152049,2008-01-13,,1385,10,46,D:\Jupyter\Datasets\prv\u0152049_000293009.jpg,,,0.984921,382.547589,500,500


In [None]:
# Test-set Gini (2 * AUC - 1) is about 0.80, i.e. 80%, for the logistic regression models.