In [1]:
import lightgbm as lgb
import os
import math
import pandas as pd
import numpy as np
import random
import gc
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder
import cmath

# Seed both the NumPy and the stdlib RNG with the same value so reruns are repeatable.
SEED = 2019
np.random.seed(SEED)
random.seed(SEED)

# Compact pandas display: few rows/columns, but wide enough that the
# per-class probability vectors stay readable when a frame is printed.
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 280)
pd.set_option('display.max_colwidth', 150)

# Root directory of the raw dataset. Defaults to the original absolute path,
# but can be redirected with the TENCENT_ADS_DATA_PATH environment variable so
# the notebook also runs on machines with a different directory layout.
data_path = os.environ.get(
    'TENCENT_ADS_DATA_PATH',
    '/data/workspace/kimi/tencent_ads/2020/dataset',
)
# Relative directory name for preprocessed artefacts.
preprocess_path = 'preprocess'

In [2]:
# Load the pickled prediction frame (columns shown in the output below:
# user_id, predicted_age, age_percent, label_age).
# NOTE(review): unpickling can execute arbitrary code - only load files
# produced by our own pipeline.
lgb_df = pd.read_pickle('output/lgb_output.pkl')
print(lgb_df)

        user_id  predicted_age                                                                                                                                            age_percent  label_age
0        720001              2  [0.009908854270726448, 0.14613597104169695, 0.24443525882717854, 0.22441327000712435, 0.21186224201232762, 0.10598581377117332, 0.0470795637933028...          2
1        720002              1  [0.01172991836721728, 0.45031737205034816, 0.3703733788360391, 0.13413562259865827, 0.025048104984771384, 0.004839598806303613, 0.0030107443163341...          1
2        720003              1  [0.2680329828411943, 0.5789438128660922, 0.13346174479684525, 0.016228518917983124, 0.0029288374766083163, 0.0003096284588735656, 7.26853065721501...          1
...         ...            ...                                                                                                                                                    ...        ...
179997   899998              3  [0.

In [3]:
# precision_score expects (y_true, y_pred): labels first, predictions second.
# With average='micro' on single-label data the value equals plain accuracy and
# is unaffected by the order, but the correct order keeps the call safe if the
# averaging mode is ever changed.
precision_score(lgb_df['label_age'].values, lgb_df['predicted_age'].values, average='micro')


0.44448333333333334

In [4]:
# Keep only the rows of lgb_df whose predicted_age matches label_age.
lgb_hit_mask = lgb_df['predicted_age'] == lgb_df['label_age']
lgb_true_df = lgb_df.loc[lgb_hit_mask]
print(lgb_true_df)

        user_id  predicted_age                                                                                                                                            age_percent  label_age
0        720001              2  [0.009908854270726448, 0.14613597104169695, 0.24443525882717854, 0.22441327000712435, 0.21186224201232762, 0.10598581377117332, 0.0470795637933028...          2
1        720002              1  [0.01172991836721728, 0.45031737205034816, 0.3703733788360391, 0.13413562259865827, 0.025048104984771384, 0.004839598806303613, 0.0030107443163341...          1
2        720003              1  [0.2680329828411943, 0.5789438128660922, 0.13346174479684525, 0.016228518917983124, 0.0029288374766083163, 0.0003096284588735656, 7.26853065721501...          1
...         ...            ...                                                                                                                                                    ...        ...
179996   899997              2  [0.

In [5]:
# Read the user table (user_id, age, gender) from the dataset directory.
user_csv_path = f'{data_path}/train_preliminary/user.csv'
label_df = pd.read_csv(user_csv_path)
print(label_df)

        user_id  age  gender
0             1    4       1
1             2   10       1
2             3    7       2
...         ...  ...     ...
899997   899998    4       2
899998   899999    3       1
899999   900000    3       2

[900000 rows x 3 columns]


In [11]:
# Load the second model's pickled predictions and prefix its prediction columns
# with "trans_" so they do not collide with the identically named columns of
# lgb_df when the frames are merged on user_id.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
transform_df = (
    pd.read_pickle('output/transform_valid_ret.pkl')
    .rename(columns={
        'predicted_age': 'trans_predicted_age',
        'age_percent': 'trans_age_percent',
    })
)
print(transform_df)

        user_id  trans_predicted_age                                                                                                                       trans_age_percent
0        720001                    3        [0.019144528, 0.20491104, 0.30700263, 0.26462606, 0.11737743, 0.058445495, 0.01969487, 0.0040292162, 0.0026491152, 0.0021196164]
1        720002                    3  [0.012328157, 0.34160623, 0.43078998, 0.17672162, 0.031835053, 0.0048781997, 0.0014903151, 0.00019775821, 9.150131e-05, 6.1132494e-05]
2        720003                    2  [0.10638821, 0.7450692, 0.13821691, 0.009317869, 0.0008014458, 0.00012383069, 6.91669e-05, 7.741359e-06, 1.6611197e-06, 3.9884535e-06]
...         ...                  ...                                                                                                                                     ...
179997   899998                    4     [0.0038891477, 0.069788955, 0.3183199, 0.46571296, 0.121309556, 0.01516629, 0.0044962084, 0.00

In [16]:
# Attach the ground-truth columns from label_df to the second model's predictions.
# Any label columns left over from a previous run of this cell are dropped first,
# which makes the cell idempotent: without this, a re-run would merge again,
# produce age_x / age_y columns and break the `age` lookup below.
label_columns = [col for col in label_df.columns if col != 'user_id']
transform_df = (
    transform_df
    .drop(columns=label_columns, errors='ignore')
    .merge(label_df, on='user_id')
)

# Rows where the prediction equals the true age.
# NOTE(review): judging by the displayed rows, trans_predicted_age and age both
# look 1-based - confirm against the producer of transform_valid_ret.pkl.
# The (misspelled) name `transfrom_true_df` is kept because a later cell uses it.
transfrom_true_df = transform_df[transform_df['trans_predicted_age'] == transform_df['age']]
print(transfrom_true_df)

        user_id  trans_predicted_age                                                                                                                       trans_age_percent  age  gender
0        720001                    3        [0.019144528, 0.20491104, 0.30700263, 0.26462606, 0.11737743, 0.058445495, 0.01969487, 0.0040292162, 0.0026491152, 0.0021196164]    3       2
2        720003                    2  [0.10638821, 0.7450692, 0.13821691, 0.009317869, 0.0008014458, 0.00012383069, 6.91669e-05, 7.741359e-06, 1.6611197e-06, 3.9884535e-06]    2       1
3        720004                    3        [0.002538805, 0.050856013, 0.28240168, 0.26022783, 0.23981091, 0.11722591, 0.04000003, 0.005330922, 0.0012713585, 0.00033655117]    3       1
...         ...                  ...                                                                                                                                     ...  ...     ...
179993   899994                    4        [0.030702211, 0.1347148, 0

In [17]:
# Users that appear in both "correct prediction" subsets, i.e. rows present in
# transfrom_true_df as well as in lgb_true_df (inner join on user_id).
inner_df = pd.merge(
    transfrom_true_df,
    lgb_true_df,
    how='inner',
    on='user_id',
)
print(inner_df)

       user_id  trans_predicted_age                                                                                                                        trans_age_percent  age  gender  predicted_age  \
0       720001                    3         [0.019144528, 0.20491104, 0.30700263, 0.26462606, 0.11737743, 0.058445495, 0.01969487, 0.0040292162, 0.0026491152, 0.0021196164]    3       2              2   
1       720003                    2   [0.10638821, 0.7450692, 0.13821691, 0.009317869, 0.0008014458, 0.00012383069, 6.91669e-05, 7.741359e-06, 1.6611197e-06, 3.9884535e-06]    2       1              1   
2       720004                    3         [0.002538805, 0.050856013, 0.28240168, 0.26022783, 0.23981091, 0.11722591, 0.04000003, 0.005330922, 0.0012713585, 0.00033655117]    3       1              2   
...        ...                  ...                                                                                                                                      ...  ...     ..

In [19]:
# Put both models' outputs side by side, one row per lgb_df row.
# A left join keeps every lgb_df row; users missing from transform_df would get
# NaN in the trans_* columns.
total_df = pd.merge(
    lgb_df,
    transform_df,
    how='left',
    on='user_id',
)
print(total_df)

        user_id  predicted_age                                                                                                                                            age_percent  label_age  trans_predicted_age  \
0        720001              2  [0.009908854270726448, 0.14613597104169695, 0.24443525882717854, 0.22441327000712435, 0.21186224201232762, 0.10598581377117332, 0.0470795637933028...          2                    3   
1        720002              1  [0.01172991836721728, 0.45031737205034816, 0.3703733788360391, 0.13413562259865827, 0.025048104984771384, 0.004839598806303613, 0.0030107443163341...          1                    3   
2        720003              1  [0.2680329828411943, 0.5789438128660922, 0.13346174479684525, 0.016228518917983124, 0.0029288374766083163, 0.0003096284588735656, 7.26853065721501...          1                    2   
...         ...            ...                                                                                                      

In [None]:
# Blend the two models' per-class probability vectors with a weighted
# arithmetic mean (equal weights reproduce the original 0.5 / 0.5 average).
LGB_WEIGHT = 0.5
TRANS_WEIGHT = 0.5

# Stack the per-row vectors into explicit 2-D matrices (rows x classes).
# This works whether the cells hold lists or ndarrays, whereas arithmetic on
# the object-dtype Series raises for plain lists.
lgb_prob_matrix = np.stack(total_df['age_percent'].tolist())
trans_prob_matrix = np.stack(total_df['trans_age_percent'].tolist())
blended_matrix = lgb_prob_matrix * LGB_WEIGHT + trans_prob_matrix * TRANS_WEIGHT

# Store one 1-D array per row so later per-row processing keeps working.
total_df['total_age'] = pd.Series(list(blended_matrix), index=total_df.index)

print(total_df['age_percent'])
print(total_df['trans_age_percent'])
print(total_df['total_age'])

In [21]:
# Turn the blended probabilities into hard class predictions and score them.
y_pred_percent = total_df['total_age'].values

# np.stack builds a fresh 2-D matrix (rows x classes). The previous shallow
# .copy() of the object array shared the inner arrays with total_df, so the
# in-place one-hot loop overwrote the stored probabilities.
prob_matrix = np.stack(list(y_pred_percent))

# One-hot indicator of each row's maximum (ties mark every maximal entry,
# matching the original loop).
row_max = prob_matrix.max(axis=1, keepdims=True)
y_pred = (prob_matrix == row_max).astype(prob_matrix.dtype)
print(y_pred)

# Index of the first maximal entry per row, as a list of Python ints.
y_result = prob_matrix.argmax(axis=1).tolist()

# precision_score expects (y_true, y_pred). With average='micro' on single-label
# data this equals accuracy, so the reported value does not change.
# NOTE(review): label_age looks like a 0-based class index in the displayed
# rows, which is what argmax yields - confirm.
precision_score(total_df['label_age'].values, y_result, average='micro')



[array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])
 array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]) ...
 array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])
 array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])
 array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])]


0.4486