In [1]:
import pandas as pd
import numpy as np
import random
from load_personality_data import get_data
from load_training_data import load_training_data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [2]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [3]:
def transform_age(x):
    """Map a raw age to an ordinal bucket code between 1 and 13.

    Each bucket is defined by an exclusive upper bound; the first bound that
    ``x`` is strictly below decides the bucket. Values that are not below any
    bound (51 and above, and anything for which every ``<`` comparison is
    false) fall into the last bucket, 13.
    """
    # Exclusive upper bounds of buckets 1..12, in ascending order.
    upper_bounds = (18, 22, 25, 26, 27, 28, 29, 30, 31, 36, 41, 51)
    for bucket, bound in enumerate(upper_bounds, start=1):
        if x < bound:
            return bucket
    return len(upper_bounds) + 1

In [4]:
def check(num):
    """Convert a raw value to ``int``, or return NaN when it is missing.

    Parameters
    ----------
    num : str, number, or None
        The raw value. Strings are stripped of surrounding whitespace first.

    Returns
    -------
    int or float
        ``int(num)`` for a usable value; ``np.nan`` for an empty or
        whitespace-only string (or any other empty sized value), ``None``
        or NaN.

    Raises
    ------
    ValueError
        If a non-empty string cannot be parsed by ``int`` (unchanged from the
        previous behaviour).
    """
    if isinstance(num, str):
        # Whitespace-only strings count as missing instead of crashing int().
        num = num.strip()
    if hasattr(num, '__len__'):
        return int(num) if len(num) != 0 else np.nan
    # Unsized scalars: None and NaN are missing (NaN is the only value != itself).
    if num is None or num != num:
        return np.nan
    return int(num)

In [45]:
# Load the raw training table and encode its columns
data = load_training_data()
del data['language']
data['age'] = data['age'].apply(transform_age)

# Integer-encode the categorical columns; the single encoder is refit for each column.
labelencoder = LabelEncoder()
for categorical_column in ('genre', 'lang'):
    data[categorical_column] = labelencoder.fit_transform(data[categorical_column])
# check() yields NaN for empty city_level values; those rows are removed by dropna below.
data['city_level'] = data['city_level'].apply(check)
data['sid'] = labelencoder.fit_transform(data['sid'])
# Sample weight: 1 where label == 0, otherwise 2.
data['weight'] = np.where(data['label'] == 0, 1, 2)
# Drop every row that still contains a missing value.
data.dropna(inplace=True)

In [6]:
# Build cross (interaction) features
combined_filt_data = data.copy()
columns = combined_filt_data.columns
# Cast every column except the user id and the target to str so that
# column values can be concatenated below.
for column in columns:
    if column not in ('uin', 'label'):
        combined_filt_data[column] = combined_filt_data[column].apply(str)

left = ['sid', 'singer_id', 'genre', 'lang']
right = ['age', 'gender', 'city_id', 'prof']
labelencoder = LabelEncoder()
# For each (left, right) pair, join the two string columns with '_' and
# integer-encode the combined value into a new '<left>_<right>' column.
for c1 in left:
    for c2 in right:
        crossed_name = c1 + '_' + c2
        crossed_values = combined_filt_data[c1] + '_' + combined_filt_data[c2]
        combined_filt_data[crossed_name] = labelencoder.fit_transform(crossed_values)

In [72]:
# Build personality features
# get_data is imported from load_personality_data; what the `change` and `pca`
# flags do is not visible in this notebook — presumably pca=False returns the raw
# per-user scores rather than PCA components (TODO confirm).
personality_data = get_data(change=False,pca=False)
personality_data.head()

Unnamed: 0,uin,social,family,friend,humans,affect,posemo,negemo,anx,anger,...,time,work,achieve,leisure,home,money,relig,death,assent,nonfl
0,103000,6.82,1.62,0.24,1.58,6.78,4.67,1.3,0.24,0.41,...,5.36,2.4,1.79,1.58,0.57,0.89,0.16,0.04,6.09,0.16
1,132000,8.68,0.35,0.14,1.6,10.56,6.25,1.6,0.35,0.07,...,3.75,2.01,2.08,0.62,0.0,0.0,0.21,0.14,5.0,0.07
2,1210000,6.3,0.48,0.35,0.73,7.76,3.77,2.98,0.73,0.73,...,6.16,1.49,1.11,3.01,0.76,0.8,0.38,0.28,6.48,0.14
3,2512000,8.15,0.52,0.78,3.1,8.8,5.69,2.33,0.39,0.65,...,8.28,1.03,1.03,1.68,0.91,0.39,0.0,0.0,7.12,1.16
4,2867000,9.78,0.7,0.28,3.39,7.63,3.08,3.01,0.4,0.64,...,4.54,1.72,1.55,0.59,0.16,0.43,0.19,0.45,4.95,1.09


In [73]:
# Inner join on 'uin': only rows whose uin appears in both frames are kept.
personality = pd.merge(data, personality_data, how='inner', on=['uin'])

In [46]:
# Build the training frame without cross features
training_data = data.copy()
# Separate the target from the features.
label = training_data.pop('label')
# Remove the user id plus singer_id, genre, lang and city_id.
# 'sid' and 'city_level' are kept (they were candidates for removal as well).
training_data = training_data.drop(columns=['uin', 'singer_id', 'genre', 'lang', 'city_id'])

In [221]:
# Build the training frame from cross features only
# Alternative feature set based on the sid crosses:
#   ['sid_age', 'sid_gender', 'sid_city_id', 'sid_prof', 'weight']
crossed_columns = [
    prefix + '_' + suffix
    for prefix in ('singer_id', 'genre', 'lang')
    for suffix in ('age', 'gender', 'city_id', 'prof')
]
# Features are the twelve crossed columns followed by the sample weight;
# the user id is left out and the target is kept separately.
training_data = combined_filt_data[crossed_columns + ['weight']]
label = combined_filt_data['label']

In [74]:
# Build the training frame without cross features, including the personality features
training_data = personality.copy()
# Separate the target from the features.
label = training_data.pop('label')
# Remove the user id plus singer_id, genre, lang and city_id.
training_data = training_data.drop(columns=['uin', 'singer_id', 'genre', 'lang', 'city_id'])
# Kept for now, but candidates for removal: 'sid', 'gender', 'prov_id', 'city_level'

In [75]:
# Preview the feature frame; 'weight' is still a column at this point.
training_data.head()

Unnamed: 0,age,city_level,gender,prof,prov_id,sid,weight,social,family,friend,...,time,work,achieve,leisure,home,money,relig,death,assent,nonfl
0,4,3.0,2,3,320000,4275,1,6.94,0.52,0.0,...,6.25,0.35,1.22,0.87,1.04,0.0,0.0,0.17,5.73,0.69
1,4,3.0,2,3,320000,4243,1,6.94,0.52,0.0,...,6.25,0.35,1.22,0.87,1.04,0.0,0.0,0.17,5.73,0.69
2,4,3.0,2,3,320000,3181,1,6.94,0.52,0.0,...,6.25,0.35,1.22,0.87,1.04,0.0,0.0,0.17,5.73,0.69
3,4,3.0,2,3,320000,2093,2,6.94,0.52,0.0,...,6.25,0.35,1.22,0.87,1.04,0.0,0.0,0.17,5.73,0.69
4,4,3.0,2,3,320000,1516,1,6.94,0.52,0.0,...,6.25,0.35,1.22,0.87,1.04,0.0,0.0,0.17,5.73,0.69


In [76]:
# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
# NOTE(review): the split is row-wise and 'uin' has already been dropped, so rows
# belonging to one user may land in both train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    training_data, label, test_size=0.2, random_state=10
)

In [77]:
# LightGBM training parameters: binary classification with gradient-boosted trees.
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    # A list instead of a set literal: a set of strings has no stable iteration
    # order between interpreter runs, so the order in which the metrics were
    # passed to LightGBM was not reproducible.
    'metric': ['l1', 'auc'],
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.8,
    'bagging_freq': 3,
    'num_threads': 2,
    'verbose': 50
}

In [78]:
# Keep the per-row training weights before the column is removed from the features.
# Re-running this cell after the column has been deleted raises KeyError.
weight = X_train['weight']

In [79]:
# 'weight' is a sample weight, not a feature: remove it from both splits.
X_train = X_train.drop(columns=['weight'])
X_test = X_test.drop(columns=['weight'])

In [272]:
# Disabled experiment: the code below sits inside a bare string literal, so it is
# never executed. It would have re-mapped the weights from {1, 2} to {1, 3}.
"""#nums = [1.25,1.5,1.75,2,2.25,2.5,3]
weight = np.array(weight)
weight = np.where(weight==1,1,3)"""

In [140]:
# Disabled experiment: a search over the positive-class weight, kept inside a bare
# string literal so it is never executed. The AUC/weight pairs saved below this cell
# come from an earlier run of it.
# NOTE(review): as written it would not run against the current feature frame — `nums`
# is only defined in a commented-out line, and the categorical_feature list names
# columns ('city_id', 'singer_id', 'genre', 'lang') that the active training-data
# cell deletes.
"""max_roc = 0
for num in nums:
    weight = np.where(weight==1,1,num)
    lgb_train = lgb.Dataset(X_train, y_train,categorical_feature=['city_id','gender','prov_id',
                                                              'singer_id','genre','lang','prof'],weight=weight)
    bst = lgb.train(
        params, lgb_train, num_boost_round=500
    )
    y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
    roc = roc_auc_score(y_test, y_pred)
    print(roc, num)
    if roc > max_roc:
        max_roc = roc
        best_weight = num"""



0.8208604435516527 1.25
0.8199125945189467 1.5
0.8213117606326112 1.75
0.8215293935260736 2
0.8200525343800982 2.25
0.8203731310339712 2.5
0.82148137675809 3


In [80]:
# Preview the training features after the 'weight' column has been removed.
X_train.head()

Unnamed: 0,age,city_level,gender,prof,prov_id,sid,social,family,friend,humans,...,time,work,achieve,leisure,home,money,relig,death,assent,nonfl
15156,12,4.0,2,3,230000,2284,15.41,1.5,0.71,4.23,...,3.69,0.91,1.31,0.77,0.28,0.82,0.26,0.26,6.27,0.54
68558,10,4.0,1,4,340000,46,5.43,0.0,0.15,0.9,...,4.68,1.06,1.36,0.6,0.15,0.15,0.45,0.6,4.68,1.51
30497,5,5.0,1,3,530000,245,9.56,1.24,0.31,2.02,...,5.6,2.02,2.1,1.24,0.31,0.23,0.23,0.08,4.98,1.24
78746,6,3.0,2,3,330000,525,8.58,0.47,0.23,2.15,...,4.5,1.52,1.89,0.72,0.2,0.32,0.38,0.15,4.8,0.66
33558,3,3.0,2,3,440000,3232,4.89,0.34,0.23,1.38,...,4.66,1.53,5.19,0.8,0.27,0.27,0.11,0.11,7.87,0.53


In [81]:
# Variant for the crossed-feature frame, kept as an inert string literal (not executed).
"""lgb_train = lgb.Dataset(X_train, y_train,categorical_feature=['singer_id_age','singer_id_gender','singer_id_city_id',
                                    'singer_id_prof','genre_age','genre_gender','genre_city_id',
                                    'genre_prof','lang_age','lang_gender','lang_city_id','lang_prof'],weight=weight)"""

# Wrap the training split for LightGBM, passing the per-row weights and declaring
# which columns are to be treated as categorical.
categorical_columns = ['city_level', 'gender', 'prof', 'prov_id', 'sid']
lgb_train = lgb.Dataset(
    X_train,
    label=y_train,
    weight=weight,
    categorical_feature=categorical_columns,
)

In [82]:
# Train for a fixed 500 boosting rounds; no validation set or early stopping is configured.
bst = lgb.train(params, lgb_train, num_boost_round=500)



In [83]:
# One "name: gain" line per feature, ordered from highest to lowest gain.
ranked_gains = sorted(
    zip(X_train.columns, bst.feature_importance("gain")),
    key=lambda pair: pair[1],
    reverse=True,
)
importance_lines = ["%s: %.2f" % pair for pair in ranked_gains]
print("\n".join(importance_lines))

sid: 322754.62
prov_id: 27053.73
age: 7919.04
social: 4798.37
humans: 4499.14
nonfl: 4417.12
family: 4394.27
leisure: 4364.74
motion: 4329.62
cause: 4193.22
certain: 3981.40
space: 3978.32
hear: 3896.18
excl: 3880.62
ingest: 3879.48
affect: 3860.85
tentat: 3779.82
achieve: 3640.50
percept: 3632.48
insight: 3624.11
bio: 3623.23
assent: 3606.22
money: 3599.84
sad: 3544.51
sexual: 3418.56
posemo: 3377.99
home: 3287.22
relativ: 3276.78
incl: 3257.37
work: 3240.42
relig: 3223.61
discrep: 3214.99
feel: 3196.49
negemo: 3157.29
time: 3133.49
body: 2998.34
friend: 2991.77
see: 2959.62
inhib: 2813.27
cogmech: 2790.49
anger: 2763.86
health: 2710.45
death: 2630.36
anx: 2612.39
prof: 1655.60
gender: 539.13
city_level: 518.39


In [84]:
# ROC AUC on the training rows (an optimistic, in-sample figure).
# NOTE(review): no early stopping was used in training, so best_iteration presumably
# does not restrict the number of trees here — confirm.
train_pred = bst.predict(X_train, num_iteration=bst.best_iteration)
train_auc = roc_auc_score(y_train, train_pred)
print('The roc of prediction is:', train_auc)

The roc of prediction is: 0.9574066883756567


In [85]:
# Threshold the training scores at 0.5 (score < 0.5 -> class 0, otherwise class 1)
# and print per-class precision / recall / F1.
predicted_labels = np.where(train_pred<0.5, 0, 1)
print(classification_report(y_train, predicted_labels))

             precision    recall  f1-score   support

          0       0.95      0.97      0.96    105215
          1       0.80      0.69      0.74     18315

avg / total       0.93      0.93      0.93    123530



In [86]:
# ROC AUC on the held-out rows.
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
test_auc = roc_auc_score(y_test, y_pred)
print('The roc of prediction is:', test_auc)

The roc of prediction is: 0.8441043570429491


In [122]:
# Threshold the held-out scores at 0.5: score < 0.5 -> class 0, otherwise class 1.
predicted_labels = np.where(y_pred<0.5, 0, 1)

In [123]:
# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, predicted_labels))

             precision    recall  f1-score   support

          0       0.89      0.95      0.92     26267
          1       0.56      0.36      0.44      4616

avg / total       0.84      0.86      0.85     30883



In [88]:
# Re-attach the target to the held-out features as the last column, so the test
# set can be re-sampled by class below.
test_data = pd.concat([X_test, y_test],axis=1)

In [90]:
# Split the held-out rows by class so the negatives can be down-sampled.
is_positive = test_data['label'] == 1
is_negative = test_data['label'] == 0
test_data_1 = test_data[is_positive]
test_data_0 = test_data[is_negative]

In [91]:
# Number of positive (label == 1) held-out rows and the column count.
test_data_1.shape

(4616, 21)

In [92]:
# Number of negative (label == 0) held-out rows and the column count.
test_data_0.shape

(26267, 21)

In [93]:
# Down-sample the negatives to 5000 rows for a roughly balanced evaluation set.
# Seeded (same seed as the train/test split) so the subset, and the metrics
# computed on it, are reproducible on a fresh run.
test_data_select = test_data_0.sample(n=5000, random_state=10)

In [94]:
# Balanced evaluation set: all positives followed by the sampled negatives.
test_data = pd.concat([test_data_1,test_data_select])

In [95]:
# Row and column count of the balanced evaluation set.
test_data.shape

(9616, 21)

In [96]:
# Shuffle the rows so positives and negatives are interleaved.
# Seeded (same seed as the train/test split) so the row order is reproducible.
test_data = test_data.sample(frac=1, random_state=10)

In [101]:
# Split the balanced frame back into features and target by position.
# This relies on the target being the last column, which holds because it was
# appended last when the frame was built with pd.concat([X_test, y_test], axis=1).
X_test = test_data.iloc[:,:-1]
y_test = test_data.iloc[:,-1]

Unnamed: 0,age,city_id,genre,prof,singer_id,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14
3583,11,440100,13,6,19716,3.298552,-0.684922,-0.716829,-0.258547,-0.666272,-0.539948,0.237366,-0.210333,-0.600441,0.445452,1.066438,-0.009677,-0.393473,0.613722,-0.005784
122366,10,441200,13,41,143,-3.380459,0.993991,-0.547664,0.906417,0.725162,-0.517660,0.388247,-1.025798,-0.339080,-0.627372,0.750035,0.409437,0.285888,-0.210766,0.169868
105950,8,210700,13,3,35148,-4.399628,-2.216779,-6.326351,-0.751023,-2.684448,-0.082673,-0.258196,-3.699046,1.123012,-0.352663,-1.662055,0.391667,0.859608,-0.103186,0.799759
30481,3,445100,13,3,199515,-0.815888,1.431640,-1.466319,2.702622,2.040599,1.513200,-0.038237,-0.584392,-1.970420,-0.169505,-1.273290,-0.381776,-0.271037,-0.201025,0.934328
30371,10,460100,13,3,91758,0.279126,-1.440129,3.483306,3.406119,0.500856,1.258572,-2.123400,-0.923099,0.895969,1.210036,0.271796,-2.622169,-0.259659,-0.040840,0.175451
7077,5,330700,18,3,6275,2.719739,-1.913837,1.127086,-0.898331,0.369197,1.893660,-0.033155,0.197143,0.460714,-0.978164,0.514097,1.346538,-0.670238,0.303700,-0.386730
144970,11,330700,13,3,159,-5.189964,2.092950,-1.178454,0.672258,-0.719962,-0.255055,-0.911921,0.530664,0.324909,-0.240846,0.072339,0.256776,-0.209096,0.046033,0.097787
29113,10,310000,13,3,91758,1.286536,-1.779981,-0.436376,-1.217891,-1.474337,0.859198,-1.424648,-0.785581,0.268083,0.887120,-0.209368,-0.620920,1.392861,0.508517,0.761923
40,8,110000,13,3,4558,-4.681552,0.128495,-3.319232,3.197277,0.491770,2.497163,-1.125810,-0.739740,-1.887910,1.975167,-0.382667,-0.336279,-0.435429,-0.055929,-0.106791
138691,11,110000,13,5,4685,-1.082507,2.264280,0.076046,1.160338,-2.836617,-0.868789,-0.059886,-1.240410,0.871325,-1.095959,-0.186839,-1.114455,-0.058228,1.540489,1.280158


In [103]:
# ROC AUC on the re-balanced evaluation set.
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
balanced_auc = roc_auc_score(y_test, y_pred)
print('The roc of prediction is:', balanced_auc)

The roc of prediction is: 0.8437491117850954


In [105]:
# Threshold the scores at 0.5 (score < 0.5 -> class 0, otherwise class 1) and print
# per-class precision / recall / F1 for the re-balanced evaluation set.
predicted_labels = np.where(y_pred<0.5, 0, 1)
print(classification_report(y_test, predicted_labels))

             precision    recall  f1-score   support

          0       0.64      0.95      0.76      5000
          1       0.88      0.41      0.56      4616

avg / total       0.75      0.69      0.66      9616



In [108]:
# Number of distinct users (uin) that remain after the merge with the personality data.
len(personality.uin.unique())

14126