In [1]:
import os
import json
import argparse
import numpy as np
import pandas as pd
from collections import Counter
import pickle

from scipy.stats import skew, kurtosis
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation

In [2]:
# Load the raw input tables from ./data. The Chinese file names translate roughly to
# "account static info", "account transaction info" and "training-set labels".
account_static = pd.read_csv('./data/账户静态信息.csv')
account_trade = pd.read_csv('./data/账户交易信息.csv')
y_train = pd.read_csv('./data/训练集标签.csv')
# NOTE(review): despite the y_ prefix this reads test_dataset.csv; its columns are not visible here.
y_test = pd.read_csv('./data/test_dataset.csv')

In [3]:
# Bucket the transaction amount (jyje) into 10 quantile bins coded 0..9 and store the
# bin code as a plain integer column.
jyje_decile = pd.qcut(account_trade['jyje'], q=10, labels=range(10))
account_trade['jyje_label'] = jyje_decile.astype(int)

In [4]:
# Fold the direction flag (jdbj) into the amount bucket as a sign: rows with jdbj == 0
# get a negative label, every other row a positive one.
# The bucket is shifted from 0..9 to 1..10 before signing; without the shift bucket 0
# stays 0 after multiplying by -1, so both directions collapse into the same label for
# the lowest decile.
direction_sign = account_trade['jdbj'].apply(lambda x: -1 if x == 0 else 1)
account_trade['jyje_label'] = (account_trade['jyje_label'] + 1) * direction_sign

In [5]:
# Store the label as text so it can be used as a token when `col_name` selects this column.
account_trade['jyje_label'] = account_trade['jyje_label'].astype(str)

In [6]:
# Concatenate the jyrq and jysj columns into one "<jyrq> <jysj>" string used for ordering.
# NOTE(review): 'dt' stays a string here, so later sorting is lexicographic — this is only
# chronological if both parts are zero-padded, ISO-like values; confirm against the data.
account_trade['dt'] = account_trade['jyrq'] + ' ' + account_trade['jysj']

In [7]:
# Column of account_trade whose per-account value sequence is embedded below; it is also
# used as the prefix of the generated feature names and of the output pickle file names.
col_name = 'dfhh'

In [8]:
# Output dimensionality of each feature family.
w2v_size = 16    # Word2Vec vector_size
tfidf_size = 8   # TruncatedSVD components kept after TF-IDF
cnt_size = 8     # TruncatedSVD components kept after raw counts

In [9]:
def get_w2v_mean(sentences):
    """Mean-pool the Word2Vec vectors of the tokens in one whitespace-separated string.

    Uses the module-level ``w2v_model``. Tokens missing from the model vocabulary are
    skipped.

    Args:
        sentences: a single string of space-separated tokens.

    Returns:
        A one-element list. The element is the mean vector of the known tokens, or a
        list of ``w2v_model.vector_size`` zeros when no token is in the vocabulary.
    """
    known_vectors = [w2v_model.wv[token] for token in sentences.split() if token in w2v_model.wv]
    if known_vectors:
        pooled = np.mean(known_vectors, axis=0)
    else:
        pooled = [0] * w2v_model.vector_size
    return [pooled]

In [10]:
def get_tfidf_svd(sentence, n_components=tfidf_size):
    """Project documents into the fitted TF-IDF + SVD space.

    Relies on the module-level fitted ``tfv`` vectorizer and ``tfidf_svd`` reducer.

    Args:
        sentence: passed straight to ``tfv.transform``, so it must be an iterable of
            documents (each element is vectorised as its own document).
        n_components: not used by the body; kept for signature compatibility.

    Returns:
        Whatever ``tfidf_svd.transform`` returns for the vectorised input (one row per
        input document).
    """
    return tfidf_svd.transform(tfv.transform(sentence))

In [11]:
def get_cnt_svd(sentence, n_components=cnt_size):
    """Project documents into the fitted count-vector + SVD space.

    Relies on the module-level fitted ``cv`` vectorizer and ``cnt_svd`` reducer.

    Args:
        sentence: passed straight to ``cv.transform``, so it must be an iterable of
            documents (each element is vectorised as its own document).
        n_components: not used by the body; kept for signature compatibility.

    Returns:
        Whatever ``cnt_svd.transform`` returns for the vectorised input (one row per
        input document).
    """
    return cnt_svd.transform(cv.transform(sentence))

In [12]:
def _tokens_in_dt_order(group):
    """Join one account's `col_name` values into a space-separated string, ordered by 'dt'."""
    ordered = group.sort_values(by='dt')
    return ' '.join(ordered[col_name].values)


# One "sentence" per account (zhdh): the sequence of `col_name` values sorted by 'dt'.
tmp = account_trade.groupby('zhdh').apply(_tokens_in_dt_order)

In [13]:
# Array of per-account sentence strings, in the same order as tmp.index.
sentences_list = tmp.values

In [14]:
# Tokenised corpus for Word2Vec: one list of tokens per account.
sentences = [doc.split() for doc in tmp.values]

# Word2Vec on the token sequences; single worker plus fixed seed.
w2v_model = Word2Vec(
    sentences,
    vector_size=w2v_size,
    window=2,
    min_count=2,
    sg=0,
    hs=1,
    workers=1,
    seed=2022,
)

# TF-IDF over 1- to 3-grams, reduced to `tfidf_size` components with truncated SVD.
tfv = TfidfVectorizer(ngram_range=(1,3), min_df=5, max_features=50000).fit(sentences_list)
X_tfidf = tfv.transform(sentences_list)
tfidf_svd = TruncatedSVD(n_components=tfidf_size, random_state=42).fit(X_tfidf)

# Raw token counts (custom token pattern), reduced to `cnt_size` components with truncated SVD.
cv = CountVectorizer(analyzer='word',token_pattern=u"(?u)\\b\\w+\\b").fit(sentences_list)
X_cv = cv.transform(sentences_list)
cnt_svd = TruncatedSVD(n_components=cnt_size, random_state=42).fit(X_cv)

# Last expression of the cell: show the fitted reducer.
cnt_svd

TruncatedSVD(n_components=8, random_state=42)

In [15]:
# Mean-pooled Word2Vec features, one row per account, labelled with the account ids
# from tmp.index. The original built the frame twice and threw the first copy away;
# it is built once here.
data = [get_w2v_mean(sentence)[0] for sentence in sentences_list]
w2v_fea = pd.DataFrame(data, index=tmp.index, columns=[f'{col_name}_w2v_fea{i}' for i in range(w2v_size)])
w2v_fea = w2v_fea.reset_index()

In [16]:
# TF-IDF + SVD features, one row per account.
# The vectorizer expects an iterable of documents. The previous loop passed
# `sentences[i]` (a list of tokens), so every token was vectorised as a separate
# document and `[0]` kept only the first token's projection. Transform the full
# per-account strings in one call instead.
data = get_tfidf_svd(sentences_list)
tfidf_fea = pd.DataFrame(data, index=tmp.index,
                         columns=[f'{col_name}_tfidf_fea{i}' for i in range(tfidf_size)])
tfidf_fea = tfidf_fea.reset_index()

In [17]:
# Count-vector + SVD features, one row per account.
# The vectorizer expects an iterable of documents. The previous loop passed
# `sentences[i]` (a list of tokens), so every token was vectorised as a separate
# document and `[0]` kept only the first token's projection. Transform the full
# per-account strings in one call instead.
data = get_cnt_svd(sentences_list)
cnt_fea = pd.DataFrame(data, index=tmp.index,
                       columns=[f'{col_name}_cnt_fea{i}' for i in range(cnt_size)])
cnt_fea = cnt_fea.reset_index()

In [18]:
# Persist the Word2Vec feature frame to the working directory.
w2v_path = f'w2v_fea_{col_name}.pkl'
with open(w2v_path, 'wb') as fh:
    pickle.dump(w2v_fea, fh)

In [19]:
# Persist the TF-IDF/SVD feature frame to the working directory.
tfidf_path = f'tfidf_fea_{col_name}.pkl'
with open(tfidf_path, 'wb') as fh:
    pickle.dump(tfidf_fea, fh)

In [20]:
# Persist the count/SVD feature frame to the working directory.
cnt_path = f'cnt_fea_{col_name}.pkl'
with open(cnt_path, 'wb') as fh:
    pickle.dump(cnt_fea, fh)

In [1]:
import numpy as np
import pandas as pd

In [55]:
# Load the raw input tables from ./data. The Chinese file names translate roughly to
# "account static info", "account transaction info" and "training-set labels".
account_static = pd.read_csv('./data/账户静态信息.csv')
account_trade = pd.read_csv('./data/账户交易信息.csv')
y_train = pd.read_csv('./data/训练集标签.csv')
# NOTE(review): despite the y_ prefix this reads test_dataset.csv; its columns are not visible here.
y_test = pd.read_csv('./data/test_dataset.csv')

In [56]:
# Total amount (jyje) per account (zhdh), one column per value of the direction flag jdbj.
# An account with no transactions in one direction has no row for that (zhdh, jdbj)
# pair, so the pivot yields NaN there; a missing direction means a total of 0, so fill it.
# Otherwise 'in_out_diff' is NaN for every one-sided account.
tmp = account_trade.groupby(['zhdh', 'jdbj'])['jyje'].sum().reset_index()\
    .pivot(index='zhdh', columns=['jdbj'], values=['jyje'])\
    .fillna(0)
# Assumes jdbj has exactly two distinct values that sort as 0, 1 (raises otherwise).
tmp.columns = ['jdbj_0', 'jdbj_1']
tmp = tmp.reset_index()
tmp['in_out_diff'] = (tmp['jdbj_0'] - tmp['jdbj_1'])
# Division by a zero total gives +/-inf; map that to NaN so the ratio is simply
# "undefined" for accounts without any jdbj == 1 amount.
tmp['in_out_ratio'] = (tmp['jdbj_0'] / tmp['jdbj_1']).replace([np.inf, -np.inf], np.nan)

In [57]:
# Amount summed per account and jyqd value, spread into one "<jyqd>_sum" column per value.
jyqd_sums = account_trade.groupby(['zhdh', 'jyqd'])['jyje'].sum().reset_index()
tmp = jyqd_sums.pivot(index='zhdh', columns=['jyqd'], values=['jyje'])
jyqd_cols = [f'{c[1]}_sum' for c in tmp.columns]
tmp.columns = jyqd_cols

# Attach each account's overall amount so every per-jyqd sum can be expressed as a share.
account_total = account_trade.groupby(['zhdh'])['jyje'].sum().reset_index()
tmp = tmp.reset_index().merge(account_total, how='left', on='zhdh')
for col in jyqd_cols:
    ratio_name = col.split('_')[0]+'_ratio'
    tmp[ratio_name] = tmp[col] / tmp['jyje']

# The helper total is only needed for the division above.
tmp = tmp.drop(columns=['jyje'])

In [32]:
# Display the current `tmp` frame.
# NOTE(review): the saved output below has execution count 32 and shows no *_ratio
# columns, so it appears to predate the cell above — re-run to refresh.
tmp

Unnamed: 0_level_0,091D584F_sum,274AD478_sum,38B3EFF8_sum,3B8A6142_sum,621461AF_sum,6974CE5A_sum,6F3EF77A_sum,757B505C_sum,7EABE3A1_sum,854D6FAE_sum,...,979D472A_sum,AA169B49_sum,B6A1085A_sum,B706835D_sum,C8FBBC86_sum,E205EE2A_sum,E96ED478_sum,EAE27D77_sum,F47D0AD3_sum,F57A2F55_sum
zhdh,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00037295453A928A,69309.37,,,,6768.00,,,156766.73,,,...,,,,,,,262489.20,,8107.17,
0004CC075464D54B,116516.00,,,,128400.00,,,28746.00,,,...,,,,,,,37489.50,,,
000AA77144DC1BCC,6208.00,31000.0,,,7201.00,,,62218.93,,,...,,,,2317.65,,,114778.37,,8000.00,
001ABBF3373AFC5B,,,,,8778.00,,,22413.00,,,...,,,,,,,3524.00,,,
00310769938BC172,2575795.00,89230.0,,,846437.50,,,,,40000.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FFC065BECA5E133C,708739.00,129087.8,,,76371.00,,,521325.13,,26400.0,...,,,,,,,,,,
FFD2E9065F19B38D,,,,,7288.69,,,11816.12,,17300.0,...,,,,,,,18644.67,,,
FFD48BA98FA2D299,,,,,18000.00,,,2368.36,,4196.0,...,,,,,,,18754.69,,,
FFD8F2A4DB42AC6D,545902.58,,,,488799.58,,,7631.65,,,...,,,,,,,6901.10,,,


In [28]:
col = 'jyqd'

# Attach the training label to every transaction once (the original repeated this merge).
# Accounts absent from y_train get a NaN black_flag and drop out of the groupbys below.
labelled_trade = account_trade.merge(y_train, how='left', on='zhdh')

# p(col | flag): share of each `col` value among all transactions with a given black_flag.
tmp = labelled_trade.groupby(['black_flag', col])\
    .size().reset_index().rename({0: f'flag_{col}_cnt'},axis=1)
tmp2 = labelled_trade.groupby('black_flag').size()\
    .reset_index().rename({0:'flag_cnt'},axis=1)
tmp = tmp.merge(tmp2, how='left', on='black_flag')
tmp[f'p({col}|flag)'] = tmp[f'flag_{col}_cnt'] / tmp['flag_cnt']
# NOTE(review): these probabilities are estimated from the labels of the same accounts
# they are later attached to — check for target leakage before using them in training.

# Per-account statistics of p(col | flag = i). Previously `tmp3` was overwritten on
# each pass, so only the flag = 1 features survived, and the lookup used a hard-coded
# 'jyqd' instead of `col`. `tmp3` still ends as the flag = 1 frame; `flag_prob_fea`
# holds both flags side by side.
flag_prob_fea = None
for i in range(2):
    tmp3 = account_trade.merge(tmp[tmp['black_flag'] == i][[col, f'p({col}|flag)']], how='left', on=col)\
        .groupby('zhdh').agg({f'p({col}|flag)': ['mean', 'sum', 'std', 'max']})
    # Column suffix kept as "lag=" (sic) so existing feature names do not change.
    tmp3.columns = [f'{c[0]}_{c[1]}_lag={i}' for c in tmp3.columns]
    tmp3 = tmp3.reset_index()
    if flag_prob_fea is None:
        flag_prob_fea = tmp3
    else:
        flag_prob_fea = flag_prob_fea.merge(tmp3, how='outer', on='zhdh')
    

In [34]:
# Display `tmp3` as left by the loop above (the frame from its final iteration).
tmp3

Unnamed: 0,zhdh,p(jyqd|flag)_mean_lag=1,p(jyqd|flag)_sum_lag=1,p(jyqd|flag)_std_lag=1,p(jyqd|flag)_max_lag=1
0,00037295453A928A,0.247635,36.650025,0.099680,0.346291
1,0004CC075464D54B,0.227673,8.423919,0.101161,0.346291
2,000AA77144DC1BCC,0.283860,209.488888,0.091771,0.346291
3,001ABBF3373AFC5B,0.212148,6.576601,0.094547,0.346291
4,00310769938BC172,0.178328,6.954798,0.086726,0.287816
...,...,...,...,...,...
5995,FFC065BECA5E133C,0.061985,19.587252,0.088097,0.287816
5996,FFD2E9065F19B38D,0.289902,28.990175,0.094573,0.346291
5997,FFD48BA98FA2D299,0.241036,21.211204,0.085522,0.346291
5998,FFD8F2A4DB42AC6D,0.280045,351.176346,0.033464,0.346291


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw input tables from ./data. The Chinese file names translate roughly to
# "account static info", "account transaction info" and "training-set labels".
account_static = pd.read_csv('./data/账户静态信息.csv')
account_trade = pd.read_csv('./data/账户交易信息.csv')
y_train = pd.read_csv('./data/训练集标签.csv')
# NOTE(review): despite the y_ prefix this reads test_dataset.csv; its columns are not visible here.
y_test = pd.read_csv('./data/test_dataset.csv')

In [3]:
# Concatenate the jyrq and jysj columns into one "<jyrq> <jysj>" string.
account_trade['dt'] = account_trade['jyrq'] + ' ' + account_trade['jysj']

In [4]:
# Parse the combined string into a datetime column so the .dt accessors below work.
# No explicit format is given, so pandas infers it from the data.
account_trade['dt'] = pd.to_datetime(account_trade['dt'])

In [5]:
# Bucket the transaction amount (jyje) into 10 quantile bins labelled 0..9.
# The result is kept as returned by pd.qcut (no cast to int here).
account_trade['jyje_label'] = pd.qcut(account_trade['jyje'], 10, labels=range(10))

In [6]:
# Calendar components of the transaction timestamp.
account_trade['month'] = account_trade['dt'].dt.month
account_trade['day'] = account_trade['dt'].dt.day
# `.dt.weekofyear` is deprecated (it emitted the warning captured below this cell) and
# was removed in pandas 2.0; `.dt.isocalendar().week` is the documented replacement.
# It returns a nullable UInt32, so cast back to a plain int column as before.
account_trade['weekofyear'] = account_trade['dt'].dt.isocalendar().week.astype(int)
account_trade['dayofweek'] = account_trade['dt'].dt.dayofweek

  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Binary calendar flags.
# dayofweek is Monday=0 .. Sunday=6. The previous `dayofweek // 6` was 1 only for
# Sunday, so Saturdays were never flagged as weekend; compare against 5 instead.
account_trade['is_wknd'] = (account_trade['dt'].dt.dayofweek >= 5).astype(int)
account_trade['is_month_start'] = account_trade['dt'].dt.is_month_start.astype(int)
account_trade['is_month_end'] = account_trade['dt'].dt.is_month_end.astype(int)

In [8]:
def _clock_field(position):
    """Return a parser that extracts the integer at `position` of a ':'-separated time string."""
    def _parse(time_text):
        return int(time_text.split(':')[position])
    return _parse


# Hour and minute parsed from the raw jysj time string (first and second ':' fields).
account_trade['hour'] = account_trade['jysj'].apply(_clock_field(0))
account_trade['minu'] = account_trade['jysj'].apply(_clock_field(1))

In [13]:
cols = ['dfzh', 'dfhh', 'jyqd', 'zydh', 'jyje_label', 'month', 'day', 'weekofyear', 'dayofweek', 'hour', 'minu']

# For every listed column, attach "<col>_ratio": the share of all transactions that
# carry the same value in that column. The left merge keeps one row per transaction,
# so the row count used as denominator is the same on every pass.
n_rows = account_trade.shape[0]
for c in cols:
    value_share = account_trade.groupby([c]).size() / n_rows
    tmp = value_share.reset_index().rename(columns={0: f'{c}_ratio'})
    account_trade = account_trade.merge(tmp, how='left', on=c)

In [19]:
# Aggregation spec: five summary statistics for every "<col>_ratio" column.
agg_func = {
    f'{c}_ratio': ['sum','mean','max','min','std']
    for c in cols
}


Unnamed: 0_level_0,dfzh_ratio,dfzh_ratio,dfzh_ratio,dfzh_ratio,dfzh_ratio,dfhh_ratio,dfhh_ratio,dfhh_ratio,dfhh_ratio,dfhh_ratio,...,hour_ratio,hour_ratio,hour_ratio,hour_ratio,hour_ratio,minu_ratio,minu_ratio,minu_ratio,minu_ratio,minu_ratio
Unnamed: 0_level_1,sum,mean,max,min,std,sum,mean,max,min,std,...,sum,mean,max,min,std,sum,mean,max,min,std
zhdh,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
00037295453A928A,7.761139,0.052440,0.123556,0.000001,0.047580,24.312500,0.164274,0.318091,0.000034,0.110545,...,5.513236,0.037252,0.064159,0.006471,0.019763,2.473613,0.016714,0.018022,0.015619,0.000469
0004CC075464D54B,0.490556,0.013258,0.068590,0.000001,0.020754,4.307805,0.116427,0.318091,0.000005,0.129779,...,2.086634,0.056396,0.064829,0.010445,0.015702,0.619942,0.016755,0.017687,0.015619,0.000467
000AA77144DC1BCC,64.086154,0.086838,0.123556,0.000001,0.047768,104.667120,0.141825,0.318091,0.000009,0.107131,...,41.119932,0.055718,0.064829,0.006679,0.012282,12.307415,0.016677,0.018022,0.015586,0.000513
001ABBF3373AFC5B,1.441768,0.046509,0.102968,0.000001,0.052083,2.845821,0.091801,0.161042,0.000026,0.078175,...,1.578410,0.050916,0.064829,0.010445,0.017365,0.515485,0.016629,0.017282,0.015619,0.000403
00310769938BC172,0.412085,0.010566,0.102968,0.000001,0.031646,0.390112,0.010003,0.031356,0.000021,0.010581,...,2.281549,0.058501,0.064829,0.039579,0.007967,0.648150,0.016619,0.017792,0.015586,0.000458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FFC065BECA5E133C,3.255200,0.010301,0.102968,0.000001,0.030612,9.237306,0.029232,0.161042,0.000009,0.059085,...,18.850545,0.059654,0.064829,0.010445,0.007578,5.349689,0.016929,0.018022,0.015616,0.000543
FFD2E9065F19B38D,6.435568,0.064356,0.123556,0.000001,0.034781,21.355640,0.213556,0.318091,0.000034,0.127682,...,5.299259,0.052993,0.064829,0.017927,0.013499,1.663293,0.016633,0.017620,0.015616,0.000433
FFD48BA98FA2D299,7.309078,0.083058,0.123556,0.000001,0.047750,13.320394,0.151368,0.318091,0.000102,0.077032,...,5.242136,0.059570,0.064829,0.026445,0.006173,1.468344,0.016686,0.018022,0.015586,0.000469
FFD8F2A4DB42AC6D,0.106606,0.000085,0.031052,0.000001,0.001470,34.055679,0.027158,0.318091,0.000001,0.027275,...,50.729745,0.040454,0.064159,0.006471,0.022159,20.920309,0.016683,0.018022,0.015586,0.000526


In [20]:
# Per-account (zhdh) statistics of the ratio columns; the result has two-level columns
# (ratio column, statistic).
tmp = account_trade.groupby('zhdh').agg(agg_func)

In [21]:
# Flatten the two-level column index into "<ratio column>_<statistic>" names.
tmp.columns = ['_'.join(level_pair) for level_pair in tmp.columns]

In [22]:
# Display the flattened per-account ratio statistics.
tmp

Unnamed: 0_level_0,dfzh_ratio_sum,dfzh_ratio_mean,dfzh_ratio_max,dfzh_ratio_min,dfzh_ratio_std,dfhh_ratio_sum,dfhh_ratio_mean,dfhh_ratio_max,dfhh_ratio_min,dfhh_ratio_std,...,hour_ratio_sum,hour_ratio_mean,hour_ratio_max,hour_ratio_min,hour_ratio_std,minu_ratio_sum,minu_ratio_mean,minu_ratio_max,minu_ratio_min,minu_ratio_std
zhdh,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00037295453A928A,7.761139,0.052440,0.123556,0.000001,0.047580,24.312500,0.164274,0.318091,0.000034,0.110545,...,5.513236,0.037252,0.064159,0.006471,0.019763,2.473613,0.016714,0.018022,0.015619,0.000469
0004CC075464D54B,0.490556,0.013258,0.068590,0.000001,0.020754,4.307805,0.116427,0.318091,0.000005,0.129779,...,2.086634,0.056396,0.064829,0.010445,0.015702,0.619942,0.016755,0.017687,0.015619,0.000467
000AA77144DC1BCC,64.086154,0.086838,0.123556,0.000001,0.047768,104.667120,0.141825,0.318091,0.000009,0.107131,...,41.119932,0.055718,0.064829,0.006679,0.012282,12.307415,0.016677,0.018022,0.015586,0.000513
001ABBF3373AFC5B,1.441768,0.046509,0.102968,0.000001,0.052083,2.845821,0.091801,0.161042,0.000026,0.078175,...,1.578410,0.050916,0.064829,0.010445,0.017365,0.515485,0.016629,0.017282,0.015619,0.000403
00310769938BC172,0.412085,0.010566,0.102968,0.000001,0.031646,0.390112,0.010003,0.031356,0.000021,0.010581,...,2.281549,0.058501,0.064829,0.039579,0.007967,0.648150,0.016619,0.017792,0.015586,0.000458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FFC065BECA5E133C,3.255200,0.010301,0.102968,0.000001,0.030612,9.237306,0.029232,0.161042,0.000009,0.059085,...,18.850545,0.059654,0.064829,0.010445,0.007578,5.349689,0.016929,0.018022,0.015616,0.000543
FFD2E9065F19B38D,6.435568,0.064356,0.123556,0.000001,0.034781,21.355640,0.213556,0.318091,0.000034,0.127682,...,5.299259,0.052993,0.064829,0.017927,0.013499,1.663293,0.016633,0.017620,0.015616,0.000433
FFD48BA98FA2D299,7.309078,0.083058,0.123556,0.000001,0.047750,13.320394,0.151368,0.318091,0.000102,0.077032,...,5.242136,0.059570,0.064829,0.026445,0.006173,1.468344,0.016686,0.018022,0.015586,0.000469
FFD8F2A4DB42AC6D,0.106606,0.000085,0.031052,0.000001,0.001470,34.055679,0.027158,0.318091,0.000001,0.027275,...,50.729745,0.040454,0.064159,0.006471,0.022159,20.920309,0.016683,0.018022,0.015586,0.000526


In [12]:
# List the columns currently present on account_trade.
account_trade.columns

Index(['jylsxh', 'zhdh', 'dfzh', 'jdbj', 'jyje', 'zhye', 'dfhh', 'jyrq',
       'jysj', 'jyqd', 'zydh', 'dfmccd', 'dt', 'jyje_label', 'month', 'day',
       'weekofyear', 'dayofweek', 'is_wknd', 'is_month_start', 'is_month_end',
       'hour', 'minu'],
      dtype='object')

In [18]:
# Most frequent value of each column per account (zhdh).
# Two fixes:
#   * after the ascending sort by count the most frequent value is the LAST row; the
#     previous `iloc[0]` returned the least frequent one under a "most_" name;
#   * `tmp` was overwritten on every pass, so only the last column's result survived.
#     `tmp` still ends as the last column's frame; `most_fea` collects all of them.
most_fea = None
for col in ['dfzh', 'dfhh', 'jyqd', 'zydh', 'jyje_label',
            'month', 'day', 'weekofyear', 'dayofweek', 'is_wknd',
            'is_month_start', 'is_month_end', 'hour', 'minu']:
    tmp = account_trade.groupby(['zhdh', col]).size().reset_index().groupby('zhdh')\
        .apply(lambda x: x.sort_values(by=0).iloc[-1][col])\
        .reset_index().rename(columns={0: f'most_{col}'})
    most_fea = tmp if most_fea is None else most_fea.merge(tmp, how='left', on='zhdh')

In [29]:
# For each column, the value that accounts for the largest total amount (jyje) per
# account (zhdh): sum jyje per (zhdh, value), sort ascending, take the last row.
# `tmp` was overwritten on every pass, so only the last column's result survived.
# `tmp` still ends as the last column's frame; `most_jyje_fea` collects all of them.
most_jyje_fea = None
for col in ['dfzh', 'dfhh', 'jyqd', 'zydh', 'jyje_label',
            'month', 'day', 'weekofyear', 'dayofweek', 'is_wknd',
            'is_month_start', 'is_month_end', 'hour', 'minu']:
    tmp = account_trade.groupby(['zhdh', col])['jyje'].sum().reset_index().groupby('zhdh')\
            .apply(lambda x: x.sort_values(by='jyje').iloc[-1][col])\
            .reset_index().rename(columns={0: f'most_jyje_{col}'})
    if most_jyje_fea is None:
        most_jyje_fea = tmp
    else:
        most_jyje_fea = most_jyje_fea.merge(tmp, how='left', on='zhdh')

In [30]:
# Display `tmp` as left by the loop above (the frame for the last column processed).
tmp

Unnamed: 0,zhdh,most_jyje_minu
0,00037295453A928A,15
1,0004CC075464D54B,28
2,000AA77144DC1BCC,32
3,001ABBF3373AFC5B,7
4,00310769938BC172,12
...,...,...
5995,FFC065BECA5E133C,24
5996,FFD2E9065F19B38D,28
5997,FFD48BA98FA2D299,53
5998,FFD8F2A4DB42AC6D,30
