### 任务目标：提取最合适的风控模型特征
- 竞赛：http://openresearch.rong360.com/#/

In [4]:
import pandas as pd
import numpy as np
import gc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from multiprocessing import Pool
import time
import pickle
import warnings
warnings.filterwarnings('ignore')

所有样本ID都存在以下3个文件中,一共28959个用户

In [5]:
# Read the three tab-separated id files, then stack them into one frame of all sample ids.
train, valid, test = (
    pd.read_csv(path, delimiter="\t")
    for path in ("data/sample_train.txt", "data/valid_id.txt", "data/test_id.txt")
)
df = pd.concat([train, test, valid], axis=0)

In [6]:
df.head()

Unnamed: 0,id,label
0,25478619,1.0
1,25871453,0.0
2,43982508,0.0
3,4452511,0.0
4,6271969,0.0


In [7]:
df.shape

(28959, 2)

### 构建网络图

In [8]:
# Maximum number of edge rows to process. Kept at the original 10000 for quick runs;
# set to None to process the full edge file.
EDGE_ROW_LIMIT = 10000

edge = pd.read_csv("data/dat_edge.txt", delimiter="\t")
edge = edge.iloc[:EDGE_ROW_LIMIT]
edge.head()

Unnamed: 0,from_id,to_id,info
0,10000019,23264041,2017-12:1_11
1,1000010,29753962,2017-12:1_27
2,10000189,15381095,2017-12:1_5
3,10000223,36347822,2017-11:1_24
4,1000023,17857485,2018-01:1_11


In [9]:
# Accumulators for the exploded edge records (one entry per month of each edge).
from_id, to_id, dates, nums, weights = [], [], [], [], []

In [10]:
# Explode each edge's `info` string ("date:num_weight,date:num_weight,...") into one
# record per entry, appending to the accumulator lists.
# The columns are read by name: attribute access (`row.info`) resolves to the
# `Series.info` method on recent pandas instead of the column value.
for src, dst, info in zip(edge["from_id"], edge["to_id"], edge["info"]):
    for t in info.split(","):
        from_id.append(src)
        to_id.append(dst)

        date, nums_weight = t.split(":")
        num, weight = nums_weight.split("_")

        dates.append(date)
        nums.append(num)
        weights.append(weight)

In [11]:
# Assemble the exploded records into a frame and persist it for later cells.
graph = pd.DataFrame(
    dict(from_id=from_id, to_id=to_id, date=dates, num=nums, weight=weights)
)
graph.to_csv("data/graph", index=False)
graph.head()

Unnamed: 0,from_id,to_id,date,num,weight
0,10000019,23264041,2017-12,1,11
1,1000010,29753962,2017-12,1,27
2,10000189,15381095,2017-12,1,5
3,10000223,36347822,2017-11,1,24
4,1000023,17857485,2018-01,1,11


In [12]:
# The weights were parsed out of text above, so cast them to integers before arithmetic.
graph['weight'] = graph['weight'].astype(int)

对权重做加一平滑处理

In [13]:
# Add-one smoothing on the edge weight, then keep the largest weight per (from_id, to_id).
# Note: re-running this cell adds one to the weights again.
graph["weight"] = graph["weight"] + 1
a = (
    graph
    .groupby(["from_id", "to_id"])["weight"]
    .max()
    .reset_index()
)
a.head()

Unnamed: 0,from_id,to_id,weight
0,10072,4575965,10
1,10119,3615703,33
2,10149,13533889,19
3,10152,11902625,16
4,10152,29295999,56


构建向量时，需要每一个边（from_id,to_id）
- 参考：https://github.com/phanein/deepwalk

In [15]:
# Write one "from_id to_id" line per distinct edge; the weight is left out of this file.
with open("data/graph_for_emb.txt", "w") as f:
    f.writelines(
        "%d %d\n" % (src, dst)
        for src, dst in zip(a["from_id"], a["to_id"])
    )

只保留至少有一端ID（from_id 或 to_id）出现在df中的边

In [16]:
# Keep edges where at least one endpoint is a sample id, and save them.
touches_sample = graph["from_id"].isin(df["id"]) | graph["to_id"].isin(df["id"])
graph_filter = graph.loc[touches_sample]
graph_filter.to_csv("data/graph_filter.csv", index=False)

### 得到网络图中各项特征

In [17]:
import networkx as nx
import pickle

In [18]:
# Reload the saved edge records and total the weight per (from_id, to_id).
graph = pd.read_csv("data/graph")
a = (
    graph
    .groupby(["from_id", "to_id"])["weight"]
    .sum()
    .reset_index()
)
a.head()

Unnamed: 0,from_id,to_id,weight
0,10072,4575965,9
1,10119,3615703,32
2,10149,13533889,18
3,10152,11902625,15
4,10152,29295999,55


In [19]:
# Write one "from_id to_id weight" line per distinct edge.
with open("data/graph_for_pagerank.txt", "w") as f:
    f.writelines(
        "%d %d %d\n" % (src, dst, w)
        for src, dst, w in zip(a["from_id"], a["to_id"], a["weight"])
    )

In [20]:
# Empty directed graph; its edges are added from the weighted edge file in the next cell.
G = nx.DiGraph()

In [21]:
# Each line is "from_id to_id weight"; add it to G as a weighted directed edge.
with open("data/graph_for_pagerank.txt", "r") as f:
    for line in f:
        src, dst, w = line.split()
        G.add_edge(int(src), int(dst), weight=int(w))

得到ID点权重特征

In [22]:
# PageRank score for every node of G, returned as a dict keyed by node id.
pr = nx.pagerank(G)

In [27]:
pr

{10072: 5.522053914489443e-05,
 4575965: 0.00010188748182545973,
 10119: 5.522053914489443e-05,
 3615703: 0.00010188748182545973,
 10149: 5.522053914489443e-05,
 13533889: 0.00010188748182545973,
 10152: 5.522053914489443e-05,
 11902625: 6.236343853477688e-05,
 29295999: 8.141117024113007e-05,
 32565832: 6.045866536414156e-05,
 35285982: 6.331582512009454e-05,
 100017: 5.522053914489443e-05,
 20542313: 0.00010188748182545973,
 100027: 5.522053914489443e-05,
 13097021: 0.00010188748182545973,
 100074: 5.522053914489443e-05,
 22190751: 0.00010188748182545973,
 100163: 5.522053914489443e-05,
 27395460: 0.00010188748182545973,
 100174: 5.522053914489443e-05,
 44672416: 0.00010188748182545973,
 100252: 5.522053914489443e-05,
 7549853: 6.542214987041336e-05,
 19529694: 5.7174039071057626e-05,
 24427583: 6.607331651246775e-05,
 33356577: 6.520509432306189e-05,
 44341548: 6.889503862803683e-05,
 100268: 5.522053914489443e-05,
 17338657: 0.00010188748182545973,
 100292: 5.522053914489443e-05,
 

In [24]:
# Persist the PageRank dict for reuse by later cells.
with open("data/edge/pagerank.pkl", "wb") as f:
    pickle.dump(pr, file=f)

Authority页面是指与某个领域或者某个话题相关的高质量页面，Hub页面则是包含很多指向高质量Authority页面链接的网页，比如，hao123首页就是一个典型的高质量Hub页。

- 参考：https://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.algorithms.link_analysis.hits_alg.hits.html

In [25]:
# HITS scores for G: `h` holds the hub scores and `a` the authority scores.
# Note: this rebinds `a`, which earlier cells used for a DataFrame.
h,a=nx.hits(G)

In [28]:
a

{10072: 0.0,
 4575965: 1.7968010802992432e-186,
 10119: 0.0,
 3615703: 2.2427502457724442e-158,
 10149: 0.0,
 13533889: 4.046036337847288e-171,
 10152: 0.0,
 11902625: 7.099870257731885e-145,
 29295999: 2.6032857611683577e-144,
 32565832: 5.206571522336715e-145,
 35285982: 8.046519625429468e-145,
 100017: 0.0,
 20542313: 3.8631928563090153e-95,
 100027: 0.0,
 13097021: 9.61772824456159e-121,
 100074: 0.0,
 22190751: 1.8540540434319086e-101,
 100163: 0.0,
 27395460: 4.230342856474185e-180,
 100174: 0.0,
 44672416: 4.9380648351987834e-157,
 100252: 0.0,
 7549853: 1.4823867252481993e-132,
 19529694: 2.8386128781348503e-133,
 24427583: 1.5770071545193612e-132,
 33356577: 1.4508465821578126e-132,
 44341548: 1.987029014694395e-132,
 100268: 0.0,
 17338657: 2.5075118048910396e-178,
 100292: 0.0,
 39275498: 1.1263898816633342e-166,
 100293: 0.0,
 20204035: 2.7698871011055593e-96,
 100408: 0.0,
 3351105: 2.1132606591114617e-111,
 18495964: 6.365242949130908e-112,
 41423174: 6.594391695299621e-1

In [26]:
h

{10072: 4.096673711556637e-190,
 4575965: 0.0,
 10119: 1.8181083307428405e-161,
 3615703: 0.0,
 10149: 1.8449778200824978e-174,
 13533889: 0.0,
 10152: 4.388631383017179e-147,
 11902625: 0.0,
 29295999: 0.0,
 32565832: 0.0,
 35285982: 0.0,
 100017: 5.441392409108041e-97,
 20542313: 0.0,
 100027: 4.263825433908315e-123,
 13097021: 0.0,
 100074: 1.9633039324382412e-103,
 22190751: 0.0,
 100163: 1.2860139471200655e-183,
 27395460: 0.0,
 100174: 4.2532858411310546e-160,
 44672416: 0.0,
 100252: 8.689239953856753e-135,
 7549853: 0.0,
 19529694: 0.0,
 24427583: 0.0,
 33356577: 0.0,
 44341548: 0.0,
 100268: 8.258006190798162e-182,
 17338657: 0.0,
 100292: 6.2776960857391934e-170,
 39275498: 0.0,
 100293: 3.7049713664413944e-98,
 20204035: 0.0,
 100408: 4.8114232185692e-113,
 3351105: 0.0,
 18495964: 0.0,
 41423174: 0.0,
 100584: 6.2776960857391934e-170,
 21500760: 0.0,
 100750: 1.2860139471200655e-183,
 26191548: 0.0,
 100756: 5.1925779701110745e-123,
 328964: 0.0,
 1905517: 0.0,
 3521212: 0.

一个节点的节点度越大就意味着这个节点的度中心性越高，该节点在网络中就越重要。

In [29]:
# Degree centrality for every node of G, as a dict keyed by node id.
dc = nx.algorithms.centrality.degree_centrality(G)

In [30]:
dc

{10072: 6.955554009876887e-05,
 4575965: 6.955554009876887e-05,
 10119: 6.955554009876887e-05,
 3615703: 6.955554009876887e-05,
 10149: 6.955554009876887e-05,
 13533889: 6.955554009876887e-05,
 10152: 0.0002782221603950755,
 11902625: 6.955554009876887e-05,
 29295999: 6.955554009876887e-05,
 32565832: 6.955554009876887e-05,
 35285982: 6.955554009876887e-05,
 100017: 6.955554009876887e-05,
 20542313: 6.955554009876887e-05,
 100027: 6.955554009876887e-05,
 13097021: 6.955554009876887e-05,
 100074: 6.955554009876887e-05,
 22190751: 6.955554009876887e-05,
 100163: 6.955554009876887e-05,
 27395460: 6.955554009876887e-05,
 100174: 6.955554009876887e-05,
 44672416: 6.955554009876887e-05,
 100252: 0.00034777770049384437,
 7549853: 6.955554009876887e-05,
 19529694: 6.955554009876887e-05,
 24427583: 6.955554009876887e-05,
 33356577: 6.955554009876887e-05,
 44341548: 6.955554009876887e-05,
 100268: 6.955554009876887e-05,
 17338657: 6.955554009876887e-05,
 100292: 6.955554009876887e-05,
 39275498:

In [31]:
# Recompute degree centrality, then pickle the hub, authority and centrality dicts.
dc = nx.algorithms.centrality.degree_centrality(G)
for target_path, scores in (
    ("data/edge/h.pkl", h),
    ("data/edge/a.pkl", a),
    ("data/edge/degree_centrality.pkl", dc),
):
    with open(target_path, "wb") as f:
        pickle.dump(scores, f)

### 获取网络图中每个点的编码特征

- 1.首先安装deepwalk工具包：pip install deepwalk
- 2.最好配置一下deepwalk命令的环境变量
- 3.构建向量特征

- 参考：https://github.com/phanein/deepwalk

### 筛选只在df中出现的节点特征

In [4]:
# Rebuild the filtered edge list from the saved graph (edges touching a sample id).
graph = pd.read_csv("data/graph")
graph_filter = graph[graph.from_id.isin(df.id) | graph.to_id.isin(df.id)]
graph_filter.to_csv("data/graph_filter.csv", index=False)

# For each embedding size, keep only the rows for sample ids.
# The first line of each .emb file is skipped; the remaining lines are read as
# "<id> <v0> ... <v{n-1}>" separated by single spaces.
for emb_dim in (192, 128, 256):
    graph_emb = pd.read_csv(
        "data/deepwalk_%d.emb" % emb_dim,
        delimiter=" ",
        names=["id"] + ["dp_%d" % i for i in range(emb_dim)],
        skiprows=1,
    )
    graph_emb[graph_emb.id.isin(df.id)].to_csv(
        "features/graph/deepwalk_%d_filtered.emb" % emb_dim, index=False
    )

### 风险特征

In [34]:
# Load the tab-separated per-id risk counters.
risk = pd.read_csv("data/dat_risk.txt", sep="\t")
risk.head()

Unnamed: 0,id,a_cnt,b_cnt,c_cnt,d_cnt,e_cnt
0,16,1,1,0,0,0
1,29,1,1,0,0,0
2,62,1,1,0,0,0
3,63,1,1,0,0,0
4,87,1,1,0,0,0


In [35]:
# Total of the five counters per row, plus each counter's share of that total.
# A row whose total is 0 produces NaN ratios.
risk['total'] = risk["a_cnt"] + risk["b_cnt"] + risk["c_cnt"] + risk["d_cnt"] + risk["e_cnt"]
for c in ("a_cnt", "b_cnt", "c_cnt", "d_cnt", "e_cnt"):
    risk[c + "_ratio"] = risk[c].div(risk["total"])
risk.head()

Unnamed: 0,id,a_cnt,b_cnt,c_cnt,d_cnt,e_cnt,total,a_cnt_ratio,b_cnt_ratio,c_cnt_ratio,d_cnt_ratio,e_cnt_ratio
0,16,1,1,0,0,0,2,0.5,0.5,0.0,0.0,0.0
1,29,1,1,0,0,0,2,0.5,0.5,0.0,0.0,0.0
2,62,1,1,0,0,0,2,0.5,0.5,0.0,0.0,0.0
3,63,1,1,0,0,0,2,0.5,0.5,0.0,0.0,0.0
4,87,1,1,0,0,0,2,0.5,0.5,0.0,0.0,0.0


In [36]:
# Save the risk counters together with the derived total and ratio columns.
risk.to_csv("features/risk/risk.csv", index=False)

### 类别特征

In [37]:
# Load the tab-separated per-id category labels (comma-separated "level1_level2" tags).
dat_symbol = pd.read_csv("data/dat_symbol.txt", sep="\t")
dat_symbol.head()

Unnamed: 0,id,symbol
0,2,其他公司类_其他
1,7,其他公司类_其他
2,40,其他公司类_其他
3,53,"互金公司_p2p,贷款类_其他"
4,60,其他公司类_其他


In [38]:
# Number of comma-separated tags per id.
dat_symbol['cat_count'] = dat_symbol.symbol.apply(lambda tags: tags.count(",") + 1)
# Space-separated tags, the form CountVectorizer tokenises later.
dat_symbol['symbol'] = dat_symbol.symbol.apply(lambda tags: tags.replace(",", " "))
# First-level category of each tag: the text before the first underscore.
dat_symbol['symbol_1'] = dat_symbol.symbol.apply(
    lambda tags: " ".join(tag.partition("_")[0] for tag in tags.split(" "))
)
dat_symbol.head()

Unnamed: 0,id,symbol,cat_count,symbol_1
0,2,其他公司类_其他,1,其他公司类
1,7,其他公司类_其他,1,其他公司类
2,40,其他公司类_其他,1,其他公司类
3,53,互金公司_p2p 贷款类_其他,2,互金公司 贷款类
4,60,其他公司类_其他,1,其他公司类


In [40]:
# Bag-of-words counts over the full "level1_level2" tags.
vectorizer = CountVectorizer()
a = vectorizer.fit_transform(dat_symbol.symbol)
# Size the column names from the fitted matrix rather than a hard-coded vocabulary size,
# so the cell keeps working when the set of tags changes.
lev2 = pd.DataFrame(a.toarray(), columns=["lev_2_" + str(i) for i in range(a.shape[1])])
lev2.head()

Unnamed: 0,lev_2_0,lev_2_1,lev_2_2,lev_2_3,lev_2_4,lev_2_5,lev_2_6,lev_2_7,lev_2_8,lev_2_9,...,lev_2_34,lev_2_35,lev_2_36,lev_2_37,lev_2_38,lev_2_39,lev_2_40,lev_2_41,lev_2_42,lev_2_43
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Bag-of-words counts over the first-level categories (the vectorizer is refit here).
b = vectorizer.fit_transform(dat_symbol.symbol_1)
# Size the column names from the fitted matrix rather than a hard-coded vocabulary size.
lev1 = pd.DataFrame(b.toarray(), columns=["lev_1_" + str(i) for i in range(b.shape[1])])
lev1.head()

Unnamed: 0,lev_1_0,lev_1_1,lev_1_2,lev_1_3,lev_1_4,lev_1_5,lev_1_6,lev_1_7,lev_1_8,lev_1_9,...,lev_1_14,lev_1_15,lev_1_16,lev_1_17,lev_1_18,lev_1_19,lev_1_20,lev_1_21,lev_1_22,lev_1_23
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# Attach both count matrices by row index and drop the raw text columns.
dat_symbol = (
    dat_symbol
    .join(lev1)
    .join(lev2)
    .drop(columns=["symbol", "symbol_1"])
)
dat_symbol.head()

Unnamed: 0,id,cat_count,lev_1_0,lev_1_1,lev_1_2,lev_1_3,lev_1_4,lev_1_5,lev_1_6,lev_1_7,...,lev_2_34,lev_2_35,lev_2_36,lev_2_37,lev_2_38,lev_2_39,lev_2_40,lev_2_41,lev_2_42,lev_2_43
0,2,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,7,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,40,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,53,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Save the per-id tag count together with the lev_1_* / lev_2_* count columns.
dat_symbol.to_csv("features/symbol/symbol.csv", index=False)

### 图特征

In [44]:
# Parse the date strings into timestamps.
# `graph_filter` is a filtered slice of `graph`; `assign` builds a new frame instead of
# setting an attribute on that slice, which is unreliable and triggers
# SettingWithCopyWarning.
graph_filter = graph_filter.assign(date=pd.to_datetime(graph_filter["date"]))
graph_filter.head()

Unnamed: 0,from_id,to_id,date,num,weight
128,1000221,21681306,2017-11-01,2,35
198,10003349,20429275,2017-11-01,1,41
221,10003968,31945112,2017-12-01,3,267
222,10003968,7210426,2017-11-01,6,231
518,10008418,11675834,2017-11-01,4,371


In [45]:
# Number of edge records leaving / entering each id.
out_degree = (
    graph_filter.groupby("from_id")["to_id"].count()
    .rename("out_degree").rename_axis("id").reset_index()
)
in_degree = (
    graph_filter.groupby("to_id")["from_id"].count()
    .rename("in_degree").rename_axis("id").reset_index()
)

In [49]:
# Sum of `num` over the outgoing / incoming edge records of each id.
out_num = (
    graph_filter.groupby("from_id")["num"].sum()
    .rename("out_sum").rename_axis("id").reset_index()
)
in_num = (
    graph_filter.groupby("to_id")["num"].sum()
    .rename("in_sum").rename_axis("id").reset_index()
)

In [51]:
# Sum of `weight` over the incoming / outgoing edge records of each id.
in_weight = (
    graph_filter.groupby("to_id")["weight"].sum()
    .rename("in_weight").rename_axis("id").reset_index()
)
out_weight = (
    graph_filter.groupby("from_id")["weight"].sum()
    .rename("out_weight").rename_axis("id").reset_index()
)

In [52]:
# Number of distinct ids on the other end of each id's incoming / outgoing edges.
in_unique = (
    graph_filter.groupby("to_id")["from_id"].nunique()
    .rename("in_nunique").rename_axis("id").reset_index()
)
out_unique = (
    graph_filter.groupby("from_id")["to_id"].nunique()
    .rename("out_nunique").rename_axis("id").reset_index()
)

In [57]:
# Join every per-id statistic onto the sample ids.
# NOTE(review): merge defaults to an inner join, so only ids present in all eight
# tables (i.e. having both incoming and outgoing edges) are kept; confirm this is intended.
graph_info = df[['id']]
for stats in (
    out_degree, in_degree,
    out_num, in_num,
    out_weight, in_weight,
    out_unique, in_unique,
):
    graph_info = graph_info.merge(stats, on="id")

In [58]:
# Build the neighbour sets with one groupby per direction instead of scanning the whole
# edge frame once per sample id. The original first assignment to `common_id` was dead
# code (overwritten before use) and has been removed.
out_neighbors = graph_filter.groupby("from_id")["to_id"].apply(set).to_dict()
in_neighbors = graph_filter.groupby("to_id")["from_id"].apply(set).to_dict()

# from_dict[i]: ids that i points to; to_dict[i]: ids that point to i.
# Ids without edges in that direction get an empty set.
from_dict = {i: out_neighbors.get(i, set()) for i in df.id}
to_dict = {i: in_neighbors.get(i, set()) for i in df.id}

# Ids connected to i in both directions.
common_id = {i: from_dict[i] & to_dict[i] for i in df.id}

graph_info['common_num'] = graph_info.id.apply(lambda x: len(common_id[x]))
graph_info.to_csv("features/graph/graph_info.csv", index=False)

### app安装特征

In [60]:
# Load the header-less, tab-separated file of per-id comma-separated app lists.
app = pd.read_csv(
    "data/dat_app.txt",
    sep="\t",
    header=None,
    names=["id", "app_list"],
)
app.head()

Unnamed: 0,id,app_list
0,155,"234884,404900,322191,353350,365633,372053,3580..."
1,295,"374989,224028,233710,43891,43861,245685,238780..."
2,390,"365633,247448,242120,11285,208393,199718,38411..."
3,665,"450490,62347,188342,444688,347009,416584,33362..."
4,725,"374989,367185,407398,442665,391809,414377,1296..."


In [61]:
# Keep only ids that appear as an endpoint in the filtered graph.
# `.copy()` makes the result an independent frame, so adding a column below does not
# write into a slice of the original (SettingWithCopyWarning / unreliable assignment).
in_graph = app["id"].isin(graph_filter["from_id"]) | app["id"].isin(graph_filter["to_id"])
app = app[in_graph].copy()
# Space-separated app ids, the form used for counting and vectorising below.
app["apps"] = app["app_list"].apply(lambda x: x.replace(",", " "))
app.head()

Unnamed: 0,id,app_list,apps
12428,1011185,"372028,381831,191780,395129,176612,326519,3265...",372028 381831 191780 395129 176612 326519 3265...
122649,10017015,"294831,315735,314736,320869,93111,91175,91660,...",294831 315735 314736 320869 93111 91175 91660 ...
124051,10134265,"160398,419357,80514,374861,52355,55961,327743,...",160398 419357 80514 374861 52355 55961 327743 ...
124111,10139810,"108512,138497,401310,401180,384392,442627,3999...",108512 138497 401310 401180 384392 442627 3999...
151298,12360160,"294831,306250,247916,69417,431282,435475,42172...",294831 306250 247916 69417 431282 435475 42172...


In [62]:
# Global tally: app id -> number of occurrences across all counted app lists.
d = {}
def count(x):
    """Add every space-separated token of `x` to the global tally `d`."""
    for token in x.split(" "):
        if token in d:
            d[token] += 1
        else:
            d[token] = 1

# Tally every id's app list into `d`; the returned Series is discarded.
_ = app["apps"].apply(count)
app.head()

Unnamed: 0,id,app_list,apps
12428,1011185,"372028,381831,191780,395129,176612,326519,3265...",372028 381831 191780 395129 176612 326519 3265...
122649,10017015,"294831,315735,314736,320869,93111,91175,91660,...",294831 315735 314736 320869 93111 91175 91660 ...
124051,10134265,"160398,419357,80514,374861,52355,55961,327743,...",160398 419357 80514 374861 52355 55961 327743 ...
124111,10139810,"108512,138497,401310,401180,384392,442627,3999...",108512 138497 401310 401180 384392 442627 3999...
151298,12360160,"294831,306250,247916,69417,431282,435475,42172...",294831 306250 247916 69417 431282 435475 42172...


In [63]:
# Look up the global frequency of every installed app once per row, then derive all
# statistics from that list (the original rebuilt the same list for each statistic).
app_freqs = app.apps.apply(lambda x: [d[i] for i in x.split(" ")])

app['app_num'] = app_freqs.apply(len)
app["app_freq_sum"] = app_freqs.apply(sum)
# Despite the column name, this is the mean app frequency (sum of frequencies / app count).
app['app_num_mean'] = app.app_freq_sum / app.app_num

app['app_freq_max'] = app_freqs.apply(max)
app['app_freq_min'] = app_freqs.apply(min)
app['app_freq_median'] = app_freqs.apply(np.median)
app['app_freq_var'] = app_freqs.apply(np.var)
app.head()

Unnamed: 0,id,app_list,apps,app_num,app_freq_sum,app_num_mean,app_freq_max,app_freq_min,app_freq_median,app_freq_var
12428,1011185,"372028,381831,191780,395129,176612,326519,3265...",372028 381831 191780 395129 176612 326519 3265...,335,4708,14.053731,51,1,10.0,205.47174
122649,10017015,"294831,315735,314736,320869,93111,91175,91660,...",294831 315735 314736 320869 93111 91175 91660 ...,322,3103,9.636646,51,1,2.0,215.181638
124051,10134265,"160398,419357,80514,374861,52355,55961,327743,...",160398 419357 80514 374861 52355 55961 327743 ...,207,3617,17.47343,51,1,11.0,233.505333
124111,10139810,"108512,138497,401310,401180,384392,442627,3999...",108512 138497 401310 401180 384392 442627 3999...,228,4509,19.776316,51,1,14.0,202.770141
151298,12360160,"294831,306250,247916,69417,431282,435475,42172...",294831 306250 247916 69417 431282 435475 42172...,348,3147,9.043103,51,1,2.0,198.236648


In [64]:
# Save the app statistics for sample ids only (the max/min frequency columns are not exported).
app_info = app.loc[app["id"].isin(df["id"])]
app_info_cols = ["id", "app_num", "app_freq_sum", "app_num_mean", "app_freq_median", "app_freq_var"]
app_info[app_info_cols].to_csv("features/app/app_info.csv", index=False)

In [67]:
# Use the 4000 most frequent apps as a fixed vocabulary and count them per id.
a = pd.DataFrame({"app": list(d.keys()), "count": list(d.values())})
top_apps = a.sort_values("count", ascending=False).head(4000)
vocab = top_apps["app"].tolist()
vectorizer = CountVectorizer(vocabulary=vocab)
vector = vectorizer.fit_transform(app["apps"])

In [68]:
# Reduce the app count matrix to `dim` principal components.
dim = 16
# random_state is fixed so the components are reproducible across runs: with this
# shape (thousands of columns, few components) PCA may select its randomized solver.
pca = PCA(n_components=dim, random_state=0)
pca_res = pca.fit_transform(vector.toarray())
app_pca = pd.DataFrame(pca_res, columns=["pca_%d" % i for i in range(dim)])
# Rows of `vector` follow the row order of `app`, so the ids can be attached positionally.
app_pca["id"] = app.id.values
app_pca.head()

Unnamed: 0,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,pca_13,pca_14,pca_15,id
0,-3.351883,8.317413,-1.672377,-6.093019,2.817171,-0.345753,1.863733,5.791547,-2.121401,-0.942125,-0.635455,0.327999,0.966814,2.869158,0.894377,-2.580353,1011185
1,-2.089149,-1.452575,1.635781,7.376376,11.761313,-2.082388,-1.620871,1.267059,0.537942,0.595775,-0.767154,0.304867,7.70906,-1.109695,-0.451463,0.403409,10017015
2,-3.625172,6.003083,-2.392578,4.291447,-3.023322,-1.491179,-0.237069,-2.444953,-1.34018,-2.662225,-0.865735,0.039801,0.156401,2.012271,-2.235904,0.088108,10134265
3,9.142872,-0.364853,-1.009831,-0.239466,-0.482725,-0.371293,-0.785761,0.39534,-0.461976,1.336369,-0.193951,-0.493852,-0.07101,2.19213,1.271705,0.489245,10139810
4,-2.228124,-1.647366,1.141115,6.202223,13.703969,-2.531514,-0.090936,-1.029267,-1.072539,0.224244,-0.198602,-0.506953,-7.133332,1.152459,0.838966,-0.571463,12360160


In [69]:
# Save the PCA components; the number of components is part of the file name.
app_pca.to_csv("features/app/app_pca_%d.csv" % dim, index=False)

### 图中联系特征

![title](2.png)

![title](1.png)

In [83]:
def feature_with_graph(graph_filter, other_df, feature_cols, to_dir, new_col_name, func, weight_type, ids=None):
    """Aggregate neighbours' features over graph edges and write them to CSV.

    For each direction the edge list is joined with ``other_df`` on the neighbour id,
    and ``feature_cols`` are aggregated per owning node:

    * "to":   features of the nodes an id points to   (grouped by ``from_id``)
    * "from": features of the nodes pointing to an id (grouped by ``to_id``)

    Parameters
    ----------
    graph_filter : DataFrame with columns ``from_id``, ``to_id``, ``num``, ``weight``.
    other_df : DataFrame with an ``id`` column plus ``feature_cols``.
    feature_cols : list of column names in ``other_df`` to aggregate.
    to_dir : sub-directory of ``features/`` to write to (must already exist).
    new_col_name : name fragment used in output column and file names.
    func : aggregation passed to ``groupby.agg`` (the callers use "sum" and "mean").
    weight_type : "" for unweighted, "_num" or "_weight" to weight each edge's
        features by that edge column. With ``func == "mean"`` this is a weighted
        mean: sum(feature * w) / sum(w) over the node's edges.
    ids : ids to keep in the output; defaults to the global ``df.id``, resolved at
        call time.

    Writes ``features/<to_dir>/to_<name><weight_type>_<func>.csv`` and the matching
    ``from_`` file, then prints the elapsed time.

    Changes from the original: the weighted mean now normalises every feature column
    (it used a leaked loop variable and normalised only the last one); the "from"
    column names are sized from the "from" frame itself; groupby columns are selected
    with a list (tuple selection is rejected by current pandas).
    """
    if ids is None:
        ids = df.id

    task_name = "%s%s_%s" % (new_col_name, weight_type, func)
    start = time.time()

    weighted = weight_type in ("_num", "_weight")
    edge_weight_col = weight_type[1:] if weighted else None
    # A weighted mean is computed as the sum of pre-normalised terms.
    agg_func = "sum" if (weighted and func == "mean") else func

    for direction, neighbor_col, owner_col in (("to", "to_id", "from_id"), ("from", "from_id", "to_id")):
        # Attach the neighbour's features to every edge, then key the rows by the owner.
        frame = (
            graph_filter.rename(columns={neighbor_col: "id"})
            .merge(other_df, on="id", how="left")
            .drop("id", axis=1)
            .rename(columns={owner_col: "id"})
        )

        if weighted:
            # Per-owner total of the edge weight, aligned row by row with `frame`.
            weight_total = frame.groupby("id")[edge_weight_col].transform("sum")
            for f in feature_cols:
                frame[f] = frame[f] * frame[edge_weight_col]
                if func == "mean":
                    frame[f] = frame[f] / weight_total

        result = frame.groupby("id")[feature_cols].agg(agg_func).reset_index()

        if new_col_name == "symbol":
            # Extra column: row total across all aggregated feature columns.
            result['%s_%s%s_count' % (direction, new_col_name, weight_type)] = \
                result[[c for c in result.columns if c != "id"]].sum(axis=1)

        result.columns = ["id"] + [
            "%s_%s%s_%s_%d" % (direction, new_col_name, weight_type, func, i)
            for i in range(1, len(result.columns))
        ]

        result[result["id"].isin(ids)].to_csv(
            "features/%s/%s_%s%s_%s.csv" % (to_dir, direction, new_col_name, weight_type, func),
            index=False,
        )

    end = time.time()
    print('Task %s runs %0.2f seconds.' % (task_name, (end - start)))

一度联系人特征制作

In [73]:
# Reload the saved symbol features and pick the first-level category columns.
dat_symbol = pd.read_csv("features/symbol/symbol.csv")
lev_f = [col for col in dat_symbol.columns if col.startswith("lev_1")]

In [77]:
# One task per (aggregation, edge weighting) pair for the first-level symbol counts.
symbol_args_list = [
    (graph_filter, dat_symbol, lev_f, "symbol_graph", "symbol", agg_name, weighting)
    for agg_name in ("sum", "mean")
    for weighting in ("", "_num", "_weight")
]

In [78]:
# Reload the 16-component app PCA features saved earlier.
app_pca = pd.read_csv("features/app/app_pca_16.csv")

In [79]:
# Mean of the neighbours' PCA components: unweighted, num-weighted and weight-weighted.
app_args_list = [
    (graph_filter, app_pca, ["pca_%d" % i for i in range(16)], "app_graph", "app_pca", "mean", weighting)
    for weighting in ("", "_num", "_weight")
]

In [80]:
# Reload the saved risk features and name the two column groups to aggregate.
risk = pd.read_csv("features/risk/risk.csv")
risk_f = ["a_cnt", "b_cnt", "c_cnt", "d_cnt", "e_cnt"] + ["total"]
risk_ratio_f = [
    "a_cnt_ratio",
    "b_cnt_ratio",
    "c_cnt_ratio",
    "d_cnt_ratio",
    "e_cnt_ratio",
]

In [81]:
# Counts get mean and sum aggregations, ratios get mean only; each runs with all three weightings.
risk_args_list = [
    (graph_filter, risk, cols, "risk_graph", name, agg_name, weighting)
    for cols, name, agg_name in (
        (risk_f, "risk", "mean"),
        (risk_f, "risk", "sum"),
        (risk_ratio_f, "risk_ratio", "mean"),
    )
    for weighting in ("", "_num", "_weight")
]

In [None]:
# Run all symbol / app / risk neighbour-feature tasks in a pool of 10 processes.
p = Pool(10)
pending = [
    p.apply_async(feature_with_graph, args)
    for args in symbol_args_list + app_args_list + risk_args_list
]
p.close()
p.join()
# apply_async keeps a worker's exception inside its AsyncResult; get() re-raises it
# here so a failed task is not silently reported as done.
for result in pending:
    result.get()
print('All subprocesses done.')

In [21]:
# Reload the full edge list and collect every id that is an endpoint of a filtered edge.
graph = pd.read_csv("data/graph")

graph_filter_ids = set(graph_filter["to_id"]).union(graph_filter["from_id"])

In [22]:
# Edges of the full graph that start / end at an id from the filtered graph.
from_filtered = graph[graph.from_id.isin(graph_filter_ids)]
to_filtered = graph[graph.to_id.isin(graph_filter_ids)]

# Edge-record counts per id.
d1_to = from_filtered.groupby("from_id").to_id.count()
d1_from = to_filtered.groupby("to_id").from_id.count()

# Totals of num and weight per id. The columns are selected with a list: selecting
# several groupby columns with a bare tuple is rejected by current pandas.
d1_to_sum = from_filtered.groupby("from_id")[["num", "weight"]].sum()
d1_from_sum = to_filtered.groupby("to_id")[["num", "weight"]].sum()

# Combine counts and totals; the counted column becomes "count".
d1_to = d1_to.reset_index().merge(d1_to_sum.reset_index(), on="from_id", how="left").rename(columns={"to_id":"count"})
d1_from = d1_from.reset_index().merge(d1_from_sum.reset_index(), on="to_id", how="left").rename(columns={"from_id":"count"})

# Uniform schema for both directions: id, count, num_sum, weight_sum.
d1_to = d1_to.rename(columns={"from_id":"id", "num": "num_sum", "weight":"weight_sum"})
d1_from = d1_from.rename(columns={"to_id":"id", "num": "num_sum", "weight":"weight_sum"})

In [23]:
# Per-edge-record averages of num and weight, for both directions.
for d1_stats in (d1_to, d1_from):
    d1_stats["num_mean"] = d1_stats["num_sum"] / d1_stats["count"]
    d1_stats["weight_mean"] = d1_stats["weight_sum"] / d1_stats["count"]

In [24]:
# Second-degree tasks: aggregate the neighbours' first-degree statistics.
d2_f = ["count", "num_mean", "weight_mean"]
d2_to_variants = [
    ("sum", ""), ("sum", "_weight"), ("sum", "_num"),
    ("mean", ""), ("mean", "_weight"), ("mean", "_num"),
]
# The "from" tasks list the last two variants in the opposite order.
d2_from_variants = d2_to_variants[:4] + [("mean", "_num"), ("mean", "_weight")]
d2_args_list = [
    (graph_filter, d1_to, d2_f, "graph", "d2_to", agg_name, weighting)
    for agg_name, weighting in d2_to_variants
] + [
    (graph_filter, d1_from, d2_f, "graph", "d2_from", agg_name, weighting)
    for agg_name, weighting in d2_from_variants
]

In [25]:
# Run the second-degree tasks in a pool of 10 processes.
p = Pool(10)
pending = [p.apply_async(feature_with_graph, args) for args in d2_args_list]
p.close()
p.join()
# apply_async keeps a worker's exception inside its AsyncResult; get() re-raises it
# here so a failed task is not silently reported as done.
for result in pending:
    result.get()
print('All subprocesses done.')

Task d2_to_sum runs 24.58 seconds.
Task d2_to_weight_sum runs 27.43 seconds.
Task d2_to_mean runs 26.62 seconds.
Task d2_to_num_sum runs 29.36 seconds.
Task d2_to_weight_mean runs 28.06 seconds.
Task d2_to_num_mean runs 28.30 seconds.
Task d2_from_sum runs 29.14 seconds.
Task d2_from_weight_sum runs 28.73 seconds.
Task d2_from_num_sum runs 28.76 seconds.
Task d2_from_mean runs 27.15 seconds.
Task d2_from_num_mean runs 19.73 seconds.
Task d2_from_weight_mean runs 22.14 seconds.
All subprocesses done.


PageRank

In [26]:
# Load the PageRank dict. It was written to "pagerank.pkl"; the original path here
# used the extension ".plk", which does not match the file written by this notebook.
with open("data/edge/pagerank.pkl", "rb") as f:
    pr = pickle.load(f)

# One row per node, then export the rows for sample ids.
pr_df = pd.DataFrame({"id": list(pr.keys()), "pr":list(pr.values())})
pr_df[pr_df.id.isin(df.id)].to_csv("features/graph/pagerank.csv", index=False)

In [27]:
# Neighbour PageRank tasks. Every task passes graph_filter_ids as the id filter; the
# original first tuple omitted it and fell back to the function's default, unlike
# the other five.
pr_f = ['pr']
pr_args_list = [
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "sum", "", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "sum", "_weight", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "sum", "_num", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "mean", "", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "mean", "_weight", graph_filter_ids),
    (graph_filter, pr_df, pr_f, "graph", "pagerank", "mean", "_num", graph_filter_ids),
]

In [28]:
# Run the neighbour PageRank tasks in a pool of 6 processes.
p = Pool(6)
pending = [p.apply_async(feature_with_graph, args) for args in pr_args_list]
p.close()
p.join()
# apply_async keeps a worker's exception inside its AsyncResult; get() re-raises it
# here so a failed task is not silently reported as done.
for result in pending:
    result.get()
print('All subprocesses done.')

Task pagerank_sum runs 65.19 seconds.
Task pagerank_weight_sum runs 77.54 seconds.
Task pagerank_num_sum runs 76.52 seconds.
Task pagerank_mean runs 75.12 seconds.
Task pagerank_weight_mean runs 75.42 seconds.
Task pagerank_num_mean runs 77.08 seconds.
All subprocesses done.


PageRank 二度

In [29]:
# Edges of the full graph with at least one endpoint among the filtered graph's ids.
gf = graph[graph.from_id.isin(graph_filter_ids) | graph.to_id.isin(graph_filter_ids)]

In [30]:
# Neighbour PageRank over the wider edge set `gf`, run sequentially and written to features/temp.
pr_f = ['pr']
for agg_name, weighting in (
    ("sum", ""), ("sum", "_weight"), ("sum", "_num"),
    ("mean", ""), ("mean", "_weight"), ("mean", "_num"),
):
    feature_with_graph(gf, pr_df, pr_f, "temp", "pagerank", agg_name, weighting, graph_filter_ids)

Task pagerank_sum runs 1045.24 seconds.
Task pagerank_weight_sum runs 1064.47 seconds.
Task pagerank_num_sum runs 1053.86 seconds.
Task pagerank_mean runs 1052.96 seconds.
Task pagerank_weight_mean runs 1063.16 seconds.
Task pagerank_num_mean runs 1066.43 seconds.


In [31]:
# Load the weighted intermediate PageRank features as (to, from) pairs.
to_num_mean, from_num_mean = (
    pd.read_csv(path)
    for path in ("features/temp/to_pagerank_num_mean.csv", "features/temp/from_pagerank_num_mean.csv")
)
to_weight_mean, from_weight_mean = (
    pd.read_csv(path)
    for path in ("features/temp/to_pagerank_weight_mean.csv", "features/temp/from_pagerank_weight_mean.csv")
)
to_num_sum, from_num_sum = (
    pd.read_csv(path)
    for path in ("features/temp/to_pagerank_num_sum.csv", "features/temp/from_pagerank_num_sum.csv")
)
to_weight_sum, from_weight_sum = (
    pd.read_csv(path)
    for path in ("features/temp/to_pagerank_weight_sum.csv", "features/temp/from_pagerank_weight_sum.csv")
)

In [32]:
from_num_mean.head()

Unnamed: 0,id,from_pagerank_num_mean_1
0,22,2.940365e-08
1,24,8.918203e-07
2,80,9.372466e-07
3,91,1.578345e-08
4,117,9.312466e-06


In [33]:
# Second-degree PageRank tasks: (source frame, its feature column, output name, aggregation, weighting).
pr2_specs = [
    (to_num_mean, 'to_pagerank_num_mean_1', "pg_to_num_mean", "mean", "_num"),
    (from_num_mean, "from_pagerank_num_mean_1", "pg_from_num_mean", "mean", "_num"),
    (to_weight_mean, 'to_pagerank_weight_mean_1', "pg_to_weight_mean", "mean", "_weight"),
    (from_weight_mean, "from_pagerank_weight_mean_1", "pg_from_weight_mean", "mean", "_weight"),
    (to_num_sum, 'to_pagerank_num_sum_1', "pg_to_num_sum", "sum", "_num"),
    (from_num_sum, "from_pagerank_num_sum_1", "pg_from_num_sum", "sum", "_num"),
    (to_weight_sum, 'to_pagerank_weight_sum_1', "pg_to_weight_sum", "sum", "_weight"),
    (from_weight_sum, "from_pagerank_weight_sum_1", "pg_from_weight_sum", "sum", "_weight"),
]
pr2_args_list = [
    (graph_filter, frame, [col], "graph", out_name, agg_name, weighting)
    for frame, col, out_name, agg_name, weighting in pr2_specs
]

In [34]:
# Run the second-degree PageRank tasks in a pool of 4 processes.
p = Pool(4)
pending = [p.apply_async(feature_with_graph, args) for args in pr2_args_list]
p.close()
p.join()
# apply_async keeps a worker's exception inside its AsyncResult; get() re-raises it
# here so a failed task is not silently reported as done.
for result in pending:
    result.get()
print('All subprocesses done.')

Task pg_to_num_mean_num_mean runs 19.40 seconds.
Task pg_from_num_mean_num_mean runs 18.77 seconds.
Task pg_to_weight_mean_weight_mean runs 19.28 seconds.
Task pg_from_weight_mean_weight_mean runs 20.31 seconds.
Task pg_from_num_sum_num_sum runs 16.08 seconds.
Task pg_to_num_sum_num_sum runs 16.96 seconds.
Task pg_to_weight_sum_weight_sum runs 16.75 seconds.
Task pg_from_weight_sum_weight_sum runs 16.41 seconds.
All subprocesses done.


hits

In [35]:
# Load the authority and hub dicts. They were written as "a.pkl" / "h.pkl"; the
# original paths here used the extension ".plk", which does not match.
with open("data/edge/a.pkl", "rb") as f:
    a = pickle.load(f)

a_df = pd.DataFrame({"id": list(a.keys()), "a":list(a.values())})

with open("data/edge/h.pkl", "rb") as f:
    h = pickle.load(f)

# Pair hub scores with the hub dict's own keys (the original took the ids from `a`,
# which is only correct while both dicts iterate in the same order).
h_df = pd.DataFrame({"id": list(h.keys()), "h":list(h.values())})


hits = a_df.merge(h_df, on="id")
# Build the row mask from `hits` itself rather than from `a_df`, so it stays aligned
# with the frame being filtered.
hits[hits.id.isin(df.id)].to_csv("features/graph/hits.csv", index=False)

In [36]:
# Preview the merged table (columns: id, a, h).
hits.head()

Unnamed: 0,id,a,h
0,2,0.0,2.297083e-21
1,16872051,4.5465880000000004e-17,5.140984e-21
2,3,0.0,1.163187e-24
3,6907348,4.136847e-21,2.879042e-16
4,7911933,7.092638e-23,8.419130000000001e-18


In [37]:
# Feature columns of `hits` that are propagated over the graph.
hits_f = ['a', "h"]

# One feature_with_graph argument tuple per (aggregation, edge-column suffix)
# pair: first the three "sum" variants, then the three "mean" variants.
hits_args_list = [
    (graph_filter, hits, hits_f, "graph", "hits", agg, suffix)
    for agg in ("sum", "mean")
    for suffix in ("", "_weight", "_num")
]

In [38]:
# Run the first-degree hits tasks in a pool of 10 worker processes.
# Keep the AsyncResult handles and call .get() afterwards; otherwise an
# exception raised inside a worker is dropped silently and the notebook
# reports success while output files may be missing.
with Pool(10) as p:
    pending = [p.apply_async(feature_with_graph, args) for args in hits_args_list]
    p.close()  # no more tasks will be submitted
    p.join()   # wait for all workers to finish

for result in pending:
    result.get()  # re-raises the exception of the first failed task, if any
print('All subprocesses done.')

Task hits_sum runs 65.34 seconds.
Task hits_weight_sum runs 66.21 seconds.
Task hits_num_sum runs 68.90 seconds.
Task hits_mean runs 64.11 seconds.
Task hits_weight_mean runs 63.83 seconds.
Task hits_num_mean runs 62.85 seconds.
All subprocesses done.


hits 二度

In [39]:
# Intermediate ("temp") hits features computed on `gf`, restricted via
# graph_filter_ids. The calls run sequentially in the same order as before:
# all "sum" variants first, then all "mean" variants.
hits_f = ['a', "h"]
for agg in ("sum", "mean"):
    for suffix in ("", "_weight", "_num"):
        feature_with_graph(gf, hits, hits_f, "temp", "hits", agg, suffix, graph_filter_ids)

Task hits_sum runs 1122.12 seconds.
Task hits_weight_sum runs 1138.46 seconds.
Task hits_num_sum runs 1144.98 seconds.
Task hits_mean runs 1152.57 seconds.
Task hits_weight_mean runs 1169.23 seconds.
Task hits_num_mean runs 1146.93 seconds.


In [40]:
# Read back the eight intermediate hits feature files written to features/temp.
# NOTE: these names were previously bound to the PageRank frames and are
# rebound here to the hits frames.
(
    to_num_mean,
    from_num_mean,
    to_weight_mean,
    from_weight_mean,
    to_num_sum,
    from_num_sum,
    to_weight_sum,
    from_weight_sum,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        "features/temp/to_hits_num_mean.csv",
        "features/temp/from_hits_num_mean.csv",
        "features/temp/to_hits_weight_mean.csv",
        "features/temp/from_hits_weight_mean.csv",
        "features/temp/to_hits_num_sum.csv",
        "features/temp/from_hits_num_sum.csv",
        "features/temp/to_hits_weight_sum.csv",
        "features/temp/from_hits_weight_sum.csv",
    )
]

In [41]:
# Argument tuples for feature_with_graph, one per intermediate hits frame.
# graph_filter and the "graph" output tag are common to all tasks; each row
# below supplies the frame, its two feature columns, the task name, the
# aggregation and the edge-column suffix.
hits2_args_list = [
    (graph_filter, frame, columns, "graph", task, agg, suffix)
    for frame, columns, task, agg, suffix in (
        # mean-aggregated frames
        (to_num_mean, ["to_hits_num_mean_1", "to_hits_num_mean_2"], "hits_to_num_mean", "mean", "_num"),
        (from_num_mean, ["from_hits_num_mean_1", "from_hits_num_mean_2"], "hits_from_num_mean", "mean", "_num"),
        (to_weight_mean, ["to_hits_weight_mean_1", "to_hits_weight_mean_2"], "hits_to_weight_mean", "mean", "_weight"),
        (from_weight_mean, ["from_hits_weight_mean_1", "from_hits_weight_mean_2"], "hits_from_weight_mean", "mean", "_weight"),
        # sum-aggregated frames
        (to_num_sum, ["to_hits_num_sum_1", "to_hits_num_sum_2"], "hits_to_num_sum", "sum", "_num"),
        (from_num_sum, ["from_hits_num_sum_1", "from_hits_num_sum_2"], "hits_from_num_sum", "sum", "_num"),
        (to_weight_sum, ["to_hits_weight_sum_1", "to_hits_weight_sum_2"], "hits_to_weight_sum", "sum", "_weight"),
        (from_weight_sum, ["from_hits_weight_sum_1", "from_hits_weight_sum_2"], "hits_from_weight_sum", "sum", "_weight"),
    )
]

In [42]:
# Run the second-degree hits tasks in a pool of 10 worker processes.
# The AsyncResult handles are collected and queried with .get() so that a
# failure inside any worker is raised here instead of being silently dropped.
with Pool(10) as p:
    pending = [p.apply_async(feature_with_graph, args) for args in hits2_args_list]
    p.close()  # no more tasks will be submitted
    p.join()   # wait for all workers to finish

for result in pending:
    result.get()  # re-raises the exception of the first failed task, if any
print('All subprocesses done.')

Task hits_to_num_mean_num_mean runs 21.48 seconds.
Task hits_from_num_mean_num_mean runs 23.92 seconds.
Task hits_from_weight_mean_weight_mean runs 23.51 seconds.
Task hits_to_weight_mean_weight_mean runs 25.11 seconds.
Task hits_to_num_sum_num_sum runs 23.70 seconds.
Task hits_from_num_sum_num_sum runs 23.28 seconds.
Task hits_to_weight_sum_weight_sum runs 22.55 seconds.
Task hits_from_weight_sum_weight_sum runs 22.57 seconds.
All subprocesses done.


In [3]:
import pickle

dc（degree centrality，度中心性）

In [8]:
# Load the pickled degree-centrality dict (node id -> value) and reshape it
# into a two-column frame. Note that `a` and `a_df` are rebound here; they
# previously held the hits data.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were generated locally by this project.
with open("data/edge/degree_centrality.plk", "rb") as f:
    a = pickle.load(f)

a_df = pd.DataFrame({"id": list(a), "a": list(a.values())})

In [12]:
# Keep only the rows whose id occurs in the sample table `df` and write them
# out as the first-degree dc feature file.
a_df.loc[a_df["id"].isin(df["id"])].to_csv("features/graph/dc.csv", index=False)

In [15]:
# Feature column of a_df that is propagated over the graph.
f = ['a']

# One feature_with_graph argument tuple per (aggregation, edge-column suffix)
# pair: the three "sum" variants first, then the three "mean" variants.
a_df_args_list = [
    (graph_filter, a_df, f, "graph", "dc", agg, suffix)
    for agg in ("sum", "mean")
    for suffix in ("", "_weight", "_num")
]

# Dispatch the tasks to 10 worker processes. The AsyncResult handles are kept
# and queried with .get(): apply_async alone never surfaces worker exceptions,
# so a failed task would otherwise pass unnoticed.
with Pool(10) as p:
    pending = [p.apply_async(feature_with_graph, args) for args in a_df_args_list]
    p.close()  # no more tasks will be submitted
    p.join()   # wait for all workers to finish

for result in pending:
    result.get()  # re-raises the exception of the first failed task, if any
print('All subprocesses done.')

Task dc_sum runs 62.91 seconds.
Task dc_weight_sum runs 64.62 seconds.
Task dc_num_sum runs 62.13 seconds.
Task dc_mean runs 61.85 seconds.
Task dc_weight_mean runs 61.26 seconds.
Task dc_num_mean runs 62.25 seconds.
All subprocesses done.
