#### 本脚本目的是获得user、item、user-item三个csv文件
#### node_feature:id+feature(feature是user和item的特征concat到一起)
#### action:user_id,item_id
#### 对数据进行筛选

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
USER = pd.read_csv('../data/JData_User.csv', encoding='gbk')
ITEM = pd.read_csv('../data/JData_Product.csv', encoding='gbk')
ACTION = pd.read_csv('../data/JData_Action_201602.csv', encoding='gbk')

## user的处理

In [3]:
user = USER.copy()

In [4]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105321 entries, 0 to 105320
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      105321 non-null  int64  
 1   age          105318 non-null  object 
 2   sex          105318 non-null  float64
 3   user_lv_cd   105321 non-null  int64  
 4   user_reg_tm  105318 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [5]:
def convert_age(age_str):
    if age_str == u'-1':
        return 0
    elif age_str == u'15岁以下':
        return 1
    elif age_str == u'16-25岁':
        return 2
    elif age_str == u'26-35岁':
        return 3
    elif age_str == u'36-45岁':
        return 4
    elif age_str == u'46-55岁':
        return 5
    elif age_str == u'56岁以上':
        return 6
    else:
        return -1
    
def convert_id(id_str):
    return 'u_'+str(id_str)

In [6]:
user['age'] = user['age'].map(convert_age)
user['user_id'] = user['user_id'].map(convert_id)
age_df = pd.get_dummies(user["age"], prefix="age")
sex_df = pd.get_dummies(user["sex"], prefix="sex")
user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
data_user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)

## item处理

In [7]:
item = ITEM.copy()

In [8]:
item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24187 entries, 0 to 24186
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   sku_id  24187 non-null  int64
 1   a1      24187 non-null  int64
 2   a2      24187 non-null  int64
 3   a3      24187 non-null  int64
 4   cate    24187 non-null  int64
 5   brand   24187 non-null  int64
dtypes: int64(6)
memory usage: 1.1 MB


In [9]:
def convert_sku_id(idstr):
    return 'i_'+str(idstr)

In [10]:
item['sku_id'] = item['sku_id'].map(convert_sku_id)
a1_df = pd.get_dummies(item["a1"], prefix="a1")
a2_df = pd.get_dummies(item["a2"], prefix="a2")
a3_df = pd.get_dummies(item["a3"], prefix="a3")
cate_df = pd.get_dummies(item['cate'], prefix='cate')
brand_df = pd.get_dummies(item['brand'], prefix='brand')
data_item = pd.concat([item['sku_id'], a1_df, a2_df, a3_df, cate_df, brand_df], axis=1)

## action处理

In [11]:
action = ACTION.copy()

In [12]:
action.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11485424 entries, 0 to 11485423
Data columns (total 7 columns):
 #   Column    Dtype  
---  ------    -----  
 0   user_id   float64
 1   sku_id    int64  
 2   time      object 
 3   model_id  float64
 4   type      int64  
 5   cate      int64  
 6   brand     int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 613.4+ MB


In [13]:
action = action[action['type']==6]
action.drop(['time','model_id','type','cate','brand'], axis=1, inplace=True)
#筛选用户,去掉点击数比较多的爬虫用户,去掉点击数比较少的冷启用户
#x=action.groupby('user_id').count()['sku_id']
#users=list(x[(x.values>150)&(x.values<200)].index)
action.reset_index()
action = action.drop_duplicates()

In [14]:
def convert_action_user(idstr):
    idstr = int(idstr)
    return 'u_'+ str(idstr)
def convert_action_item(idstr):
    return 'i_'+ str(idstr)

In [15]:
action['user_id'] = action['user_id'].map(convert_action_user)
action['sku_id'] = action['sku_id'].map(convert_action_item)

## 输出文件
将user和item的属性concat到一起，没有的用nan代替

In [16]:
data_user_t = data_user[data_user['user_id'].isin(list(action['user_id']))]
data_user_t.rename(columns={'user_id':'node_id'}, inplace=True)
data_user_t.to_csv('../data/user_features.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
data_item_t = data_item[data_item['sku_id'].isin(list(action['sku_id']))]
data_item_t.rename(columns={'sku_id':'node_id'},inplace=True)
data_item_t.to_csv('../data/item_features.csv', index=False)

In [18]:
node_features = pd.concat([data_user_t, data_item_t], keys='node_id', ignore_index=True)
node_features.fillna(0, inplace=True)

In [19]:
node_features

Unnamed: 0,node_id,age_-1,age_0,age_1,age_2,age_3,age_4,age_5,age_6,sex_0.0,...,brand_855,brand_857,brand_871,brand_875,brand_885,brand_900,brand_905,brand_907,brand_916,brand_922
0,u_200002,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,u_200003,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,u_200005,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,u_200007,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,u_200008,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72699,i_99890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72700,i_99909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72701,i_99926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72702,i_99955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
node_features.to_csv('../data/node_features.csv', index=False)

In [21]:
action.to_csv('../data/data_action.csv', index=False)