In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# The dtype map declares the same set of per-user fields twice, once under
# the 'u1_' prefix and once under 'u2_'. Identifier-like fields are read as
# str, numeric graph metrics as float.
user_field_dtypes = {
    'agentid': str,
    'reference': str,
    'in_degree': float,
    'out_degree': float,
    'page_rank': float,
    'community_id': str,
    'category': str,
    'status': str,
    'user_type': str,
}

# Fields that occur once per row, without a user prefix.
column_dtypes = {
    'tranid': str,
    'amount': float,
    'tran_type': str,
}
for side_prefix in ('u1_', 'u2_'):
    for field_name, field_dtype in user_field_dtypes.items():
        column_dtypes[side_prefix + field_name] = field_dtype

# The first line of the file is the header row.
data = pd.read_csv('export.csv', header=0, dtype=column_dtypes)

In [3]:
# Missing value treatment for both user sides (u1 and u2).
# Numeric graph metrics fall back to 0; descriptive attributes fall back to
# the literal string 'unknown'.
fill_values = {
    'in_degree': 0,
    'out_degree': 0,
    'page_rank': 0,
    'community_id': 'unknown',
    'category': 'unknown',
    'status': 'unknown',
    'user_type': 'unknown',
}
for prefix in ('u1_', 'u2_'):
    for feature, fill_value in fill_values.items():
        column = prefix + feature
        # Assign the filled Series back so `data` ends up without NaN in
        # this column.
        data[column] = data[column].fillna(fill_value)

In [4]:
# Attributes kept for each user; the same list applies to both prefixes.
agent_fields = [
    'agentid',
    'in_degree',
    'out_degree',
    'page_rank',
    'community_id',
    'category',
    'status',
    'user_type',
]

# Select the 'u1_'-prefixed columns and relabel them without the prefix so
# both sides end up with identical column names.
u1 = data[['u1_' + field for field in agent_fields]]
u1.columns = agent_fields

# Same selection and relabelling for the 'u2_'-prefixed columns.
u2 = data[['u2_' + field for field in agent_fields]]
u2.columns = agent_fields

### Extract user set

In [5]:
# Stack the u1 rows on top of the u2 rows, then keep one row per agentid.
# With keep='first', an agent that appears on both sides keeps its u1 row.
# Row labels from the source frames are retained (the index is not reset).
all_users = pd.concat([u1, u2])
agent = all_users.drop_duplicates(subset=['agentid'], keep='first')

In [6]:
# Preview the first rows of the deduplicated agent table.
agent.head()

Unnamed: 0,agentid,in_degree,out_degree,page_rank,community_id,category,status,user_type
0,46695821,0.0,0.0,0.0,unknown,agent,3,0
2,28379698,0.0,0.0,0.0,unknown,agent,3,0
3,38138719,0.0,0.0,0.0,unknown,agent,3,1
4,49597729,0.0,0.0,0.0,unknown,agent,3,0
5,36266096,0.0,0.0,0.0,unknown,agent,3,1


### Extract transaction

In [7]:
# Columns that describe a transaction: the two agent ids and references,
# plus the transaction id, amount and type.
transaction_columns = [
    'u1_agentid',
    'u1_reference',
    'tranid',
    'amount',
    'tran_type',
    'u2_agentid',
    'u2_reference',
]
transaction = data.loc[:, transaction_columns]


In [8]:
# Preview the first rows of the transaction table.
transaction.head()

Unnamed: 0,u1_agentid,u1_reference,tranid,amount,tran_type,u2_agentid,u2_reference
0,46695821,vttimathe_imedia,6009421601,2800.0,cashback,38157836,906039753
1,46695821,vttimathe_imedia,6009479223,28000.0,billpay,38157836,906039753
2,28379698,cktopupmm_mbp,5969281259,1750.0,bonus,38157836,906039753
3,38138719,vms2.airtime,5969158393,50000.0,buy,38157836,906039753
4,49597729,m4becomrecess,6017694536,35316.0,m4bpay,4590187,979726690


### Extract product set

In [12]:
# One row per distinct tran_type value, in order of first appearance,
# stored in a single column named 'product'.
distinct_tran_types = transaction['tran_type'].unique()
product = pd.DataFrame({'product': distinct_tran_types})

In [14]:
# Preview the first rows of the product table.
product.head()

Unnamed: 0,product
0,cashback
1,billpay
2,bonus
3,buy
4,m4bpay


### Write CSV

In [9]:
# Persist the extracted tables as CSV files under 'transformed_data/'.
# DataFrame.to_csv does not create missing parent folders, so create the
# output directory first; otherwise a run from a fresh checkout fails here.
output_dir = Path('transformed_data')
output_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the pandas row labels out of the files.
transaction.to_csv(output_dir / 'transaction.csv', index=False)
agent.to_csv(output_dir / 'agent.csv', index=False)
# NOTE(review): the `product` frame built in an earlier cell is not written
# anywhere in this notebook - confirm whether it should be exported too.