In [1]:
import featuretools as ft
import pandas as pd

In [2]:
offline_df = pd.read_csv('../source/ccf_offline_stage1_train.csv', parse_dates = ['Date_received', 'Date'])

In [27]:
def create_entity(name, user, coupon, merchant, purchase):
    es = ft.EntitySet(id = name)
    es = es.entity_from_dataframe(entity_id = 'coupon', dataframe = coupon, index='Coupon_id', variable_types = {'Coupon_id': ft.variable_types.Index,'Merchant_id': ft.variable_types.Id})

    purchase_variable_types = {'Purchase_id': ft.variable_types.Index,'User_id': ft.variable_types.Id,'Coupon_id': ft.variable_types.Id,}
    es = es.entity_from_dataframe(entity_id = 'purchase', dataframe = purchase, index = 'Purchase_id', variable_types = purchase_variable_types)

    es = es.entity_from_dataframe(entity_id = 'user', dataframe = user, index='User_id',variable_types = {'User_id': ft.variable_types.Index})
    es = es.entity_from_dataframe(entity_id = 'merchant', dataframe = merchant, index='Merchant_id', variable_types = {'Merchant_id': ft.variable_types.Index})
    
    r_user_purchase = ft.Relationship(es['user']['User_id'], es['purchase']['User_id'])
    es = es.add_relationship(r_user_purchase)

    r_purchase_coupon = ft.Relationship(es['coupon']['Coupon_id'], es['purchase']['Coupon_id'])
    es = es.add_relationship(r_purchase_coupon)

    r_merchant_coupon = ft.Relationship(es['merchant']['Merchant_id'], es['coupon']['Merchant_id'])
    es = es.add_relationship(r_merchant_coupon)
    
    return es

In [4]:
def dfs(entity_name, es_raw, agg_primitives_fn=[], trans_primitives_fn=[]):
    return ft.dfs(entityset = es_raw, n_jobs = 4, target_entity = entity_name, max_depth=2, agg_primitives=agg_primitives_fn, trans_primitives=trans_primitives_fn)

In [5]:
purchase = offline_df.copy()
purchase = purchase[purchase['Coupon_id'] > 0]

purchase['Duration'] = purchase['Date'] - purchase['Date_received']
purchase = purchase.reset_index()
purchase = purchase.rename(index=str, columns={"index": "Purchase_id"})

user = purchase[['User_id']].drop_duplicates()
merchant = purchase[['Merchant_id']].drop_duplicates()
coupon = purchase[['Coupon_id', 'Merchant_id', 'Discount_rate']].drop_duplicates()
purchase = purchase[['User_id', 'Coupon_id','Date_received', 'Date', 'Duration', 'Distance']]

es = create_entity('o2o', user, coupon, merchant, purchase)



In [6]:
features, _ = dfs('purchase', es, trans_primitives_fn=['day', 'week', 'month'])

purchase = pd.merge(purchase, features[['DAY(Date_received)', 'WEEK(Date_received)', 'MONTH(Date_received)', 'DAY(Duration)']], on='Purchase_id', how='left')
purchase = purchase.drop(['Duration'], axis=1)
purchase['Distance'] = purchase['Distance'].fillna(-1)
purchase['Distance'] = purchase['Distance'] + 1
purchase = purchase.fillna(0)

purchase['Label'] = purchase.apply(lambda row: 1 if row['DAY(Duration)']< 17 and row['DAY(Duration)']>0 else 0, axis=1)
purchase['Is_used_coupon'] = purchase.apply(lambda row: 1 if row['DAY(Duration)']> 0 else 0, axis=1)

purchase.head(5)

tornado.application - ERROR - Exception in Future <Future cancelled> after timeout
Traceback (most recent call last):
  File "/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/tornado/gen.py", line 970, in error_callback
    future.result()
concurrent.futures._base.CancelledError


Unnamed: 0,Purchase_id,User_id,Coupon_id,Date_received,Date,Distance,DAY(Date_received),WEEK(Date_received),MONTH(Date_received),DAY(Duration),Label,Is_used_coupon
0,0,1439408,11002.0,2016-05-28,0,2.0,28,21,5,0.0,0,0
1,1,1439408,8591.0,2016-02-17,0,1.0,17,7,2,0.0,0,0
2,2,1439408,1078.0,2016-03-19,0,1.0,19,11,3,0.0,0,0
3,3,1439408,8591.0,2016-06-13,0,1.0,13,24,6,0.0,0,0
4,4,1439408,8591.0,2016-05-16,2016-06-13 00:00:00,1.0,16,20,5,29.0,0,1


In [21]:
coupon['Base_consume'] = 0.0
coupon['Discount'] = 0.0
coupon['Discount_money'] = 0.0
coupon['Coupon_type'] = 0

def cal(row):
    if isinstance(row.Discount_rate, float):
        coupon.Discount = row.Discount_rate
        return row
    
    arr = row.Discount_rate.split(':')
    if len(arr) == 2:
        row.Discount =  (float(arr[0]) - float(arr[1])) / float(arr[0])
        row.Coupon_type = 1
        row.Base_consume = float(arr[0])
        row.Discount_money = float(arr[1])
    else:
        coupon.Discount = float(row.Discount_rate)
            
    return row

coupon = coupon.apply(lambda row: cal(row), axis=1)
coupon = coupon.drop(['Discount_rate'], axis=1)
coupon.head(10)



Unnamed: 0,Coupon_id,Merchant_id,Base_consume,Discount,Discount_money,Coupon_type
0,11002.0,4663,150.0,0.866667,20.0,1
1,8591.0,2632,20.0,0.95,1.0,1
2,1078.0,2632,20.0,0.95,1.0,1
5,7610.0,3381,200.0,0.9,20.0,1
6,11951.0,3381,200.0,0.9,20.0,1
7,1532.0,450,30.0,0.833333,5.0,1
8,12737.0,6459,20.0,0.95,1.0,1
9,1097.0,6901,50.0,0.8,10.0,1
10,10698.0,1579,20.0,0.95,1.0,1
11,9776.0,3381,10.0,0.5,5.0,1


### 用户特征

In [28]:
es = create_entity('o2o_base', user, coupon, merchant, purchase)

In [29]:
user_features, user_feature_names = dfs('user', es, agg_primitives_fn = ['min', 'max', 'mean', 'mode'])

Exception ignored in: <generator object Scheduler.add_client at 0x10cc2fe58>
RuntimeError: generator ignored GeneratorExit
tornado.application - ERROR - Exception in Future <Future cancelled> after timeout
Traceback (most recent call last):
  File "/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/tornado/gen.py", line 970, in error_callback
    future.result()
concurrent.futures._base.CancelledError


In [30]:
user_features.columns.values

array(['MIN(purchase.Distance)', 'MIN(purchase.DAY(Date_received))',
       'MIN(purchase.WEEK(Date_received))',
       'MIN(purchase.MONTH(Date_received))',
       'MIN(purchase.DAY(Duration))', 'MIN(purchase.Label)',
       'MIN(purchase.Is_used_coupon)', 'MAX(purchase.Distance)',
       'MAX(purchase.DAY(Date_received))',
       'MAX(purchase.WEEK(Date_received))',
       'MAX(purchase.MONTH(Date_received))',
       'MAX(purchase.DAY(Duration))', 'MAX(purchase.Label)',
       'MAX(purchase.Is_used_coupon)', 'MEAN(purchase.Distance)',
       'MEAN(purchase.DAY(Date_received))',
       'MEAN(purchase.WEEK(Date_received))',
       'MEAN(purchase.MONTH(Date_received))',
       'MEAN(purchase.DAY(Duration))', 'MEAN(purchase.Label)',
       'MEAN(purchase.Is_used_coupon)', 'MODE(purchase.Date)',
       'MODE(purchase.Coupon_id)', 'MIN(purchase.coupon.Base_consume)',
       'MIN(purchase.coupon.Discount)',
       'MIN(purchase.coupon.Discount_money)',
       'MIN(purchase.coupon.Coupon_typ

In [20]:
user_features_df = user.copy()

columns = [
    'MIN(purchase.Distance)', 
    'MIN(purchase.DAY(Date_received))',
    'MIN(purchase.WEEK(Date_received))',
    'MIN(purchase.MONTH(Date_received))',
    'MIN(purchase.DAY(Duration))', 'MIN(purchase.Label)',
    'MIN(purchase.Is_used_coupon)', 'MAX(purchase.Distance)',
    'MAX(purchase.DAY(Date_received))',
    'MAX(purchase.WEEK(Date_received))',
    'MAX(purchase.MONTH(Date_received))',
    'MAX(purchase.DAY(Duration))', 'MAX(purchase.Label)',
    'MAX(purchase.Is_used_coupon)', 'MEAN(purchase.Distance)',
    'MEAN(purchase.DAY(Date_received))',
    'MEAN(purchase.WEEK(Date_received))',
    'MEAN(purchase.MONTH(Date_received))',
    'MEAN(purchase.DAY(Duration))', 'MEAN(purchase.Label)',
    'MEAN(purchase.Is_used_coupon)', 'MODE(purchase.Date)',
    'MODE(purchase.Coupon_id)', 'MIN(purchase.coupon.Base_consume)',
    'MIN(purchase.coupon.Discount)',
    'MIN(purchase.coupon.Discount_money)',
    'MIN(purchase.coupon.Coupon_type)',
    'MAX(purchase.coupon.Base_consume)',
    'MAX(purchase.coupon.Discount)',
    'MAX(purchase.coupon.Discount_money)',
    'MAX(purchase.coupon.Coupon_type)',
    'MEAN(purchase.coupon.Base_consume)',
    'MEAN(purchase.coupon.Discount)',
    'MEAN(purchase.coupon.Discount_money)',
    'MEAN(purchase.coupon.Coupon_type)',
    'MODE(purchase.coupon.Discount_rate)',
    'MODE(purchase.coupon.Merchant_id)'
]

user_features_df = pd.merge(user_features_df, features[columns], on='User_id', how='left')

array(['MIN(purchase.Distance)', 'MIN(purchase.DAY(Date_received))',
       'MIN(purchase.WEEK(Date_received))',
       'MIN(purchase.MONTH(Date_received))',
       'MIN(purchase.DAY(Duration))', 'MIN(purchase.Label)',
       'MIN(purchase.Is_used_coupon)', 'MAX(purchase.Distance)',
       'MAX(purchase.DAY(Date_received))',
       'MAX(purchase.WEEK(Date_received))',
       'MAX(purchase.MONTH(Date_received))',
       'MAX(purchase.DAY(Duration))', 'MAX(purchase.Label)',
       'MAX(purchase.Is_used_coupon)', 'MEAN(purchase.Distance)',
       'MEAN(purchase.DAY(Date_received))',
       'MEAN(purchase.WEEK(Date_received))',
       'MEAN(purchase.MONTH(Date_received))',
       'MEAN(purchase.DAY(Duration))', 'MEAN(purchase.Label)',
       'MEAN(purchase.Is_used_coupon)', 'MODE(purchase.Date)',
       'MODE(purchase.Coupon_id)', 'MIN(purchase.coupon.Base_consume)',
       'MIN(purchase.coupon.Discount)',
       'MIN(purchase.coupon.Discount_money)',
       'MIN(purchase.coupon.Coupon_typ

### 抽取特征

In [5]:
entity_name = 'user'
agg_primitives_fn = ['min', 'max', 'mean', 'mode']
trans_primitives_fn = []

features, feature_names = ft.dfs(entityset = es, n_jobs = 4, target_entity = entity_name, max_depth=2, agg_primitives=agg_primitives_fn, trans_primitives=trans_primitives_fn)

Exception ignored in: <generator object Scheduler.add_client at 0x124a88c78>
RuntimeError: generator ignored GeneratorExit
tornado.application - ERROR - Exception in Future <Future cancelled> after timeout
Traceback (most recent call last):
  File "/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/tornado/gen.py", line 970, in error_callback
    future.result()
concurrent.futures._base.CancelledError


In [6]:
user_features_df = user.copy()

columns = [
    'MIN(purchase.Month_in_year)', 
    'MIN(purchase.Day_in_month)',
    'MIN(purchase.Day_in_week)', 
    'MIN(purchase.Distance)', 
    'MAX(purchase.Month_in_year)',
    'MAX(purchase.Day_in_month)',
    'MAX(purchase.Day_in_week)', 
    'MAX(purchase.Distance)', 
    'MEAN(purchase.Month_in_year)', 
    'MEAN(purchase.Day_in_month)',
    'MEAN(purchase.Day_in_week)', 
    'MEAN(purchase.Distance)', 
    'MIN(purchase.coupon.Discount)',
    'MIN(purchase.coupon.Base_consume)',
    'MIN(purchase.coupon.Discount_money)',
    'MAX(purchase.coupon.Discount)',
    'MAX(purchase.coupon.Base_consume)',
    'MAX(purchase.coupon.Discount_money)',
    'MEAN(purchase.coupon.Discount)',
    'MEAN(purchase.coupon.Base_consume)',
    'MEAN(purchase.coupon.Discount_money)',
]

user_features_df = pd.merge(user_features_df, features[columns], on='User_id', how='left')

In [7]:
entity_name = 'user'
agg_primitives_fn = ['sum', 'count']
trans_primitives_fn = ['divide', 'negate']

features, feature_names = ft.dfs(entityset = es, n_jobs = 4, target_entity = entity_name, max_depth=2, agg_primitives=agg_primitives_fn, trans_primitives=trans_primitives_fn)

tornado.application - ERROR - Exception in Future <Future cancelled> after timeout
Traceback (most recent call last):
  File "/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/tornado/gen.py", line 970, in error_callback
    future.result()
concurrent.futures._base.CancelledError


In [8]:
columns = [
    'SUM(purchase.Is_used_coupon)',
    'SUM(purchase.Is_in_day_consume)',
    'COUNT(purchase)',
    'SUM(purchase.Is_in_day_consume) / COUNT(purchase)',
    'SUM(purchase.Is_used_coupon) / COUNT(purchase)',
    'SUM(purchase.Is_in_day_consume) / SUM(purchase.Is_used_coupon)'
]

user_features_df = pd.merge(user_features_df, features[columns], on='User_id', how='left')

user_features_df['COUNT(purchase) - SUM(purchase.Is_used_coupon)'] = user_features_df['COUNT(purchase)'] - user_features_df['SUM(purchase.Is_used_coupon)']
user_features_df['COUNT(purchase) - SUM(purchase.Is_in_day_consume)'] = user_features_df['COUNT(purchase)'] - user_features_df['SUM(purchase.Is_in_day_consume)']
user_features_df['SUM(purchase.Is_used_coupon) - SUM(purchase.Is_in_day_consume)'] = user_features_df['SUM(purchase.Is_used_coupon)'] - user_features_df['SUM(purchase.Is_in_day_consume)']

user_features_df['(COUNT(purchase) - SUM(purchase.Is_used_coupon))/COUNT(purchase)'] = user_features_df['COUNT(purchase) - SUM(purchase.Is_used_coupon)'] / user_features_df['COUNT(purchase)']
user_features_df['(COUNT(purchase) - SUM(purchase.Is_in_day_consume))/COUNT(purchase)'] = user_features_df['COUNT(purchase) - SUM(purchase.Is_in_day_consume)'] / user_features_df['COUNT(purchase)']
user_features_df['(SUM(purchase.Is_used_coupon) - SUM(purchase.Is_in_day_consume))/SUM(purchase.Is_used_coupon)'] = user_features_df['SUM(purchase.Is_used_coupon) - SUM(purchase.Is_in_day_consume)'] / user_features_df['SUM(purchase.Is_used_coupon)']

In [16]:
user_columns_df = pd.DataFrame()
user_columns_df['column_name'] = user_features_df.columns.values
user_columns_df['type'] = user_features_df.dtypes.values
user_columns_df

Unnamed: 0,column_name,type
0,User_id,int64
1,MIN(purchase.Month_in_year),float64
2,MIN(purchase.Day_in_month),float64
3,MIN(purchase.Day_in_week),float64
4,MIN(purchase.Distance),float64
5,MAX(purchase.Month_in_year),float64
6,MAX(purchase.Day_in_month),float64
7,MAX(purchase.Day_in_week),float64
8,MAX(purchase.Distance),float64
9,MEAN(purchase.Month_in_year),float64


In [16]:
user = agg('user', user, 'User_id', 'User:')
# user = user.drop([
#     'User:SUM(purchase.Month_in_year)',
#     'User:SUM(purchase.Day_in_month)',
#     'User:SUM(purchase.Day_in_week)',
#     'User:SUM(purchase.Distance)',
#     'User:MEAN(purchase.Month_in_year)',
#     'User:MEAN(purchase.Day_in_month)',
#     'User:MEAN(purchase.Day_in_week)',
#     'User:MODE(purchase.Coupon_id)',
#     'User:SUM(purchase.coupon.Discount)',
#     'User:SUM(purchase.coupon.Coupon_type)',
#     'User:SUM(purchase.coupon.Base_consume)',
#     'User:SUM(purchase.coupon.Discount_money)',
#     'User:MEAN(purchase.coupon.Coupon_type)',
#     'User:MODE(purchase.coupon.Merchant_id)'
# ], axis=1)

Future exception was never retrieved
future: <Future finished exception=StreamClosedError('Stream is closed')>
Traceback (most recent call last):
  File "/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/tornado/tcpclient.py", line 112, in on_connect_done
    stream = future.result()
tornado.iostream.StreamClosedError: Stream is closed
Exception ignored in: <generator object Scheduler.add_client at 0x13ba43750>
RuntimeError: generator ignored GeneratorExit
tornado.application - ERROR - Exception in Future <Future cancelled> after timeout
Traceback (most recent call last):
  File "/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/tornado/gen.py", line 970, in error_callback
    future.result()
concurrent.futures._base.CancelledError


In [None]:
user.columns.values

In [11]:
merchant = agg('merchant', merchant, 'Merchant_id', 'Merchant:')
merchant = merchant.drop([
    'Merchant:SUM(coupon.Discount)',
    'Merchant:SUM(coupon.Coupon_type)',
    'Merchant:SUM(coupon.Base_consume)',
    'Merchant:SUM(coupon.Discount_money)',
    'Merchant:MEAN(coupon.Coupon_type)',
    'Merchant:SUM(purchase.Month_in_year)',
    'Merchant:SUM(purchase.Day_in_month)',
    'Merchant:SUM(purchase.Day_in_week)',
    'Merchant:SUM(purchase.Distance)',
    'Merchant:MEAN(purchase.Month_in_year)',
    'Merchant:MEAN(purchase.Day_in_month)',
    'Merchant:MEAN(purchase.Day_in_week)',
    'Merchant:SUM(coupon.MIN(purchase.Month_in_year))',
    'Merchant:SUM(coupon.MIN(purchase.Day_in_month))',
    'Merchant:SUM(coupon.MIN(purchase.Day_in_week))',
    'Merchant:SUM(coupon.MIN(purchase.Distance))',
    'Merchant:SUM(coupon.MAX(purchase.Month_in_year))',
    'Merchant:SUM(coupon.MAX(purchase.Day_in_month))',
    'Merchant:SUM(coupon.MAX(purchase.Day_in_week))',
    'Merchant:SUM(coupon.MAX(purchase.Distance))',
    'Merchant:SUM(coupon.MEAN(purchase.Month_in_year))',
    'Merchant:SUM(coupon.MEAN(purchase.Day_in_month))',
    'Merchant:SUM(coupon.MEAN(purchase.Day_in_week))',
    'Merchant:SUM(coupon.MEAN(purchase.Distance))',
    'Merchant:MEAN(coupon.SUM(purchase.Month_in_year))',
    'Merchant:MEAN(coupon.SUM(purchase.Day_in_month))',
    'Merchant:MEAN(coupon.SUM(purchase.Day_in_week))',
    'Merchant:MEAN(coupon.SUM(purchase.Distance))',
    'Merchant:MEAN(coupon.MIN(purchase.Month_in_year))',
    'Merchant:MEAN(coupon.MIN(purchase.Day_in_month))',
    'Merchant:MEAN(coupon.MIN(purchase.Day_in_week))',
    'Merchant:MEAN(coupon.MAX(purchase.Month_in_year))',
    'Merchant:MEAN(coupon.MAX(purchase.Day_in_month))',
    'Merchant:MEAN(coupon.MAX(purchase.Day_in_week))',
    'Merchant:MEAN(coupon.MEAN(purchase.Month_in_year))',
    'Merchant:MEAN(coupon.MEAN(purchase.Day_in_month))',
    'Merchant:MEAN(coupon.MEAN(purchase.Day_in_week))',
    'Merchant:MODE(coupon.MODE(purchase.User_id))'
], axis=1)

tornado.application - ERROR - Exception in Future <Future cancelled> after timeout
Traceback (most recent call last):
  File "/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/tornado/gen.py", line 970, in error_callback
    future.result()
concurrent.futures._base.CancelledError


In [12]:
coupon = agg('coupon', coupon, 'Coupon_id', 'Coupon:')
coupon = coupon.drop([
    'Coupon:SUM(purchase.Month_in_year)',
    'Coupon:SUM(purchase.Day_in_month)',
    'Coupon:SUM(purchase.Day_in_week)',
    'Coupon:SUM(purchase.Distance)',
    'Coupon:MEAN(purchase.Month_in_year)',
    'Coupon:MEAN(purchase.Day_in_month)',
    'Coupon:MEAN(purchase.Day_in_week)',
    'Coupon:MODE(purchase.User_id)',
    'Coupon:merchant.SUM(coupon.Discount)',
    'Coupon:merchant.SUM(coupon.Coupon_type)',
    'Coupon:merchant.SUM(coupon.Base_consume)',
    'Coupon:merchant.SUM(coupon.Discount_money)',
    'Coupon:merchant.MEAN(coupon.Coupon_type)',
    'Coupon:merchant.SUM(purchase.Month_in_year)',
    'Coupon:merchant.SUM(purchase.Day_in_month)',
    'Coupon:merchant.SUM(purchase.Day_in_week)',
    'Coupon:merchant.SUM(purchase.Distance)',
    'Coupon:merchant.MEAN(purchase.Month_in_year)',
    'Coupon:merchant.MEAN(purchase.Day_in_month)',
    'Coupon:merchant.MEAN(purchase.Day_in_week)',
    'Coupon:merchant.MODE(purchase.User_id)'
], axis=1)

tornado.application - ERROR - Exception in Future <Future cancelled> after timeout
Traceback (most recent call last):
  File "/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/tornado/gen.py", line 970, in error_callback
    future.result()
concurrent.futures._base.CancelledError


In [13]:
purchase = agg('purchase', _, _, 'Purchase:', is_merge=False)
purchase = purchase.drop([
    'Purchase:user.SUM(purchase.Month_in_year)',
    'Purchase:user.SUM(purchase.Day_in_month)',
    'Purchase:user.SUM(purchase.Day_in_week)',
    'Purchase:user.SUM(purchase.Distance)',
    'Purchase:user.MEAN(purchase.Month_in_year)',
    'Purchase:user.MEAN(purchase.Day_in_month)',
    'Purchase:user.MEAN(purchase.Day_in_week)',
    'Purchase:user.MODE(purchase.Coupon_id)',
    'Purchase:coupon.SUM(purchase.Month_in_year)',
    'Purchase:coupon.SUM(purchase.Day_in_month)',
    'Purchase:coupon.SUM(purchase.Day_in_week)',
    'Purchase:coupon.SUM(purchase.Distance)',
    'Purchase:coupon.MEAN(purchase.Month_in_year)',
    'Purchase:coupon.MEAN(purchase.Day_in_month)',
    'Purchase:coupon.MEAN(purchase.Day_in_week)',
    'Purchase:coupon.MODE(purchase.User_id)'
], axis=1)

Exception ignored in: <generator object Scheduler.add_client at 0x10ec58660>
RuntimeError: generator ignored GeneratorExit
tornado.application - ERROR - Exception in Future <Future cancelled> after timeout
Traceback (most recent call last):
  File "/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/tornado/gen.py", line 970, in error_callback
    future.result()
concurrent.futures._base.CancelledError


In [14]:
df = purchase.copy()
df = df.rename(index=str, columns={
    'Purchase:User_id': 'User_id',
    'Purchase:Coupon_id': 'Coupon_id',
    'Purchase:coupon.Merchant_id': 'Merchant_id'
})

df = pd.merge(df, user, on='User_id', how='left')
df = pd.merge(df, merchant, on='Merchant_id', how='left')
df = pd.merge(df, coupon, on='Coupon_id', how='left')
df['Label'] = label.values
df = df[df['Purchase:Month_in_year'] >= 5]
df.to_csv('../features/lcm_valid_dataset.csv', index=False, header=True)

In [15]:
df.head(10)

Unnamed: 0,Purchase:Month_in_year,Purchase:Day_in_month,Purchase:Day_in_week,Purchase:Distance,User_id,Coupon_id,Purchase:coupon.Discount,Purchase:coupon.Coupon_type,Purchase:coupon.Base_consume,Purchase:coupon.Discount_money,...,Coupon:merchant.MIN(purchase.Month_in_year),Coupon:merchant.MIN(purchase.Day_in_month),Coupon:merchant.MIN(purchase.Day_in_week),Coupon:merchant.MIN(purchase.Distance),Coupon:merchant.MAX(purchase.Month_in_year),Coupon:merchant.MAX(purchase.Day_in_month),Coupon:merchant.MAX(purchase.Day_in_week),Coupon:merchant.MAX(purchase.Distance),Coupon:merchant.MEAN(purchase.Distance),Label
0,5.0,28.0,6.0,2.0,1439408,11002.0,0.866667,1.0,150.0,20.0,...,1.0,1.0,1.0,0.0,6.0,31.0,7.0,11.0,4.116911,0.0
3,6.0,13.0,1.0,1.0,1439408,8591.0,0.95,1.0,20.0,1.0,...,1.0,1.0,1.0,0.0,6.0,31.0,7.0,11.0,1.906977,0.0
4,5.0,16.0,1.0,1.0,1439408,8591.0,0.95,1.0,20.0,1.0,...,1.0,1.0,1.0,0.0,6.0,31.0,7.0,11.0,1.906977,0.0
7,5.0,30.0,1.0,1.0,2029232,1532.0,0.833333,1.0,30.0,5.0,...,1.0,1.0,1.0,0.0,6.0,31.0,7.0,11.0,4.766949,0.0
8,5.0,19.0,4.0,1.0,2029232,12737.0,0.95,1.0,20.0,1.0,...,5.0,1.0,1.0,0.0,6.0,27.0,7.0,5.0,1.5,0.0
9,6.0,6.0,1.0,0.0,2747744,1097.0,0.8,1.0,50.0,10.0,...,3.0,1.0,1.0,0.0,6.0,31.0,7.0,11.0,1.997191,0.0
10,6.0,6.0,1.0,2.0,196342,10698.0,0.95,1.0,20.0,1.0,...,4.0,2.0,1.0,0.0,6.0,29.0,7.0,6.0,2.0,0.0
16,5.0,18.0,3.0,1.0,253750,2366.0,0.833333,1.0,30.0,5.0,...,3.0,1.0,1.0,0.0,6.0,31.0,7.0,11.0,1.997191,0.0
18,5.0,28.0,6.0,0.0,343660,11002.0,0.866667,1.0,150.0,20.0,...,1.0,1.0,1.0,0.0,6.0,31.0,7.0,11.0,4.116911,0.0
21,5.0,24.0,2.0,1.0,1113008,2705.0,0.75,1.0,20.0,5.0,...,1.0,1.0,1.0,0.0,6.0,31.0,7.0,11.0,1.699119,0.0


In [12]:
# List the primitives in a dataframe
primitives = ft.list_primitives()
pd.options.display.max_colwidth = 100

primitives[primitives['type'] == 'transform']

Unnamed: 0,name,type,description
19,diff,transform,Compute the difference between the value of a base feature and the previous value.
20,weekday,transform,Transform Datetime feature into the boolean of Weekday.
21,second,transform,Transform a Datetime feature into the second.
22,and,transform,"For two boolean values, determine if both values are 'True'."
23,days_since,transform,"For each value of the base feature, compute the number of days between it"
24,years,transform,Transform a Timedelta feature into the number of years.
25,hour,transform,Transform a Datetime feature into the hour.
26,months,transform,Transform a Timedelta feature into the number of months.
27,day,transform,Transform a Datetime feature into the day.
28,haversine,transform,Calculate the approximate haversine distance in miles between two LatLong variable types.
