In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.model_selection import GridSearchCV
from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector
from pyfm import pylibfm
from sklearn.impute import SimpleImputer
from datetime import datetime
from sklearn.decomposition import PCA

In [2]:
offline_df = pd.read_csv('ccf_offline_stage1_train.csv')

In [3]:
train_dataset = offline_df[offline_df['Date_received']<20160501]

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

class DayInMonth4ReceivedDayExtractor(TransformerMixin):
    def get_day_in_month_4_received_day(self, received_date):
        if math.isnan(received_date) or isinstance(received_date, int) or float(received_date) <= 0:
            return 0.0

        date_received = datetime.strptime(str(int(received_date)), '%Y%m%d')
        return date_received.day
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **transform_params):
        return pd.DataFrame(X['Date_received'].apply(lambda row: self.get_day_in_month_4_received_day(row)))

In [3]:
class DayInWeek4ReceivedDayExtractor(TransformerMixin):
    def get_day_in_week_4_received_day(self, received_date):
        if math.isnan(received_date) or isinstance(received_date, int) or float(received_date) <= 0:
            return 0.0

        date_received = datetime.strptime(str(int(received_date)), '%Y%m%d')
        return (date_received.weekday() + 1)
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **transform_params):
        return pd.DataFrame(X['Date_received'].apply(lambda row: self.get_day_in_week_4_received_day(row)))

In [4]:
class BaseConsumeExtractor(TransformerMixin):
    def base_consume(self, discount_rate):
        if isinstance(discount_rate, int):
            return float(discount_rate)

        if isinstance(discount_rate, float):
            return discount_rate

        if discount_rate == 'fixed':
            return 0.0

        arr = discount_rate.split(':')
        if len(arr) == 2:
            return float(arr[0])
        else:
            return 0.0
        
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **transform_params):
        return pd.DataFrame(X['Discount_rate'].apply(lambda row: self.base_consume(row)))

In [5]:
class DiscountExtractor(TransformerMixin):
    def cal_discount(self, discount_rate):
        if isinstance(discount_rate, int):
            return float(discount_rate)

        if isinstance(discount_rate, float):
            return discount_rate

        if discount_rate == 'fixed':
            return 0.0

        arr = discount_rate.split(':')
        if len(arr) == 2:
            return (float(arr[0]) - float(arr[1])) / float(arr[0])
        else:
            return float(discount_rate)
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **transform_params):
        return pd.DataFrame(X['Discount_rate'].apply(lambda row: self.cal_discount(row)))

In [6]:
class CouponTypeExtractor(TransformerMixin):
    def set_coupon_type(self, discount_rate):
        if isinstance(discount_rate, int):
            return 1

        if isinstance(discount_rate, float):
            return 1

        if discount_rate == 'fixed':
            return 2

        arr = discount_rate.split(':')
        if len(arr) == 2:
            return 1
        else:
            return 0
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **transform_params):
        return pd.DataFrame(X['Discount_rate'].apply(lambda row: self.set_coupon_type(row)))

In [7]:
class LabelExtractor(TransformerMixin):
    def check_is_in_day_consume(self, row):
        if row['Coupon_id'] == 'fixed':
            return 0

        if float(row['Coupon_id']) > 0 and float(row['Date_received']) > 0 and float(row['Date']) > 0:
            date_received = datetime.strptime(str(int(row['Date_received'])), '%Y%m%d')
            date_consumed = datetime.strptime(str(int(row['Date'])), '%Y%m%d')
            delta = date_consumed - date_received
            if delta.days < 16:
                return 1
            else:
                return 0

        return 0
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, **transform_params):
        return X.apply(lambda row: self.check_is_in_day_consume(row), axis=1)

In [8]:
from sklearn.feature_extraction import DictVectorizer

class DictTransformer(TransformerMixin):
    
    def __init__(self):
        self.vectorizer = DictVectorizer()
    
    def fit(self, X, y=None, **fit_params):
        self.vectorizer.fit(pd.DataFrame(X).T.to_dict().values())
        return self
    
    def transform(self, X, **transform_params):
        return self.vectorizer.transform(pd.DataFrame(X).T.to_dict().values())

### 使用Pipeline构建特征的抽取和模型训练过程

In [11]:
FACTOR_FIELDS = ['Distance']

pipeline = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('base_consume', BaseConsumeExtractor()),
            ('discount', DiscountExtractor()),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
            ('scale', Normalizer())
        ])),
        ('distance', Pipeline([
            ('extract', ColumnSelector(FACTOR_FIELDS)),
            ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
            ('one_hot', OneHotEncoder(categories='auto')),
            ('to_dense', DenseTransformer())
        ])),
        ('coupon_type', Pipeline([
            ('extract', CouponTypeExtractor()),
            ('one_hot', OneHotEncoder(categories='auto')),
            ('to_dense', DenseTransformer())
        ])),
        ('day_in_week', Pipeline([
            ('extract', DayInWeek4ReceivedDayExtractor()),
            ('one_hot', OneHotEncoder(categories='auto')),
            ('to_dense', DenseTransformer())
        ])),
        ('day_in_month', Pipeline([
            ('extract', DayInMonth4ReceivedDayExtractor()),
            ('one_hot', OneHotEncoder(categories='auto')),
            ('to_dense', DenseTransformer())
        ])),
    ])),
    ('dict', DictTransformer()),
    ('fm', pylibfm.FM(num_factors=128, num_iter=10, verbose=True, task="classification", initial_learning_rate=0.0001, learning_rate_schedule="optimal"))
])

labelextractor = LabelExtractor()
label_dataset = labelextractor.transform(train_dataset)

pipeline.fit(train_dataset, label_dataset)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training log loss: 0.18203
-- Epoch 2
Training log loss: 0.16962
-- Epoch 3
Training log loss: 0.16803
-- Epoch 4
Training log loss: 0.16726
-- Epoch 5
Training log loss: 0.16680
-- Epoch 6
Training log loss: 0.16653
-- Epoch 7
Training log loss: 0.16634
-- Epoch 8
Training log loss: 0.16622
-- Epoch 9
Training log loss: 0.16612
-- Epoch 10
Training log loss: 0.16605


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('continuous', Pipeline(memory=None,
     steps=[('base_consume', <__main__.BaseConsumeExtractor object at 0x10d986400>), ('discount', <__main__.DiscountExtractor object at 0x10d986470>), ('imputer', SimpleImputer(copy=True, fill... <__main__.DictTransformer object at 0x10d986940>), ('fm', <pyfm.pylibfm.FM object at 0x10d9869b0>)])

In [12]:
test_dataset = offline_df[offline_df['Date_received']>=20160501]
test_dataset = test_dataset[test_dataset['Coupon_id']>0]
test_label_dataset = labelextractor.transform(test_dataset)

prediction = pipeline.predict(test_dataset)

### 使用特征工程训练好的数据，直接用Pipeline构建模型训练

In [9]:
train_dataset = pd.read_csv('lcm_train_features.csv')

In [10]:
LABELS = ['Is_in_day_consume']
FEATURES = ['Distance','Previous_duration',
       'Next_duration', 'Base_consume', 'Day_in_month_received',
       'Day_in_week_received', 'Discount', 'Coupon_type', 'User_receive_count', 'User_consume_count',
       'User_used_count', 'User_not_used_count', 'User_used_coupon_rate',
       'User_used_coupon_rate_max', 'User_used_coupon_rate_min',
       'User_used_coupon_rate_mean', 'User_receive_coupon_merchant_count',
       'User_consume_merchant_count', 'User_used_coupon_merchant_count',
       'User_used_coupon_merchant_occ',
       'User_receive_different_coupon_count',
       'User_used_different_coupon_count',
       'User_receive_different_coupon_occ',
       'User_used_different_coupon_occ', 'User_receive_coupon_mean',
       'User_used_coupon_mean', 'User_distance_used_mean',
       'User_distance_used_max', 'User_distance_used_min',
       'User_duration_used_mean', 'User_duration_used_max',
       'User_duration_used_min', 'User_previous_duration_used_mean',
       'User_previous_duration_used_max',
       'User_previous_duration_used_min', 'User_next_duration_used_mean',
       'User_next_duration_used_max', 'User_next_duration_used_min',
       'Merchant_receive_count', 'Merchant_consume_count',
       'Merchant_used_count', 'Merchant_not_used_count',
       'Merchant_used_coupon_rate', 'Merchant_used_coupon_rate_max',
       'Merchant_used_coupon_rate_min', 'Merchant_used_coupon_rate_mean',
       'Merchant_receive_coupon_user_count',
       'Merchant_consume_user_count', 'Merchant_used_coupon_user_count',
       'Merchant_receive_coupon_user_occ', 'Merchant_consume_user_occ',
       'Merchant_used_coupon_user_occ',
       'Merchant_receive_different_coupon_count',
       'Merchant_used_different_coupon_count',
       'Merchant_receive_different_coupon_occ',
       'Merchant_used_different_coupon_occ',
       'Merchant_receive_coupon_mean', 'Merchant_used_coupon_mean',
       'Merchant_receive_different_coupon_avg',
       'Merchant_used_different_coupon_avg',
       'Merchant_distance_used_mean', 'Merchant_distance_used_max',
       'Merchant_distance_used_min', 'Merchant_duration_used_mean',
       'Merchant_duration_used_max', 'Merchant_duration_used_min',
       'Merchant_previous_duration_used_mean',
       'Merchant_previous_duration_used_max',
       'Merchant_previous_duration_used_min',
       'Merchant_next_duration_used_mean',
       'Merchant_next_duration_used_max',
       'Merchant_next_duration_used_min', 'Coupon_received_count',
       'Coupon_used_count', 'Coupon_used_rate',
       'Coupon_duration_used_mean', 'Coupon_duration_used_max',
       'Coupon_duration_used_min', 'Coupon_distance_used_mean',
       'Coupon_distance_used_max', 'Coupon_distance_used_min',
       'User_merchant_receive_count', 'User_merchant_consume_count',
       'User_merchant_used_count', 'User_merchant_not_used_count',
       'User_merchant_used_coupon_rate',
       'User_merchant_not_used_coupon_rate',
       'User_merchant_used_coupon_rate_4_merchant',
       'User_merchant_not_used_coupon_rate_4_merchant',
       'User_merchant_duration_used_mean',
       'User_merchant_duration_used_max',
       'User_merchant_duration_used_min', 'Online_user_receive_count',
       'Online_user_consume_count', 'Online_user_used_count',
       'Online_user_not_used_count', 'Online_user_used_coupon_rate',
       'User_offline_consume_rate', 'User_offline_used_rate',
       'User_offline_no_consume_coupon_rate',
       'User_distance_receive_count', 'User_distance_consume_count',
       'User_distance_used_count', 'User_distance_receive_rate',
       'User_distance_consume_rate', 'User_distance_used_rate',
       'User_coupon_type_receive_count', 'User_coupon_type_used_count',
       'User_coupon_type_receive_rate', 'User_coupon_type_used_rate',
       'User_coupon_receive_count', 'User_coupon_used_count',
       'User_coupon_receive_rate', 'User_coupon_used_rate',
       'Merchant_distance_receive_count',
       'Merchant_distance_consume_count', 'Merchant_distance_used_count',
       'Merchant_distance_receive_rate', 'Merchant_distance_used_rate',
       'User_coupon_duration_used_mean', 'User_coupon_duration_used_max',
       'User_coupon_duration_used_min', 'User_received_date_count']

pipeline = Pipeline([
#     ('features', FeatureUnion([
#         ('extract', ColumnSelector(FEATURES)),
#         ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
#     ])),
    ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
    ('dict', DictVectorizer()),
    ('fm', pylibfm.FM(num_factors=10, num_iter=10, verbose=True, task="classification", initial_learning_rate=0.0001, learning_rate_schedule="optimal"))
])

features_dict = train_dataset[FEATURES].T.to_dict().values()

In [None]:
# Train
pipeline.fit(features_dict, train_dataset[LABELS].values)

In [1]:
import ffm

ffm_dataset = ffm.FFMData(train_dataset[FEATURES], train_dataset[LABELS])
ffm_test_dataset = ffm.FFMData(train_dataset[FEATURES], train_dataset[LABELS])
    
# train the model for 10 iterations 
clf = ffm.FFM(eta=0.1, lam=0.0001, k=4)
clf.fit(ffm_data,num_iter=10, val_data=ffm_data_test, metric='auc', early_stopping=6, maximum=True) 

OSError: dlopen(/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/ffm-1.0-py3.7-macosx-10.14-x86_64.egg/ffm/libffm.py, 6): no suitable image found.  Did find:
	/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/ffm-1.0-py3.7-macosx-10.14-x86_64.egg/ffm/libffm.py: file too short
	/Users/leewind/.local/share/virtualenvs/leewind-p6XO93Th/lib/python3.7/site-packages/ffm-1.0-py3.7-macosx-10.14-x86_64.egg/ffm/libffm.py: file too short

In [11]:
test_dataset = pd.read_csv('lcm_train_test_features.csv')
test_dataset = test_dataset[test_dataset['Coupon_id']>0]
test_dict = test_dataset[FEATURES].T.to_dict().values()
prediction = fm.predict(test_dict)

NameError: name 'FEATURES' is not defined

In [None]:
from sklearn.metrics import roc_curve, auc

def evaluate(result_df):
    group = result_df.groupby(['Coupon_id'])
    aucs = []
    for i in group:
        tmpdf = i[1]        
        if len(tmpdf['Is_in_day_consume'].unique()) != 2:
            continue
            
        fpr, tpr, thresholds = roc_curve(tmpdf['Is_in_day_consume'], tmpdf['Probability'], pos_label=1)
        auc_score = auc(fpr,tpr)
        aucs.append(auc_score)
            
    return np.average(aucs)

In [None]:
test_result_df = test_dataset.copy()
test_result_df['Probability'] = prediction
test_result_df['Is_in_day_consume'] = test_label_dataset
evaluate(test_result_df)