## Factorization Machines with xLearn
### Reference: <https://xlearn-doc.readthedocs.io/en/latest/index.html>

In [None]:
import xlearn as xl

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
# Create a field-aware factorization machine (FFM) model.
ffm_model = xl.create_ffm()

# Enable on-disk training.
ffm_model.setOnDisk()

# Point the model at its training and validation files.
ffm_model.setTrain("./small_train.txt")
ffm_model.setValidate("./small_test.txt")

# Hyper-parameters:
#   task   -> binary classification
#   lr     -> learning rate of 0.2
#   lambda -> regularisation lambda of 0.002
#   metric -> accuracy as the evaluation metric
param = {
    'task': 'binary',
    'lr': 0.2,
    'lambda': 0.002,
    'metric': 'acc',
}

# Fit the model; the trained model is written to ./model.out.
ffm_model.fit(param, './model.out')

# ---------------------------------------------------------------------------
# Prediction
# ---------------------------------------------------------------------------
ffm_model.setTest("./small_test.txt")  # file to score
ffm_model.setSigmoid()  # map raw outputs into the 0-1 range

# Score the test file with the stored model; results go to ./output.txt.
ffm_model.predict("./model.out", "./output.txt")

NameError: name 'Pipeline' is not defined

In [1]:
import numpy as np
import xlearn as xl
from sklearn.model_selection import train_test_split
# Avazu competition using the pandas and sklearn libraries
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import category_encoders as ce
import xlearn as xl
%matplotlib inline
from datetime import datetime, date, time
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss
from sklearn.feature_extraction import FeatureHasher
from sklearn import preprocessing 
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from xgboost import XGBClassifier #note: activate the xgboost environment
from lightgbm import LGBMClassifier

# Load dataset

# How categorical columns are handled: either 'encode' or 'hash'.
categorical_method = 'encode'

# One-step preprocessing pipeline that feature-hashes string inputs.
preproc = Pipeline([
    ('fh', FeatureHasher(n_features=n_features,
                         input_type='string',
                         non_negative=False)),
])

# Read the training file in chunks and preprocess each chunk in turn.
# NOTE(review): every iteration rebinds y_train / X_train / data, so only the
# last chunk is still available once the loop ends -- confirm this is intended.
for data in pd.read_table(
        train_file,
        sep=',',
        chunksize=chunk_size,
        names=header,
        header=None,
        nrows=n_rows):
    y_train, X_train, data = transform_categorical_features(
        data,
        categorical_method=categorical_method,
        drop_column=drop_column,
    )

# Hold out 30% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=0
)

# Optional xLearn logistic-regression baseline.
# It is disabled by default; set skip = False to train and evaluate it.
# (The train/predict sequence used to be pasted twice inside this guard, so
# enabling it would have trained the same model two times in a row.)
skip = True
if not skip:
    # Hyper-parameters:
    #   task       -> binary classification
    #   init       -> model scale of 0.1
    #   epoch      -> 10 epochs (auto early-stop)
    #   lr         -> learning rate of 0.1
    #   reg_lambda -> regularisation lambda of 1.0
    #   opt        -> SGD optimisation method
    model = xl.LRModel(task='binary', init=0.1,
                       epoch=10, lr=0.1,
                       reg_lambda=1.0, opt='sgd')

    # Train, passing the validation split as the evaluation set.
    model.fit(X_train, y_train,
              eval_set=[X_val, y_val],
              is_lock_free=False)

    # Generate predictions for the validation features.
    y_pred = model.predict(X_val)

# NOTE(review): ffm_model is not created in this section -- presumably it is
# the FFM model built (and given its train/validate/test files) by an earlier
# cell; confirm before running this on its own.
#
# Hyper-parameters:
#   task   -> binary classification
#   lr     -> learning rate of 0.2
#   lambda -> regularisation lambda of 0.002
#   metric -> log loss as the evaluation metric
param = {
    'task': 'binary',
    'lr': 0.2,
    'lambda': 0.002,
    'metric': 'logloss',
}

# Fit the model; the trained model is written to ./FFM_model.out.
ffm_model.fit(param, './FFM_model.out')

# Map raw outputs into the 0-1 range before predicting.
ffm_model.setSigmoid()

# Score with the stored model; results go to ./FFM_output.txt.
ffm_model.predict("./FFM_model.out", "./FFM_output.txt")



def transform_categorical_features(data, categorical_method, drop_column):
    """Split a raw chunk into click labels and hashed or encoded features.

    Parameters
    ----------
    data : pandas.DataFrame
        Chunk containing at least the columns 'id', 'click' and 'hour'.
        'hour' values are parsed with the format ``%y%m%d%H``.
    categorical_method : str
        'hash' to feature-hash every column (as strings) through the
        module-level ``preproc`` pipeline, or 'encode' to label-encode
        every column.
    drop_column : positional indexer
        Positions used to index ``data.columns`` to select the columns to
        drop. The positions are evaluated after 'id' and 'click' have been
        removed and 'dayoftheweek' and 'day' have been added.

    Returns
    -------
    tuple
        ``(y_train, X_train, data)``: the flattened 'click' values, the
        feature matrix produced by the chosen method, and the processed
        DataFrame.

    Raises
    ------
    ValueError
        If ``categorical_method`` is neither 'hash' nor 'encode'.
    """
    # Fail fast: an unknown method used to fall through both branches and
    # only blow up at the return statement with an UnboundLocalError.
    if categorical_method not in ('hash', 'encode'):
        raise ValueError(
            "categorical_method must be 'hash' or 'encode', got %r"
            % (categorical_method,)
        )

    # Target labels as a flat array.
    y_train = np.asarray(data['click'].values).ravel()

    # Remove the id and click columns; drop() returns a new frame, so the
    # caller's DataFrame is not modified by the assignments below.
    data = data.drop(['id', 'click'], axis=1)
    print('start with ' + str(data.shape[1]) + ' features')
    print(data.columns)

    # Manual switch: derive day-of-week / day / hour-of-day from 'hour'.
    add_engineered_datetime_features = True
    if add_engineered_datetime_features:
        data['hour'] = data['hour'].map(
            lambda x: datetime.strptime(str(x), "%y%m%d%H"))
        data['dayoftheweek'] = data['hour'].map(lambda x: x.weekday())
        data['day'] = data['hour'].map(lambda x: x.day)
        # Overwrite the parsed timestamp with just its hour component.
        data['hour'] = data['hour'].map(lambda x: x.hour)

    # Drop the caller-selected columns (chosen by position).
    print('dropping ' + str(data.columns[drop_column]))
    data = data.drop(data.columns[drop_column], axis=1)

    if categorical_method == 'hash':
        # The hasher is configured for string input, so stringify everything.
        X_train = np.asarray(data.astype(str))

        print('Hashing features...')
        X_train = preproc.fit_transform(X_train)

    else:  # categorical_method == 'encode'
        print('Encoding features numerically...')

        # Label-encode each column independently.
        encoder = preprocessing.LabelEncoder()
        data = data.apply(encoder.fit_transform)

        # One-hot encoding was tried here but failed with a memory error.

        print('There are ' + str(data.shape[1]) + ' features')
        print(data.columns)
        # DataFrame.info() prints its summary itself and returns None, so
        # wrapping it in print() only added a stray "None" line.
        data.info()

        # Manual switch: binary-encode the label-encoded columns.
        binarize = False
        if binarize:
            print('Binarizing features...')

            # From category_encoders
            encoder = ce.BinaryEncoder(cols=data.columns.tolist()).fit(data)
            data = encoder.transform(data)

        X_train = np.asarray(data)

    return y_train, X_train, data



XLearnLibraryNotFound: Cannot find xlearn Library in the candidate path