In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix, hstack, coo_matrix
from tqdm import tqdm
import sparse as sp
from opt_einsum import contract
from copy import deepcopy
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold

In [2]:
path = './data/'
df = pd.read_csv(path + "CrossDomain_2years.csv", index_col=0)
df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,index,SEX,AGE,AREA,Salary,domein,job[0],job[1],job[2],job[3],...,importance[15],importance[16],importance[17],importance[18],Company,most_importance,question,sentence,factor,SCORE
0,0,男性,60,神奈川県,1000万円未満,1-1-18：医療保険,運送・輸送業,,,,...,,,,,チューリッヒ生命,保障内容に対する保険料,Q2_1,申込み窓口の豊富さ（取扱い店舗、ネット申込み、保険営業員 等）,加入手続き,6
1,0,男性,60,神奈川県,1000万円未満,1-1-18：医療保険,運送・輸送業,,,,...,,,,,チューリッヒ生命,保障内容に対する保険料,Q2_2,加入手続きの容易さ（記入事項の分かりやすさ、提出書類の内容、情報提供、等を含む）,加入手続き,6
2,0,男性,60,神奈川県,1000万円未満,1-1-18：医療保険,運送・輸送業,,,,...,,,,,チューリッヒ生命,保障内容に対する保険料,Q2_3,加入手続き完了まで（契約完了まで）のスピード,加入手続き,6
3,0,男性,60,神奈川県,1000万円未満,1-1-18：医療保険,運送・輸送業,,,,...,,,,,チューリッヒ生命,保障内容に対する保険料,Q2_4,商品内容の分かりやすさ,商品内容,6
4,0,男性,60,神奈川県,1000万円未満,1-1-18：医療保険,運送・輸送業,,,,...,,,,,チューリッヒ生命,保障内容に対する保険料,Q2_5,商品内容の充実度（特約等を含む）,商品内容,6


In [3]:
df_count = df.groupby(["domein", "Company"])["index"].nunique().reset_index().rename(columns = {"index":"count"})
df = pd.merge(df, df_count, on=["domein", "Company"])
df = df[df["count"] > 100].drop("count",axis=1)

In [4]:
df_X = df[["SEX", "AGE", "Salary", "domein", "job[0]", "job[1]", "importance[0]", "importance[1]", "Company", "question", "sentence", "factor"]].fillna("Nan").copy()
df_y = df["SCORE"].copy()

In [13]:
class Model:
    def __init__(self, X_train:pd.DataFrame, y_train:pd.Series):
        raise NotImplementedError

    def __call__(self, X_test: pd.DataFrame) -> pd.Series:
        # predict here
        raise NotImplementedError


class FwFM(Model):
    """ Field weighted Factorization Machines implemented by scipy.sparse.csr_matrix
    See colaboratory notebook https://colab.research.google.com/drive/1AsOLL_7ON_Fl22rIJ3RngvLAxe5IOmBh?usp=sharing 
    or original paper https://arxiv.org/pdf/1806.03514.pdf  for details.

    requirements:
        pandas,
        numpy,
        sklearn,
        scipy,
        tqdm,
        opt_einsum
    """
    def __init__(self, df_X: pd.DataFrame, df_y: pd.Series, dim:int =8, lr:float = 1e-3, 
                 n_epoch: int = 10, n_batch: int =256, ignore_interactions : list=[], train:bool=True, 
                 lam_w: float = 0, lam_v: float = 0, lam_r: float = 0):
        """
        Arguments:
            df_X [pd.DataFrame] : Explanatory variables (all columns are categorical)
            df_y [pd.Series] : Objective variable
            dim [int] : a number of dimention of embeddings.
            lr [float] : learning rate
            n_epoch [int] : a number of epoch/
            n_batch [int] : a number of sample in mini-batch.
            ignore_interactions [list] : element is pair of fields which you ignored interaction.
            train [bool] : wheter run train or not when initializing.
            lam_w [float] : weight of l2 norm for w.
            lam_v [float] : weight of l2 norm for v.
            lam_r [float] : weight of l2 norm for r.

        """
        self.dim = dim
        self.lr = lr
        self.n_epoch = n_epoch 
        self.n_batch = n_batch 
        self.ignore_interactions = ignore_interactions
        self.lam_w = lam_w
        self.lam_v = lam_v
        self.lam_r = lam_r

        self._preprocess(df_X, df_y)
        if train:
            self.train()

    def _preprocess(self, df_X: pd.DataFrame,  df_y: pd.Series):
        self.fields = df_X.columns 
        self.fields_dir = {}

        field_start_idx = 0
        for i, field in enumerate(self.fields):
            ohe = OneHotEncoder(handle_unknown="ignore")
            X_field = ohe.fit_transform(df_X[[field]])
            cols = ohe.categories_[0]

            if i == 0:
                X = csr_matrix(X_field)
            else:
                X = hstack([X, X_field])

            self.fields_dir[field] = {
                "field_idx":i,
                "start_idx": field_start_idx, 
                "end_idx": field_start_idx + len(cols), 
                "cols": deepcopy(cols),
                "encoder": deepcopy(ohe), 
            }
            field_start_idx += len(cols)

        self.b = 0
        self.w = np.random.rand(X.shape[1]) / 10
        self.v = np.random.rand(X.shape[1], self.dim) / 10
        self.r = np.random.rand(len(self.fields), len(self.fields))/ 10
        self.r_mask = np.ones([len(self.fields), len(self.fields)])

        for i in range(len(self.fields)):
            for j in range(i, len(self.fields)):
                self.r_mask[i, j] = 0

        for interaction in self.ignore_interactions:
            field_i, field_j = tuple(interaction)
            field_i_idx = self.fields_dir[field_i]["field_idx"]
            field_j_idx = self.fields_dir[field_j]["field_idx"]
            self.r_mask[field_i_idx, field_j_idx] = 0
            self.r_mask[field_j_idx, field_i_idx] = 0

        self.r = self.r * self.r_mask
    
        self.m2f = np.zeros([X.shape[1], len(self.fields)])
        for i, field in enumerate(self.fields):
            self.m2f[np.arange(self.fields_dir[field]["start_idx"], self.fields_dir[field]["end_idx"]), i] = 1  

        self.X = csr_matrix(X)
        self.y = df_y.values

    def train(self):
        n_iter = int(self.X.shape[0] / self.n_batch)
        indices = np.arange(self.X.shape[0])

        for ep in range(self.n_epoch):
            np.random.shuffle(indices)
            for i in tqdm(range(n_iter)):
                batch_indices = indices[self.n_batch * i : self.n_batch * (i + 1)]
                X_batch = self.X[batch_indices]
                y_batch = self.y[batch_indices]

                y_hat = self.predict(X_batch)
                a = -2 * (y_batch - y_hat)

                dL_db = (a * 1).mean()
                dL_dw = (self.der_w(X_batch, reduction=None).T * a) / a.shape
                dL_dv = (a * self.der_v(X_batch, reduction=None)).mean(axis=2)
                dL_dr = (a * self.der_r(X_batch, reduction=None)).mean(axis=2)
                self.update(dL_db, dL_dw, dL_dv, dL_dr)

        return self

    def der_w(self, X: csr_matrix, reduction="mean") -> csr_matrix:
        dw = X
        if reduction == "mean":
            dw = dw.mean(axis=0)

        return dw

    def der_v(self, X: csr_matrix, reduction="mean") -> np.ndarray:
        dv = contract("ni,if,fg,nj,jd,jg->idn", X.A, self.m2f, self.r, X.A, self.v, self.m2f)
        
        if reduction == "mean":
            dv = dv.mean(axis=2)
        return dv

    def der_r(self, X : csr_matrix, reduction="mean") -> np.ndarray:
        dr = contract("ni,id,if,fg,bj,jd,jg->fgn", X.A, self.v, self.m2f, self.r_mask, X.A, self.v, self.m2f)
        
        if reduction == "mean":
            dr = dr.mean(axis=2)

        return dr

    def constraint_r(self, r):
        return 

    def update(self, dL_db, dL_dw, dL_dv, dL_dr):
        self.b -= dL_db * self.lr 
        self.w -= dL_dw * self.lr
        self.v -= dL_dv * self.lr
        self.r -= (dL_dr * self.lr + self.lam_r * self.r)


    def predict(self, X: csr_matrix):
        y_hat = contract("ni,id,if,fg,nj,jd,jg->n", X.A, self.v, self.m2f, self.r, X.A, self.v, self.m2f)
        return y_hat


    def convert_sparse(self, df_X: pd.DataFrame):
        for i, field in enumerate(self.fields):
            X_field = self.fields_dir[field]["encoder"].transform(df_X[[field]])
            if i == 0:
                X = csr_matrix(X_field)
            else:
                X = hstack([X, X_field])
        return csr_matrix(X)

    def __call__(self, df_X: pd.DataFrame, chunk_size = 1024):
        X = self.convert_sparse(df_X)
        indices = np.arange(X.shape[0])
        n_splits = int(X.shape[0] / chunk_size)
        y_hat = np.array([])
        for chunk_indices in tqdm(np.array_split(indices, n_splits)):
            y_hat = np.r_[y_hat, self.predict(X[chunk_indices])]
        return y_hat


In [17]:
group_kfold = GroupKFold(n_splits=3)
userid = df["index"].values
for train_index, test_index in group_kfold.split(df_X, df_y, userid):
    df_X_train, df_y_train = df_X.iloc[train_index], df_y.iloc[train_index]
    df_X_test, df_y_test = df_X.iloc[test_index], df_y.iloc[test_index]
    fwfm = FwFM(df_X_train, df_y_train, n_epoch=10)
    y_hat = fwfm(df_X_test)
    print(mean_squared_error(df_y_test.values, y_hat) ** 0.5)
    break

100%|██████████| 682/682 [28:23<00:00,  2.50s/it]
100%|██████████| 682/682 [26:18<00:00,  2.32s/it]
100%|██████████| 682/682 [26:07<00:00,  2.30s/it]


1.8614387714670322


100%|██████████| 682/682 [26:11<00:00,  2.30s/it]
100%|██████████| 682/682 [25:59<00:00,  2.29s/it]
100%|██████████| 682/682 [26:04<00:00,  2.29s/it]


1.8457312274693052


100%|██████████| 682/682 [26:06<00:00,  2.30s/it]
100%|██████████| 682/682 [26:06<00:00,  2.30s/it]
100%|██████████| 682/682 [25:57<00:00,  2.28s/it]


1.8625713584735466
