### Offline_feature_store

In [7]:
import pandas as pd
import numpy as np
import random
import os
import sys
import torch
import torch.nn.functional as F
import time
import feast

from sklearn.utils import shuffle
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import MinMaxScaler, RobustScaler

from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import lightgbm as lgb
from xgboost import XGBClassifier
from xgboost import plot_tree
from xgboost import plot_importance

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [8]:
import nni
import argparse
import logging
import logging.handlers

In [9]:
from utils_ import save_checkpoint

### preprocessing_from_offine_store

In [10]:
def get_train_from_offline_store(path_):
    # Connect to your feature store provider
    fs = feast.FeatureStore(repo_path=f"{path_}")


    ### entity_df 
    parquet_ = pd.read_parquet(f'{path_}/data/ppr_data_.parquet', engine='pyarrow')
    orders = parquet_[['ticket_id','event_timestamp']]

    # Retrieve training data
    training_df = fs.get_historical_features(
        entity_df=orders,
        features=[
            "dr_lauren_stat:time",
            "dr_lauren_stat:weekday",
            "dr_lauren_stat:weekend",
            "dr_lauren_stat:instlo_1",
            "dr_lauren_stat:instlo_2",
            "dr_lauren_stat:inst_code",
            "dr_lauren_stat:sysname_lo",
            "dr_lauren_stat:sysname_eq",
            "dr_lauren_stat:ntt_label",
        ],
    ).to_df()


    ### training_part_before 7 month

    criterion = '2021-07-01'

    training_df_ = training_df[training_df['event_timestamp'] < criterion]
    
    return training_df_


In [11]:
get_train_from_offline_store('/workspace/ML_Ops/feast/fea_')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df[entity_df_event_timestamp_col] = entity_df[


Unnamed: 0,event_timestamp,ticket_id,time,weekday,weekend,instlo_1,instlo_2,inst_code,sysname_lo,sysname_eq,ntt_label
0,2021-01-01 03:17:00+00:00,3604002,3.283333,4,0,9,47,531,117,0,1
1,2021-01-01 03:17:00+00:00,3604002,3.283333,4,0,9,47,531,117,0,1
2,2021-01-01 03:20:00+00:00,3604035,3.333333,4,0,12,84,340,913,0,1
3,2021-01-01 03:20:00+00:00,3604035,3.333333,4,0,12,84,340,913,0,1
4,2021-01-02 08:38:00+00:00,3624904,8.633333,5,1,10,101,435,868,0,1
...,...,...,...,...,...,...,...,...,...,...,...
14730,2021-06-30 18:33:00+00:00,9346264,18.550000,2,0,2,67,400,173,0,0
14731,2021-06-30 18:35:00+00:00,9346310,18.583333,2,0,2,67,375,1506,0,0
14732,2021-06-30 18:35:00+00:00,9346329,18.583333,2,0,2,67,382,1978,0,0
14733,2021-06-30 18:40:00+00:00,9346522,18.666667,2,0,2,122,1504,161,0,0


In [12]:
### train_test_split
def split_tr_te(df_,test_size_, seed_):
    x = df_[['time', 'weekday', 'weekend', 'instlo_1', 'instlo_2', 'inst_code', 'sysname_lo', 'sysname_eq']]
    y = df_['ntt_label']
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size_, random_state=seed_)
    
    return x_train, x_test, y_train, y_test

  and should_run_async(code)


## model

### LGBM

In [13]:
def get_lgbm_score(model_df, test_size_, ne_, seed_):
    x_train, x_test, y_train, y_test = split_tr_te(model_df, test_size_, seed_)

    mo_ = lgb.LGBMClassifier(n_estimators=ne_)
    mo_.fit(x_train, y_train)

    y_pred = mo_.predict(x_test)
    
    y_prob = mo_.predict_proba(x_test)[:,1]

    acc = accuracy_score(y_test, y_pred)

    f1_score_ = f1_score(y_test, y_pred)

    auc = roc_auc_score(y_test, y_prob)    
    
    return mo_, acc, f1_score_, auc

### XGBM

In [14]:
def get_xgb_score(model_df, test_size_, ne_, seed_):
    x_train, x_test, y_train, y_test = split_tr_te(model_df, test_size_, seed_)

    mo_ = XGBClassifier(n_estimators=ne_)
    mo_.fit(x_train, y_train)

    y_pred = mo_.predict(x_test)
    
    y_prob = mo_.predict_proba(x_test)[:,1]
    
    acc = accuracy_score(y_test, y_pred)

    f1_score_ = f1_score(y_test, y_pred)
    
    auc = roc_auc_score(y_test, y_prob)
    
    return mo_, acc, f1_score_, auc, x_test

### 뽑고싶은 entity_rows

In [23]:
parquet_ = pd.read_parquet(f'/workspace/ML_Ops/feast/fea_/data/ppr_data_.parquet', engine='pyarrow')
orders = parquet_[['ticket_id','event_timestamp']]

### 뽑을 rows의 entity == key 라고 볼 수 있음
new_orders = orders[orders['event_timestamp']>= '2021-07-01']
new_orders.drop_duplicates(inplace=True, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_orders.drop_duplicates(inplace=True, ignore_index=True)


In [24]:
new_orders

Unnamed: 0,ticket_id,event_timestamp
0,9360646,2021-07-01 02:47:00
1,9361358,2021-07-01 03:11:00
2,9363518,2021-07-01 04:25:00
3,9365171,2021-07-01 05:29:00
4,9365176,2021-07-01 05:29:00
...,...,...
1368,9995149,2021-07-15 18:37:00
1369,9995205,2021-07-15 18:39:00
1370,9995206,2021-07-15 18:39:00
1371,9995241,2021-07-15 18:40:00


### Online_features

In [26]:
fs_ = feast.FeatureStore(repo_path="/workspace/ML_Ops/feast/fea_/")





online_ = fs_.get_online_features(
            entity_rows=[{"ticket_id": i} for i in new_orders['ticket_id']],
            features=[
                "dr_lauren_stat:time",
                "dr_lauren_stat:weekday",
                "dr_lauren_stat:weekend",
                "dr_lauren_stat:instlo_1",
                "dr_lauren_stat:instlo_2",
                "dr_lauren_stat:inst_code",
                "dr_lauren_stat:sysname_lo",
                "dr_lauren_stat:sysname_eq",
                "dr_lauren_stat:ntt_label",
        ],
)


df = pd.DataFrame.from_dict(online_.to_dict())

print(df)

      ntt_label  weekday  sysname_lo  inst_code       time  sysname_eq  \
0             1        3        1584        781   2.783333           0   
1             1        3         972        693   3.183333           0   
2             1        3        2200        293   4.416667           0   
3             1        3         834         91   5.483333           0   
4             1        3         820        525   5.483333           0   
...         ...      ...         ...        ...        ...         ...   
1368          0        3         173        400  18.616667           0   
1369          0        3        1506        375  18.650000           0   
1370          0        3        1978        382  18.650000           0   
1371          1        3        2061        642  18.666667           0   
1372          1        3        2047       1215  18.816667           0   

      instlo_2  ticket_id  instlo_1  weekend  
0           15    9360646         3        0  
1           38   

In [27]:
df[['time', 'weekday', 'weekend', 'instlo_1', 'instlo_2', 'inst_code', 'sysname_lo', 'sysname_eq','ntt_label']]

  and should_run_async(code)


Unnamed: 0,time,weekday,weekend,instlo_1,instlo_2,inst_code,sysname_lo,sysname_eq,ntt_label
0,2.783333,3,0,3,15,781,1584,0,1
1,3.183333,3,0,12,38,693,972,0,1
2,4.416667,3,0,3,53,293,2200,0,1
3,5.483333,3,0,11,37,91,834,0,1
4,5.483333,3,0,11,37,525,820,0,1
...,...,...,...,...,...,...,...,...,...
1368,18.616667,3,0,2,67,400,173,0,0
1369,18.650000,3,0,2,67,375,1506,0,0
1370,18.650000,3,0,2,67,382,1978,0,0
1371,18.666667,3,0,9,65,642,2061,0,1
