In [1]:
import os
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from pandas.api.types import CategoricalDtype
from scipy import sparse
import implicit
from implicit.evaluation import precision_at_k, train_test_split
from implicit.als import AlternatingLeastSquares
import gc
import numpy as np
from tqdm import tqdm
import pickle

In [2]:
#!mkdir '/srv/data/vk'
#!mkdir '/srv/data/vk/old'
#!mkdir '/srv/data/vk/train'
#!mkdir '/srv/data/vk/test'

In [3]:
from implicit.cpu.als import check_random_state, check_csr
import time
import logging
log = logging.getLogger("implicit")

def fit(self, user_items, show_progress=True, callback=None, update_item_factors=False):
    """Factorizes the user_items matrix, by default keeping the item factors fixed.

    This is a variant of the ALS training loop meant to be called as a free
    function on an ALS model instance: ``fit(model, user_items)``. On every
    iteration the user factors are re-solved from the current item factors.
    The item factors are only re-solved when ``update_item_factors`` is True,
    so with the default a model whose ``item_factors`` were assigned
    beforehand keeps them untouched.

    After calling this method, the members 'user_factors' and 'item_factors' will be
    initialized with a latent factor model of the input data.

    The user_items matrix does double duty here. It defines which items are liked by which
    users (P_ui in the original paper), as well as how much confidence we have that the user
    liked the item (C_ui).

    The negative items are implicitly defined: This code assumes that positive items in the
    user_items matrix means that the user liked the item. The negatives are left unset in this
    sparse matrix: the library will assume that means Piu = 0 and Ciu = 1 for all these items.
    Negative items can also be passed with a higher confidence value by passing a negative
    value, indicating that the user disliked the item.

    Parameters
    ----------
    user_items: csr_matrix
        Matrix of confidences for the liked items. This matrix should be a csr_matrix where
        the rows of the matrix are the users, the columns are the items liked that user,
        and the value is the confidence that the user liked the item.
    show_progress : bool, optional
        Whether to show a progress bar during fitting
    callback: Callable, optional
        Callable function on each epoch with such arguments as epoch, elapsed time and progress
    update_item_factors : bool, optional
        When True the item factors are also re-solved on every iteration.
        Defaults to False, which leaves the item factors fixed.
    """
    # initialize the random state
    random_state = check_random_state(self.random_state)

    Cui = check_csr(user_items)
    if Cui.dtype != np.float32:
        Cui = Cui.astype(np.float32)

    # Give the positive examples more weight if asked for
    if self.alpha != 1.0:
        Cui = self.alpha * Cui

    users, items = Cui.shape

    # The item-major transpose is only needed to solve for the item factors,
    # so it is skipped when they stay fixed.
    Ciu = None
    if update_item_factors:
        s = time.time()
        Ciu = Cui.T.tocsr()
        log.debug("Calculated transpose in %.3fs", time.time() - s)

    s = time.time()
    # Initialize the variables randomly if they haven't already been set
    if self.user_factors is None:
        self.user_factors = random_state.rand(users, self.factors).astype(self.dtype) * 0.01
    if self.item_factors is None:
        self.item_factors = random_state.rand(items, self.factors).astype(self.dtype) * 0.01

    log.debug("Initialized factors in %s", time.time() - s)

    # invalidate cached norms and squared factors
    self._item_norms = self._user_norms = None
    self._YtY = None
    self._XtX = None
    loss = None

    solver = self.solver

    # The loss routine lives in implicit's compiled ALS module, which this file
    # does not import at module level; import it only when it is needed.
    calculate_loss = None
    if self.calculate_training_loss:
        from implicit.cpu import _als
        calculate_loss = _als.calculate_loss

    # Backward compatibility: fall back to the callback stored on the model
    if not callback:
        callback = getattr(self, "fit_callback", None)

    log.debug("Running %i ALS iterations", self.iterations)
    with tqdm(total=self.iterations, disable=not show_progress) as progress:
        for iteration in range(self.iterations):
            s = time.time()
            # learn the user_factors from the item_factors
            solver(
                Cui,
                self.user_factors,
                self.item_factors,
                self.regularization,
                num_threads=self.num_threads,
            )
            if update_item_factors:
                # and, only on request, the item_factors from the user_factors
                solver(
                    Ciu,
                    self.item_factors,
                    self.user_factors,
                    self.regularization,
                    num_threads=self.num_threads,
                )
            progress.update(1)

            if calculate_loss is not None:
                loss = calculate_loss(
                    Cui,
                    self.user_factors,
                    self.item_factors,
                    self.regularization,
                    num_threads=self.num_threads,
                )
                progress.set_postfix({"loss": loss})

                if not show_progress:
                    log.info("loss %.4f", loss)

            if callback:
                callback(iteration, time.time() - s, loss)

    # loss stays None when no iteration ran; "%.4f" cannot format None
    if self.calculate_training_loss and loss is not None:
        log.info("Final training loss %.4f", loss)

    self._check_fit_errors()

In [4]:
class FeatureMaking:
    """Builds candidate (user_id, item_id) pairs and their features.

    Every intermediate table is cached as a gzip-compressed parquet file under
    ``path_to_save``: a getter first tries to read its file and only computes
    (and writes) the table when the file does not exist.

    Two modes are supported (see ``load_data``):

    * offline (``production_flg=False``): the interaction log is split by row
      position into a training part and a target part;
    * production (``production_flg=True``): the whole log is used for training
      and the candidate items / test users are read from separate files.
    """

    # Minimum embedding cosine similarity for two items to count as neighbours.
    COSINE_THRESHOLD = 0.9
    # Number of top items (by recent time spent) offered to every test user.
    N_BEST_ITEMS = 150
    # Number of ALS recommendations requested per user.
    N_ALS_RECOMMENDATIONS = 100

    def __init__(self, path_to_save, iloc_start, iloc_end=None, production_flg = False):
        """Store the configuration; no data is read here.

        Parameters
        ----------
        path_to_save : str
            Directory holding the cached feature tables and the result files.
        iloc_start : int or None
            Row position where the target part starts (offline mode only).
        iloc_end : int or None
            Row position where the target part ends; None means "to the end".
        production_flg : bool
            Selects production mode, see the class docstring.
        """
        self.path_to_save = path_to_save
        self.iloc_start = iloc_start
        self.iloc_end = iloc_end
        self.production_flg = production_flg
        # Lazily built user/item index structures shared by the ALS features.
        self._interactions = None

    # ------------------------------------------------------------------ helpers

    def _load_or_build(self, filename, build):
        """Return the cached table ``filename`` or build, save and return it.

        ``build`` is a zero-argument callable returning a DataFrame whose
        columns are exactly what should be stored, so a cache hit and a cache
        miss hand back frames with the same layout.
        """
        filepath = os.path.join(self.path_to_save, filename)
        try:
            return pd.read_parquet(filepath)
        except FileNotFoundError:
            built_df = build()
            os.makedirs(self.path_to_save, exist_ok=True)
            built_df.to_parquet(filepath, compression='gzip')
            return built_df

    def _get_interactions(self):
        """Build (once) the user x item matrix and the id <-> row/column mappings.

        Returns a dict with:
          ``users``                distinct user ids of the training frame,
          ``user_categories``      sorted user ids; position == matrix row,
          ``item_categories``      sorted item ids; position == matrix column,
          ``matrix``               CSR matrix of ``timespent + 1`` per interaction,
          ``items_to_predict_ind`` matrix columns of the candidate items that
                                   occur in the training frame.
        """
        cached = getattr(self, '_interactions', None)
        if cached is not None:
            return cached

        train_df = self.train_df
        users = train_df['user_id'].unique()
        items = train_df['item_id'].unique()

        # Ordered categoricals give each id a stable integer code
        user_cat = CategoricalDtype(categories=sorted(users), ordered=True)
        items_cat = CategoricalDtype(categories=sorted(items), ordered=True)
        user_index = train_df['user_id'].astype(user_cat).cat.codes
        item_index = train_df['item_id'].astype(items_cat).cat.codes
        matrix = sparse.coo_matrix((train_df["timespent"] + 1, (user_index, item_index)),
                                   shape=(len(users), len(items))).tocsr()

        # Only candidate items known to the model can be scored. In offline
        # mode candidates_df holds exactly the distinct target items.
        items_to_predict = self.candidates_df[
            self.candidates_df['item_id'].isin(items)].item_id.unique()
        items_to_predict_ind = pd.Series(items_to_predict).astype(items_cat).cat.codes.values

        self._interactions = {
            'users': users,
            'user_categories': user_cat.categories,
            'item_categories': items_cat.categories,
            'matrix': matrix,
            'items_to_predict_ind': items_to_predict_ind,
        }
        return self._interactions

    def _recommend_for_test_users(self, model, score_column):
        """Score candidate items for the test users with a fitted ALS model.

        Returns a frame with columns ``user_id``, ``item_id`` and
        ``score_column``, one row per (user, recommended item), grouped by user.
        """
        interactions = self._get_interactions()
        matrix = interactions['matrix']
        item_ids = interactions['item_categories'].values

        # Only users seen in training have a row in the matrix
        sample_users = self.test_user_df[
            self.test_user_df['user_id'].isin(interactions['users'])].user_id.unique()
        user_ind = interactions['user_categories'].get_indexer(sample_users)

        rec_ids, rec_scores = model.recommend(
            user_ind,
            matrix[user_ind],
            N=self.N_ALS_RECOMMENDATIONS,
            recalculate_user=False,
            filter_already_liked_items=True,
            items=interactions['items_to_predict_ind'],
        )
        # assumes recommend() returns two (n_users, n_recs) arrays, as the
        # original per-user indexing of the result did — TODO confirm for the
        # installed implicit version
        rec_ids = np.asarray(rec_ids)
        rec_scores = np.asarray(rec_scores)
        n_recs = rec_ids.shape[1]

        return pd.DataFrame({'user_id': np.repeat(sample_users, n_recs),
                             'item_id': item_ids[rec_ids.ravel()],
                             score_column: rec_scores.ravel()})

    def _build_emb_als_df(self, iterations, fine_tune):
        """ALS scores from a model whose item factors are the item embeddings.

        The user factors are first solved with the module-level ``fit`` helper
        on top of the embeddings; with ``fine_tune`` the model's own ``fit``
        is run afterwards on the same matrix.
        """
        interactions = self._get_interactions()
        embedding_dct = self.item_df.set_index('item_id')['embeddings'].to_dict()
        # Row i holds the embedding of the item in matrix column i
        embed_matrix = np.array([np.array(embedding_dct[item_id])
                                 for item_id in interactions['item_categories']])

        # The number of factors must equal the embedding width (the original
        # code hard-coded 312)
        model = AlternatingLeastSquares(factors=embed_matrix.shape[1],
                                        use_gpu=False,
                                        regularization=0.1,
                                        alpha=1.0,
                                        iterations=iterations)
        model.item_factors = embed_matrix

        fit(model, interactions['matrix'])
        if fine_tune:
            model.fit(interactions['matrix'])

        return self._recommend_for_test_users(model, 'emb_als_score')

    # --------------------------------------------------------------------- data

    def load_data(self, load_path = './'):
        """Read the input parquet files and derive the train/target/candidate frames.

        Sets ``item_df``, ``item_source_dct``, ``train_df``, ``target_df``,
        ``candidates_df`` and ``test_user_df``. ``train_df`` gets two extra
        columns: ``reaction_abs`` (float32 absolute reaction) and ``good``
        (1 when ``timespent`` is positive, else 0).
        """
        full_df = pd.read_parquet(os.path.join(load_path,'train.parquet.gzip'))
        self.item_df = pd.read_parquet(os.path.join(load_path,'items_meta.parquet.gzip'))
        self.item_source_dct = self.item_df.set_index('item_id')['source_id'].to_dict()
        full_df['source_id'] = full_df['item_id'].map(self.item_source_dct)
        # Any previously built index structures refer to the old data
        self._interactions = None

        if self.production_flg:
            self.train_df = full_df.reset_index(drop=True)
            self.target_df = None
            self.candidates_df = pd.read_parquet(os.path.join(load_path,'fresh_candidates.parquet.gzip'))
            self.test_user_df = pd.read_parquet(os.path.join(load_path,'test.parquet.gzip'))
        else:
            self.train_df = full_df.iloc[:self.iloc_start].reset_index(drop=True)
            # A None iloc_end slices to the end of the frame
            self.target_df = full_df.iloc[self.iloc_start:self.iloc_end].reset_index(drop=True)
            self.candidates_df = self.target_df[['item_id']].drop_duplicates().reset_index(drop=True)
            self.test_user_df = self.target_df[['user_id']].drop_duplicates().reset_index(drop=True)

        self.train_df['reaction_abs'] = np.abs(self.train_df['reaction']).astype('float32')
        self.train_df['good'] = (self.train_df['timespent']>0).astype('int')
        # Kept from the original: materialises the row position as an 'index'
        # column. Nothing in this class reads it, but get_train_df() exposes it.
        self.train_df = self.train_df.reset_index()

    # ----------------------------------------------------------------- features

    def get_feature_source_df(self, n_last_row = 1000000):
        """Per-source mean and sum of ``good`` over the last ``n_last_row`` train rows.

        Columns: ``source_id``, ``source_good_mean``, ``source_good_sum``.
        """
        def build():
            recent_df = self.train_df.iloc[-n_last_row:]
            source_stats = recent_df.groupby('source_id').agg({'good': ['mean', 'sum']})
            source_stats.columns = ['source_good_mean', 'source_good_sum']
            # source_id must be a regular column, as it is after a cache read
            return source_stats.reset_index()

        return self._load_or_build('feature_source_df.parquet.gzip', build)

    def get_feature_source_user_df(self):
        """Per (user_id, source_id) aggregates over the test users' train rows.

        Columns: ``user_id``, ``source_id``, ``cnt_items``, ``time_sum``,
        ``good_mean``, ``good_sum``, ``reaction_mean``, ``reaction_abs_mean``,
        ``reaction_abs_sum``.
        """
        def build():
            is_test_user = self.train_df['user_id'].isin(self.test_user_df.user_id)
            pair_stats = self.train_df[is_test_user].groupby(['user_id', 'source_id']).agg({
                'item_id': 'nunique',
                'timespent': 'sum',
                'good': ['mean', 'sum'],
                'reaction': 'mean',
                'reaction_abs': ['mean', 'sum']})
            pair_stats.columns = ['cnt_items',
                                  'time_sum',
                                  'good_mean',
                                  'good_sum',
                                  'reaction_mean',
                                  'reaction_abs_mean',
                                  'reaction_abs_sum']
            # keys must be regular columns, as they are after a cache read
            return pair_stats.reset_index()

        return self._load_or_build('feature_source_user_df.parquet.gzip', build)

    def get_feature_item_df(self):
        """Per-item aggregates plus recency features.

        The whole-history aggregates are cached; the ``pretarget_*`` columns
        (sums over the last 5M / 1M train rows and their ratio) and
        ``source_id`` are recomputed on every call.
        """
        def build():
            item_stats = self.train_df.groupby('item_id').agg({
                'user_id': 'nunique',
                'timespent': 'mean',
                'good': 'mean',
                'reaction_abs': 'mean'}).reset_index().rename(
                columns={'user_id': 'cnt_users_by_item',
                         'timespent': 'mean_time_by_item',
                         'good': 'mean_good_by_item',
                         'reaction_abs': 'mean_abs_react_by_item'})
            item_stats['mean_abs_react_by_item'] = item_stats['mean_abs_react_by_item'].astype('float32')
            return item_stats

        feature_item_df = self._load_or_build('feature_item_df.parquet.gzip', build)

        sums_5m = self.train_df.iloc[-5000000:].groupby('item_id')[['timespent', 'good']].sum()
        sums_1m = self.train_df.iloc[-1000000:].groupby('item_id')[['timespent', 'good']].sum()
        item_ids = feature_item_df['item_id']

        # Items absent from the recent window get 0
        feature_item_df['pretarget_time_sum_5m'] = item_ids.map(sums_5m['timespent']).fillna(0)
        feature_item_df['pretarget_time_sum_1m'] = item_ids.map(sums_1m['timespent']).fillna(0)
        feature_item_df['pretarget_good_sum_5m'] = item_ids.map(sums_5m['good']).fillna(0)
        feature_item_df['pretarget_good_sum_1m'] = item_ids.map(sums_1m['good']).fillna(0)

        # +0.1 keeps the ratio finite when the 5M-row sum is zero
        feature_item_df['pretarget_prc'] = feature_item_df['pretarget_time_sum_1m']/(
            feature_item_df['pretarget_time_sum_5m']+0.1)

        feature_item_df['source_id'] = item_ids.map(self.item_source_dct)

        return feature_item_df

    def get_feature_item_emb_cosine_df(self, top_feature_item_df):
        """Best embedding similarity between a candidate item and a user's history.

        For every item of ``top_feature_item_df`` the train items with positive
        ``timespent`` whose embedding cosine exceeds ``COSINE_THRESHOLD`` are
        found (the item itself excluded). Each candidate is then paired with
        every user who spent time on one of those neighbours, keeping the
        highest cosine per (candidate, user).

        Columns: ``item_id`` (the candidate), ``user_id``, ``cosine``.
        """
        def build():
            embedding_dct = self.item_df.set_index('item_id')['embeddings'].apply(np.array).to_dict()
            # Computed once; the original re-filtered the train frame per chunk
            positive_pairs = self.train_df.loc[self.train_df['timespent'] > 0, ['item_id', 'user_id']]
            item_arr = positive_pairs['item_id'].unique()
            embed_matrix = np.array([embedding_dct[item_id] for item_id in item_arr])
            candidate_ids = top_feature_item_df.item_id.values

            neighbour_chunks = []
            for item_id_arr in tqdm(np.array_split(candidate_ids, 50)):
                if len(item_id_arr) == 0:
                    # fewer candidates than chunks
                    continue
                matrix_one = np.array([embedding_dct[item_id] for item_id in item_id_arr])
                one_cosine = cosine_similarity(matrix_one, embed_matrix)
                # Keep only the pairs above the threshold instead of building
                # a full-width frame per candidate and filtering it afterwards
                rows, cols = np.nonzero(one_cosine > self.COSINE_THRESHOLD)
                neighbour_chunks.append(pd.DataFrame({'item_recommend_id': item_id_arr[rows],
                                                      'item_id': item_arr[cols],
                                                      'cosine': one_cosine[rows, cols]}))

            recom_df = pd.concat(neighbour_chunks).reset_index(drop=True)
            recom_df = recom_df[recom_df['item_recommend_id']!=recom_df['item_id']].reset_index(drop=True)

            per_user_chunks = []
            for item_id_arr in tqdm(np.array_split(candidate_ids, 100)):
                chunk_df = recom_df[recom_df['item_recommend_id'].isin(item_id_arr)].merge(
                    positive_pairs, on='item_id')
                chunk_df = chunk_df.sort_values('cosine', ascending=False).groupby(
                    ['item_recommend_id', 'user_id']).head(1)
                per_user_chunks.append(
                    chunk_df[['item_recommend_id', 'user_id', 'cosine']].rename(
                        columns={'item_recommend_id': 'item_id'}))

            full_recom_df = pd.concat(per_user_chunks).sort_values(
                'cosine', ascending=False).groupby(['item_id', 'user_id']).head(1).reset_index(drop=True)
            return full_recom_df[['item_id', 'user_id', 'cosine']]

        return self._load_or_build('feature_item_emb_cosine_df.parquet.gzip', build)

    def get_best_items(self, feature_item_df):
        """Cross product of the test users with the top ``N_BEST_ITEMS`` candidates.

        Candidates are ranked by ``pretarget_time_sum_5m``. Columns:
        ``user_id``, ``item_id``.
        """
        def build():
            is_candidate = feature_item_df['item_id'].isin(self.candidates_df.item_id)
            best_item_df = feature_item_df[is_candidate].sort_values(
                'pretarget_time_sum_5m', ascending=False).head(self.N_BEST_ITEMS)[['item_id']].assign(flg=1)
            target_user_df = self.test_user_df[['user_id']].drop_duplicates().assign(flg=1)
            # Joining on a constant column yields every (user, item) combination
            return best_item_df.merge(target_user_df, on='flg')[['user_id', 'item_id']]

        return self._load_or_build('best_150_df.parquet.gzip', build)

    def get_feature_als_df(self):
        """ALS recommendations for the test users.

        Columns: ``user_id``, ``item_id``, ``als_score``.
        """
        def build():
            interactions = self._get_interactions()
            als_model = AlternatingLeastSquares(factors=512,
                                                use_gpu=False,
                                                regularization=0.1,
                                                alpha=1,
                                                iterations=15)
            als_model.fit(interactions['matrix'])
            feature_als_df = self._recommend_for_test_users(als_model, 'als_score')
            gc.collect()
            return feature_als_df

        return self._load_or_build('feature_als_512_15_df.parquet.gzip', build)

    def get_feature_emb_als_tune_df(self):
        """Embedding-initialised ALS scores after an additional regular ``fit``.

        Columns: ``user_id``, ``item_id``, ``emb_als_score_tune`` (the cached
        file stores the score as ``emb_als_score``).
        """
        feature_emb_als_df = self._load_or_build(
            'feature_emb_als_df_3_3.parquet.gzip',
            lambda: self._build_emb_als_df(iterations=3, fine_tune=True))
        return feature_emb_als_df.rename(columns = {'emb_als_score':'emb_als_score_tune'})

    def get_feature_emb_als_df(self):
        """ALS scores with the item factors fixed to the item embeddings.

        Columns: ``user_id``, ``item_id``, ``emb_als_score``.
        """
        return self._load_or_build(
            'feature_emb_als_df.parquet.gzip',
            lambda: self._build_emb_als_df(iterations=15, fine_tune=False))

    def get_top_feature_item_df(self,feature_item_df):
        """Rows of ``feature_item_df`` for items with positive ``timespent``
        somewhere in the last 500000 train rows."""
        recent_df = self.train_df.iloc[-500000:]
        consumed_items = recent_df.loc[recent_df['timespent'] > 0, 'item_id'].unique()
        return feature_item_df[feature_item_df['item_id'].isin(consumed_items)]

    # ---------------------------------------------------------------- accessors

    def get_test_users(self):
        """Distinct test user ids, as a new array."""
        return self.test_user_df.user_id.unique()

    def get_train_df(self):
        """The training interactions frame."""
        return self.train_df

    def get_production_flg(self):
        """True when running in production mode."""
        return self.production_flg

    def get_target_df(self):
        """The target interactions frame (None in production mode)."""
        return self.target_df

    def get_path_to_save(self):
        """Directory used for the cached tables and result files."""
        return self.path_to_save

    # ----------------------------------------------------------------- assembly

    def make_result_df(self, input_df, feature_item_df, feature_als_df, feature_source_df,
                       feature_emb_als_df, feature_emb_als_tune_df, feature_item_emb_cosine_df, 
                       feature_source_user_df):
        """Join every feature table onto the candidate pairs of ``input_df``.

        Pairs already present in the training frame are dropped. Features that
        are missing for a pair are filled with 0. In offline mode the positive
        ``timespent`` of the target frame is attached as the label column
        (0 when the pair has no positive target interaction).

        ``feature_source_df`` and ``feature_source_user_df`` must carry their
        key columns (``source_id`` / ``user_id``) as regular columns.
        """
        result_df = input_df[['user_id','item_id']].drop_duplicates().reset_index(drop=True)

        # Flag pairs the user has already interacted with. Built on a separate
        # frame so self.train_df does not gain an 'already' column.
        train_df = self.train_df
        seen_mask = (train_df['user_id'].isin(result_df['user_id'].unique()) &
                     train_df['item_id'].isin(result_df['item_id']))
        seen_pairs = train_df.loc[seen_mask, ['user_id', 'item_id']].drop_duplicates().assign(already=1)
        result_df = result_df.merge(seen_pairs, how = 'left').fillna(0).reset_index(drop=True)

        result_df = result_df[result_df['already']==0].drop('already', axis = 1).reset_index(drop=True)
        result_df['source_id'] = result_df['item_id'].map(self.item_source_dct)

        # Each right-hand table is pre-filtered to the ids in play to keep the
        # merges small; the merges join on the shared key columns.
        result_df = result_df.merge(feature_item_df[feature_item_df['item_id'].isin(result_df['item_id'])], how = 'left') \
                 .merge(feature_source_user_df[
                         feature_source_user_df['user_id'].isin(result_df['user_id'].unique()) &
                         feature_source_user_df['source_id'].isin(result_df['source_id'])], 
                        how = 'left').reset_index(drop=True).fillna(0)

        result_df = result_df.merge(feature_als_df[feature_als_df['user_id'].isin(result_df['user_id'])], how = 'left') \
                             .merge(feature_emb_als_df[feature_emb_als_df['user_id'].isin(result_df['user_id'])], 
                                    how = 'left').fillna(0).reset_index(drop=True)
        result_df = result_df.merge(feature_emb_als_tune_df[feature_emb_als_tune_df['user_id'].isin(
            result_df['user_id'])], how = 'left').fillna(0).reset_index(drop=True)
        result_df = result_df.merge(feature_item_emb_cosine_df[feature_item_emb_cosine_df['user_id'].isin(result_df['user_id'])], 
                                    how = 'left').fillna(0).reset_index(drop=True)

        source_stats = feature_source_df.set_index('source_id')
        result_df['source_good_mean'] = result_df['source_id'].map(
            source_stats['source_good_mean'].to_dict()).fillna(0)
        result_df['source_good_sum'] = result_df['source_id'].map(
            source_stats['source_good_sum'].to_dict()).fillna(0)

        if not self.production_flg:
            result_df = result_df.merge(self.target_df[self.target_df['timespent']>0][
                ['user_id','item_id','timespent']], how = 'left').fillna(0)

        return result_df

    def main(self, data_path = './'):
        """Run the whole pipeline and write ten ``result_df_<i>.parquet.gzip`` files.

        The test users are shuffled with a fixed seed and split into ten
        chunks; one result file is written per chunk. The original code
        labelled chunks 0-1 "pretrain", 2-8 "train" and 9 "valid" without
        using those labels here.
        """
        os.makedirs(self.path_to_save, exist_ok=True)
        self.load_data(data_path)

        # features of source_id
        feature_source_df = self.get_feature_source_df()
        # features of pairs user_id - source_id
        feature_source_user_df = self.get_feature_source_user_df()
        # features of items
        feature_item_df = self.get_feature_item_df()
        # als pairs user_id, item_id, score
        feature_als_df = self.get_feature_als_df()
        # als pairs user_id, item_id, score fit from embedding vectors
        feature_emb_als_df = self.get_feature_emb_als_df()
        # other als pairs user_id, item_id, score fit from embedding vectors
        feature_emb_als_tune_df = self.get_feature_emb_als_tune_df()
        # pairs user_id - item_id for each best items
        best_150_df = self.get_best_items(feature_item_df)

        top_feature_item_df = self.get_top_feature_item_df(feature_item_df)

        # all pairs user_id - item_id for sources with positive timespents
        full_train_item_df = feature_source_user_df[feature_source_user_df['good_sum']>0].merge(
            top_feature_item_df)

        # searching best candidates by cosine similarity of embedding vectors
        feature_item_emb_cosine_df = self.get_feature_item_emb_cosine_df(top_feature_item_df)

        candidate_sources = [best_150_df,
                             full_train_item_df,
                             feature_als_df,
                             feature_emb_als_tune_df,
                             feature_emb_als_df,
                             feature_item_emb_cosine_df]
        input_df = pd.concat([source_df[['user_id','item_id']] for source_df in candidate_sources]
                             ).drop_duplicates().reset_index(drop=True)

        input_df = input_df[input_df['user_id'].isin(self.get_test_users())].reset_index(drop=True)
        print(input_df.shape)

        user_array = self.get_test_users()
        # Same permutation as np.random.seed(33) followed by np.random.shuffle,
        # without reseeding the process-wide generator
        np.random.RandomState(33).shuffle(user_array)
        user_lst = np.array_split(user_array,10)

        for i, tmp_user_arr in enumerate(tqdm(user_lst)):
            save_filepath = os.path.join(self.path_to_save, f'result_df_{i}.parquet.gzip')
            tmp_result_df = self.make_result_df(input_df[input_df['user_id'].isin(tmp_user_arr)], 
                                                feature_item_df, feature_als_df, feature_source_df,
                                                feature_emb_als_df, feature_emb_als_tune_df,
                                                feature_item_emb_cosine_df, feature_source_user_df)
            tmp_result_df = tmp_result_df.sort_values('user_id').reset_index(drop=True)
            tmp_result_df.to_parquet(save_filepath, compression='gzip')
     

# Build the offline dataset: train on all rows before the last 10M, target = rows from -10M to -5M

In [5]:
%%time

# Offline run: rows before the last 10M are the training part and the
# rows from -10M up to -5M are the target part.
fm_old = FeatureMaking(
    path_to_save='/srv/data/vk/old',
    iloc_start=-10_000_000,
    iloc_end=-5_000_000,
    production_flg=False,
)
fm_old.main()

(330649346, 2)


100%|██████████| 10/10 [25:51<00:00, 155.16s/it]

CPU times: user 25min 42s, sys: 3min 26s, total: 29min 9s
Wall time: 28min 36s





# Build the offline dataset: train on all rows before the last 5M, target = the last 5M rows

In [7]:
%%time

# Offline run: rows before the last 5M are the training part and the
# last 5M rows (no end bound) are the target part.
fm = FeatureMaking(
    path_to_save='/srv/data/vk/train',
    iloc_start=-5_000_000,
    iloc_end=None,
    production_flg=False,
)
fm.main()

(296480432, 2)


100%|██████████| 10/10 [22:34<00:00, 135.45s/it]

CPU times: user 22min 11s, sys: 3min 8s, total: 25min 20s
Wall time: 24min 38s





# Build the production dataset: train on the full train set; candidates and test users are read from their own files

In [5]:
%%time

# Production run: the whole interaction log is the training part, so the
# row bounds are not used.
test_fm = FeatureMaking(
    path_to_save='/srv/data/vk/test',
    iloc_start=None,
    iloc_end=None,
    production_flg=True,
)
test_fm.main()

(132615483, 2)


100%|██████████| 10/10 [09:12<00:00, 55.30s/it]

CPU times: user 9min 3s, sys: 1min 18s, total: 10min 21s
Wall time: 10min 2s



