In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib notebook
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings('ignore')
from datetime import date, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from catboost import CatBoost, CatBoostRegressor
import csv
import os
from hyperopt import hp, STATUS_OK, tpe, fmin, Trials
from hyperopt.pyll.stochastic import sample
from timeit import default_timer as timer

In [3]:
customers_tr = pd.read_csv('data_like/customer_train.csv')
stories = pd.read_csv('data_like/stories_description.csv')
reaction = pd.read_csv('data_like/stories_reaction_train.csv')
transactions = pd.read_csv('data_like/transactions.csv')
customers_te = pd.read_csv('data_like/customer_test.csv')
reaction_te = pd.read_csv('data_like/stories_reaction_test.csv')
transactions = pd.read_csv('data_like/transactions.csv')

In [5]:
def mcc_to_category(x):
    if(x in list(range(3000,3300)) + [ 4304, 4415, 4418, 4511, 4582]):
        return 'Авиабилеты'
    elif(x in [5511, 5521, 5531, 5532, 5533, 5571, 7012, 7531, 7534, 7535, 7538, 7542, 7549]):
        return 'Автоуслуги'
    elif(x in list(range(3351,3399)) + list(range(3400,3411)) + list(range(3412,3424)) + list(range(3425,3440)) + [ 3441, 7512, 7513, 7519]):
        return 'Аренда авто'
    elif(x in [5122, 5292, 5295, 5912]):
        return 'Аптеки'
    elif(x in [1520, 1711, 1731, 1740,1750, 1761, 1771, 1799, 2791, 2842, 5021, 5039, 5046, 5051, 5065, 5072, 5074, 5085, 5198, 5200, 5211, 5231, 5251, 5261, 5415, 5712, 5713, 5714, 5718, 5719, 5722, 7622, 7623, 7629, 7641, 7692, 7699]):
        return 'Дом, Ремонт'
    elif(x in [4011, 4112]):
        return 'Ж/д билеты'
    elif(x in [5995]):
        return 'Животные'
    elif(x in [5932, 5937, 5970, 5971, 5972,5973]):
        return 'Искусство'
    elif(x in [7829, 7832, 7841]):
        return 'Кино'
    elif(x in [2741, 5111, 5192, 5942, 5994]):
        return 'Книги'
    elif(x in [5977,7230,7297,7298]):
        return 'Красота'
    elif(x in [5733, 5735]):
        return 'Музыка'
    elif(x in [5094, 5137, 5139, 5611, 5621, 5631, 5641, 5651, 5661, 5681, 5691, 5697, 5698, 5699, 5931, 5944, 5949, 5950, 7296, 7631
]):
        return 'Одежда, Обувь, Ювелирные изделия и часы'
    elif(x in [7911, 7922, 7929, 7932, 7933, 7941, 7991, 7992, 7993, 7994, 7996, 7997, 7998, 7999, 8664]):
        return 'Развлечения'
    elif(x in [5811, 5812, 5813]):
        return 'Рестораны'
    elif(x in [5655, 5940, 5941]):
        return 'Спорттовары'
    elif(x in [5947]):
        return 'Сувениры'
    elif(x in [5297, 5298, 5300, 5411, 5412, 5422, 5441, 5451, 5462, 5499, 5715, 5921]):
        return 'Супермаркеты'
    elif(x in [5172, 5541, 5542, 5983]):
        return 'Топливо'
    elif(x in [4111, 4121, 4131, 4457, 4468, 4784, 4789, 5013, 5271, 5551, 5561, 5592, 5598, 5599, 7511, 7523]):
        return 'Транспорт'
    elif(x in [5814]):
        return 'Фаст фуд'
    elif(x in [5044, 5045, 5946, 7332, 7333, 7338, 7339, 7395]):
        return 'Фото, Видео'
    elif(x in [5193, 5992]):
        return 'Цветы'
    elif(x in [5309]):
        return 'Duty Free'
    
    

In [6]:
transactions['merchant_mcc'] = transactions['merchant_mcc'].apply(mcc_to_category)
transactions['merchant_mcc'].fillna('Другое', inplace=True)

In [7]:
all_transactions = transactions.groupby(['customer_id', 'merchant_mcc'])['transaction_amt'].sum()

In [8]:
all_transactions = all_transactions.unstack(level=-1).reset_index()
all_transactions = all_transactions.set_index('customer_id', drop=True)

In [9]:
customers_tr.set_index('customer_id', inplace = True)
customers_te.set_index('customer_id', inplace = True)

In [10]:
customers_tr = customers_tr.join(all_transactions)
customers_te = customers_te.join(all_transactions)

In [127]:
# all_transactions = transactions.groupby(['customer_id', 'merchant_mcc'])['transaction_amt'].count()
# all_transactions = all_transactions.unstack(level=-1).reset_index()
# all_transactions = all_transactions.set_index('customer_id', drop=True)

In [11]:
stories['name'] = stories['story_json'].apply(lambda x: ','.join(re.findall(r'"name":"([^"]+)"', x)))

In [12]:
stories['text'] = stories['story_json'].apply(lambda x: ','.join(re.findall(r'"text":"([^"]+)"', x)))

In [13]:
reaction['event_dttm'] = pd.to_datetime(reaction['event_dttm'])
reaction.set_index('event_dttm', inplace=True)
reaction['Morning'] = (reaction.index.hour<12).astype('int')
reaction['Evening'] = (reaction.index.hour>18).astype('int')
reaction['Midday'] = np.all([reaction.index.hour<=18, reaction.index.hour>=12], axis=0).astype('int')
reaction_te['event_dttm'] = pd.to_datetime(reaction_te['event_dttm'])
reaction_te.set_index('event_dttm', inplace=True)
reaction_te['Morning'] = (reaction_te.index.hour<12).astype('int')
reaction_te['Evening'] = (reaction_te.index.hour>18).astype('int')
reaction_te['Midday'] = np.all([reaction_te.index.hour<=18, reaction_te.index.hour>=12], axis=0).astype('int')

In [14]:
reaction.reset_index(inplace=True)
reaction_te.reset_index(inplace=True)

In [15]:
reaction = reaction.set_index('customer_id').join(customers_tr)
reaction_te = reaction_te.set_index('customer_id').join(customers_te)

In [236]:
reaction['life_time'] = (reaction['event_dttm'] - pd.to_datetime(reaction['first_session_dttm'])).astype('timedelta64[D]').fillna(-1).astype('int')
reaction_te['life_time'] = (reaction_te['event_dttm'] - pd.to_datetime(reaction_te['first_session_dttm'])).astype('timedelta64[D]').fillna(-1).astype('int')

In [17]:
reaction.isnull().sum()

event_dttm                                      0
story_id                                        0
event                                           0
Morning                                         0
Evening                                         0
Midday                                          0
product_0                                  460949
product_1                                  327250
product_2                                  381085
product_3                                  352388
product_4                                  450820
product_5                                   98814
product_6                                  437861
gender_cd                                   38291
age                                         38071
marital_status_cd                          259536
children_cnt                                87844
first_session_dttm                          36631
job_position_cd                             36597
job_title                                  257797


In [271]:
target = 'event'
cat_features = ['story_id', 'job_position_cd', 'marital_status_cd', 'gender_cd', 'Midday', 'Evening', 'Morning'] + ['product_{}'.format(i) for i in range(6)]
num_features = list(all_transactions.columns) + ['age', 'children_cnt', 'life_time', 'importance']

In [272]:
X_train = reaction[cat_features+num_features]
y_train = reaction[target]
X_test =  reaction_te[cat_features+num_features]


In [282]:
y_res = reaction[target]

In [274]:
X_train[list(all_transactions.columns)] = X_train[list(all_transactions.columns)].fillna(0)
X_test[list(all_transactions.columns)] = X_test[list(all_transactions.columns)].fillna(0)
X_train[cat_features] = X_train[cat_features].fillna('Na')
X_train[num_features] = X_train[num_features].fillna(-1)
X_test[cat_features] = X_test[cat_features].fillna('Na')
X_test[num_features] = X_test[num_features].fillna(-1)

In [275]:
X_train.isnull().sum()

story_id                                   0
job_position_cd                            0
marital_status_cd                          0
gender_cd                                  0
Midday                                     0
Evening                                    0
Morning                                    0
product_0                                  0
product_1                                  0
product_2                                  0
product_3                                  0
product_4                                  0
product_5                                  0
Duty Free                                  0
Авиабилеты                                 0
Автоуслуги                                 0
Аптеки                                     0
Аренда авто                                0
Дом, Ремонт                                0
Другое                                     0
Ж/д билеты                                 0
Животные                                   0
Искусство 

In [280]:
d = {'view':0.4, 'skip':-0.4, 'like':1, 'dislike':-1}
y_train = reaction[target].apply(lambda x: d[x])

In [283]:
l = {'view':0.1, 'skip':-0.1, 'like':0.5, 'dislike':-10}
y_res = y_res.apply(lambda x: l[x])

In [284]:
def metr(y_res, y_pred):
    return sum(y_pred * y_res)

In [285]:
metr(y_res,y_train)


110488.6599992044

In [289]:
from hyperopt import fmin, tpe, hp
import csv
from catboost import CatBoostRegressor
import hyperopt
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import KFold 
def cat_cv(alg, X_train, y_train,cat_feat_pos , n=3):
    skf = KFold(n_splits=n, random_state=42, shuffle=True )
    errors = []
    for train_index, test_index in skf.split(X_train, y_train):
        train, test = X_train.iloc[train_index], X_train.iloc[test_index]
        target_train, target_test = y_train.iloc[train_index], y_train.iloc[test_index]
        res = y_res.iloc[test_index]
        alg.fit(train, target_train, eval_set=(test, target_test),cat_features=cat_feat_pos)
        error = metr(res,target_test) - metr(res,alg.predict(test))
#         error = alg.get_best_score()['learn']['Logloss']
        n_trees = alg.get_best_iteration()
        errors.append(error)
    return np.mean(np.array(errors)), n_trees



global ITERATION
def objective(params, early_stopping_rounds=50):
    """Objective function for Gradient Boosting Machine Hyperparameter Optimization"""

    # Keep track of evals
    global ITERATION
    ITERATION += 1

    # Make sure parameters that need to be integers are integers
    for parameter_name in ['max_depth']:
        params[parameter_name] = int(params[parameter_name])

    start = timer()
    # Perform n_folds cross validation
    cat = CatBoostRegressor(iterations=4000, verbose=0,
                        has_time=True, use_best_model=True,thread_count=-1, **params)

    cv_results, n_estimators = cat_cv(cat, X_train, y_train, categorical_features_pos)
    run_time = timer() - start
    # Extract the best score
    best_score = -cv_results

    # Loss must be minimized
    loss = cv_results

    # Boosting rounds that returned the highest cv score


    # Write to the csv file ('a' means append)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([loss, params, ITERATION, n_estimators, run_time])

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'iterations': n_estimators, 'train_time': run_time, 'status': STATUS_OK}



space = {'loss_function':'RMSE',
                           'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                           'max_depth':hp.quniform('max_depth', 3, 10, 1),
                           'eval_metric':'RMSE',
                           'bootstrap_type':'Bernoulli',
                           'subsample':0.5,
                           'l2_leaf_reg':hp.uniform('l2_leaf_reg', 0.0, 1.0),
                           'random_seed':42,
                            'od_type':'IncToDec',
                           'od_pval':0.01,
                           'max_ctr_complexity':1,
                           'random_strength':hp.uniform('random_strength', 0.0, 1),
                           'leaf_estimation_method':'Newton'
                           
                           }


def column_index(df, query_cols):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols,query_cols,sorter=sidx)]
categorical_features_pos = column_index(X_train,cat_features)



# File to save first results
out_file = 'cat_new_feat_ap.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)

# Write the headers to the file
writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])
of_connection.close()


In [287]:
reaction.to_csv('reaction_train_prep.csv', index=None)
reaction_te.to_csv('reaction_test_prep.csv', index=None)

In [None]:
from hyperopt import Trials
ITERATION = 0
bayes_trials = Trials()
# Run optimization
best = fmin(fn = objective, space = space, algo = tpe.suggest,
            max_evals = 5, trials = bayes_trials, rstate =np.random.RandomState(42))

  0%|          | 0/5 [00:00<?, ?it/s, best loss: ?]

In [376]:
d = pd.read_csv('cat_new_feat_ap.csv')

In [381]:
d.iloc[1]['params']

"{'bootstrap_type': 'Bernoulli', 'eval_metric': 'RMSE', 'l2_leaf_reg': 0.31579959348704867, 'leaf_estimation_method': 'Newton', 'learning_rate': 0.1561278943249286, 'loss_function': 'RMSE', 'max_ctr_complexity': 1, 'max_depth': 8, 'od_pval': 0.01, 'od_type': 'IncToDec', 'random_seed': 42, 'random_strength': 0.41010395885331374, 'subsample': 0.5}"

In [382]:
params = {'bootstrap_type': 'Bernoulli', 'eval_metric': 'RMSE', 'l2_leaf_reg': 0.31579959348704867, 'leaf_estimation_method': 'Newton', 'learning_rate': 0.1561278943249286, 'loss_function': 'RMSE', 'max_ctr_complexity': 1, 'max_depth': 8, 'od_pval': 0.01, 'od_type': 'IncToDec', 'random_seed': 42, 'random_strength': 0.41010395885331374, 'subsample': 0.5}

In [383]:
model = CatBoostRegressor(iterations=1354, verbose=0,
                        has_time=True,thread_count=-1, **params)

In [384]:
model.fit(X_train, y_train, cat_features=categorical_features_pos)

<catboost.core.CatBoostRegressor at 0x1305e9d30>

In [57]:
submit = k

In [58]:
submit['score'][submit['score']<=-0.08] = -1
submit['score'][submit['score']>=0.08] = 1

In [416]:
min(submit['score'])

-1.0

In [398]:
submit['score'] = prediction
submit['answer_id'] = reaction_te['answer_id'].values
submit.sort_values('answer_id', inplace=True)

In [59]:
submit.to_csv('submit.csv', index=None)

In [55]:
k = pd.read_csv('submit2.csv')

In [28]:
import gensim
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile

In [29]:
from nltk.tokenize import word_tokenize
import re

In [91]:
stories.head()
stories.text = stories.text + stories.name

Unnamed: 0,story_id,story_json,name,text,text_t
0,127,"{""guid"":""770a5bae-0e3f-4a6b-b924-bd87bd51a038""...",Изменить пин-код,"Как изменить ПИН-код,Зайдите с главной в меню ...","[как, изменить, пинкод, зайдите, с, главной, в..."
1,254,"{""guid"":""64f4c9ef-647b-4e04-b4d4-02297e939388""...",Пополнить «Стрелку»,"Как пополнить карту «Стрелка»,В приложении Тин...","[как, пополнить, карту, стрелка, в, приложении..."
2,865,"{""guid"":""3482206b-d223-4aec-92ba-0150055cd68a""...",Тинькофф Инвестиции 2.0,"Тинькофф Инвестиции 2.0,У нас хорошие новости ...","[тинькофф, инвестиции, у, нас, хорошие, новост..."
3,1491,"{""guid"":""5f4a9215-01de-4777-b70f-a18899db8f1c""...",Что делать на Faces & Laces,"Выставка уличной культуры,18 и 19 августа прой...","[выставка, уличной, культуры, и, августа, прой..."
4,598,"{""guid"":""ed8754bd-67be-4fa1-9289-5508d96f1fa4""...",Необычные виды спорта,"Необычные виды спорта,Если бег, бассейн и трен...","[необычные, виды, спорта, если, бег, бассейн, ..."


In [142]:
reg = re.compile('[^a-zA-Zа-яА-Я ]')
stories['text_t'] = stories['text'].apply(lambda x: [reg.sub('', w.lower()) for w in word_tokenize(x) if len(reg.sub('', w.lower())) != 0])

In [45]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('russian'))
stemmer = SnowballStemmer('russian')
stories['text_t'] = stories['text_t'].apply(lambda x: [stemmer.stem(w) for w in x if not(stemmer.stem(w) in stop_words)])


In [117]:
model = Word2Vec(stories.text_t.values, size=100, window=5, min_count=1, workers=4)

In [70]:
from gensim.models import KeyedVectors
ru_emb = KeyedVectors.load_word2vec_format("cc.ru.300.vec")

In [121]:
model.most_similar('спорт')

[('забронировать', 0.9992223381996155),
 ('карта', 0.999220609664917),
 ('то', 0.9992198944091797),
 ('века', 0.999211311340332),
 ('подробный', 0.9992110133171082),
 ('самые', 0.9992109537124634),
 ('ребенка', 0.9992079734802246),
 ('подробнее', 0.9992070198059082),
 ('находится', 0.9992029666900635),
 ('эти', 0.9992011189460754)]

In [104]:
ru_emb.most_similar('авто', )

KeyError: "word 'пинкод' not in vocabulary"

In [92]:
l = list(all_transactions.columns)


In [125]:
model = gensim.models.KeyedVectors.load_word2vec_format("ruwikiruscorpora_0_300_20.bin.gz", binary=True)

In [205]:
def find_vector(w, text):
    k = 0
    s = 0
    for i in text:
        try:
            s += np.array(ru_emb.distances(i, w)).mean()
            k+=1
        except KeyError:
            continue
    return s/k     

In [161]:
ru_emb.distance(l[1],stories.text_t.iloc[0][3])

0.859527079375292

In [165]:
find_vector(l[1], stories.text_t.iloc[0])

0.9286444546460367

In [180]:

l_t = [[reg.sub('', w.lower()) for w in word_tokenize(x) if (len(reg.sub('', w.lower())) != 0 and not(stemmer.stem(w) in stop_words))] for x in l[1:]]


In [198]:
s = 0
for i in stories.text_t[0]:
    try:
        s += np.array(ru_emb.distances(i, l_t[0])).mean()
    except KeyError:
        print(i)
s

пинкод
пинкоды
пинкод
пинкод
пинкод


28.110132098197937

In [210]:
i

['авиабилеты']

In [176]:
l_t = [[stemmer.stem(w) for w in x if not(stemmer.stem(w) in stop_words)] for x in l_t]

In [209]:
l_t

[['авиабилеты'],
 ['автоуслуги'],
 ['аптеки'],
 ['аренда', 'авто'],
 ['дом', 'ремонт'],
 ['другое'],
 ['жд', 'билеты'],
 ['животные'],
 ['искусство'],
 ['кино'],
 ['книги'],
 ['красота'],
 ['музыка'],
 ['одежда', 'обувь', 'ювелирные', 'изделия', 'часы'],
 ['развлечения'],
 ['рестораны'],
 ['спорттовары'],
 ['сувениры'],
 ['супермаркеты'],
 ['топливо'],
 ['транспорт'],
 ['фаст', 'фуд'],
 ['фото', 'видео'],
 ['цветы']]

In [211]:
for i in l_t:
    stories[i[0]] = stories.text_t.apply(lambda x: find_vector(i, x))

In [228]:
stories.head()

Unnamed: 0,story_id,авиабилеты,автоуслуги,аптеки,аренда,дом,другое,жд,животные,искусство,...,развлечения,рестораны,спорттовары,сувениры,супермаркеты,топливо,транспорт,фаст,фото,цветы
0,127,0.906778,0.953502,0.90569,0.923377,0.893324,0.817001,0.918581,0.931926,0.905379,...,0.888865,0.912489,0.924507,0.91902,0.926209,0.905518,0.9181,0.948868,0.874849,0.92001
1,254,0.872409,0.907323,0.900153,0.89622,0.862606,0.856723,0.885283,0.915197,0.896501,...,0.879303,0.878094,0.893845,0.893199,0.897436,0.855328,0.849911,0.910641,0.873019,0.895175
2,865,0.8975,0.90992,0.895214,0.883715,0.870783,0.833544,0.899797,0.90187,0.883789,...,0.881262,0.881879,0.9179,0.888984,0.881572,0.86248,0.881422,0.925201,0.882843,0.883104
3,1491,0.913072,0.91936,0.916075,0.906465,0.882793,0.853376,0.905647,0.873068,0.867013,...,0.878321,0.875934,0.890458,0.865709,0.891342,0.896931,0.899548,0.924245,0.884947,0.857741
4,598,0.929584,0.925462,0.926762,0.894105,0.883993,0.811031,0.920643,0.87864,0.839577,...,0.854622,0.883314,0.886664,0.873004,0.898451,0.877529,0.890146,0.916936,0.88354,0.87468


In [220]:
reaction.join(stories)

SyntaxError: invalid syntax (<ipython-input-220-f8467792a5ec>, line 1)

In [231]:
reaction = reaction.reset_index().set_index('story_id').join(stories.set_index('story_id'))
reaction_te = reaction_te.reset_index().set_index('story_id').join(stories.set_index('story_id'))

In [232]:
reaction['importance'] = 0
reaction_te['importance'] = 0

In [238]:
reaction[list(all_transactions.columns)] = reaction[list(all_transactions.columns)].fillna(0)
reaction_te[list(all_transactions.columns)] = reaction_te[list(all_transactions.columns)].fillna(0)

In [242]:
reaction['sum'] = reaction[list(all_transactions.columns)].sum(axis=1)
reaction_te['sum'] = reaction_te[list(all_transactions.columns)].sum(axis=1)

In [243]:
reaction.head()

Unnamed: 0_level_0,customer_id,event_dttm,event,Morning,Evening,Midday,product_0,product_1,product_2,product_3,...,спорттовары,сувениры,супермаркеты,топливо,транспорт,фаст,фото,цветы,importance,sum
story_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
123,563,2018-07-09 18:43:41,view,0,0,1,,UTL,,,...,0.920128,0.861151,0.87378,0.897054,0.898076,0.915913,0.849842,0.862938,0,0.0
123,650,2018-07-17 01:25:08,view,1,0,0,,UTL,,,...,0.920128,0.861151,0.87378,0.897054,0.898076,0.915913,0.849842,0.862938,0,0.0
123,1724,2018-06-10 09:05:06,skip,1,0,0,UTL,UTL,,,...,0.920128,0.861151,0.87378,0.897054,0.898076,0.915913,0.849842,0.862938,0,27500.0
123,2200,2018-06-30 06:40:20,skip,1,0,0,,UTL,,,...,0.920128,0.861151,0.87378,0.897054,0.898076,0.915913,0.849842,0.862938,0,74000.0
123,2917,2018-07-25 04:46:13,view,1,0,0,,UTL,,,...,0.920128,0.861151,0.87378,0.897054,0.898076,0.915913,0.849842,0.862938,0,13000.0


In [249]:
k = list(all_transactions.columns[1:])
for i in range(len(k)):
    reaction['importance'] += reaction[k[i]] * reaction[l_t[i][0]] 
    reaction_te['importance'] += reaction_te[k[i]] * reaction_te[l_t[i][0]]
reaction['importance'] /= reaction['sum']
reaction_te['importance'] /= reaction_te['sum']

In [257]:
l_t + ['sum']

[['авиабилеты'],
 ['автоуслуги'],
 ['аптеки'],
 ['аренда', 'авто'],
 ['дом', 'ремонт'],
 ['другое'],
 ['жд', 'билеты'],
 ['животные'],
 ['искусство'],
 ['кино'],
 ['книги'],
 ['красота'],
 ['музыка'],
 ['одежда', 'обувь', 'ювелирные', 'изделия', 'часы'],
 ['развлечения'],
 ['рестораны'],
 ['спорттовары'],
 ['сувениры'],
 ['супермаркеты'],
 ['топливо'],
 ['транспорт'],
 ['фаст', 'фуд'],
 ['фото', 'видео'],
 ['цветы'],
 'sum']

In [265]:
m = ['автоуслуги', 'аптеки', 'аренда', 'дом', 'другое', 'жд', 'животные',
       'искусство', 'кино', 'книги', 'красота', 'музыка', 'одежда',
       'развлечения', 'рестораны', 'спорттовары', 'сувениры', 'супермаркеты',
       'топливо', 'транспорт', 'фаст', 'фото', 'цветы']
reaction.drop(m + ['sum'], inplace=True, axis=1)
reaction_te.drop(m + ['sum'], inplace=True, axis=1)

In [266]:
reaction

Unnamed: 0_level_0,customer_id,event_dttm,event,Morning,Evening,Midday,product_0,product_1,product_2,product_3,...,Спорттовары,Сувениры,Супермаркеты,Топливо,Транспорт,Фаст фуд,"Фото, Видео",Цветы,life_time,importance
story_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
123,563,2018-07-09 18:43:41,view,0,0,1,,UTL,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1274,
123,650,2018-07-17 01:25:08,view,1,0,0,,UTL,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,
123,1724,2018-06-10 09:05:06,skip,1,0,0,UTL,UTL,,,...,0.0,0.0,14500.0,0.0,0.0,0.0,0.0,0.0,1255,0.861215
123,2200,2018-06-30 06:40:20,skip,1,0,0,,UTL,,,...,0.0,0.0,8500.0,11500.0,1500.0,0.0,0.0,0.0,2187,0.859311
123,2917,2018-07-25 04:46:13,view,1,0,0,,UTL,,,...,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1088,0.847094
123,3644,2018-06-08 23:06:22,skip,0,1,0,,UTL,,,...,0.0,0.0,11500.0,18000.0,0.0,0.0,0.0,0.0,808,0.887189
123,5385,2018-07-30 16:23:47,skip,0,0,1,UTL,UTL,,,...,0.0,0.0,10000.0,0.0,0.0,2500.0,0.0,0.0,1709,0.870500
123,5993,2018-06-16 10:11:15,skip,1,0,0,,UTL,,,...,0.0,0.0,13000.0,4500.0,1000.0,5000.0,0.0,0.0,2197,0.863703
123,7385,2018-07-31 06:54:17,skip,1,0,0,,UTL,,,...,0.0,0.0,23000.0,0.0,0.0,0.0,0.0,0.0,2234,0.874467
123,8387,2018-07-13 02:14:04,skip,1,0,0,,UTL,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,0.890855


In [264]:
reaction.columns


Index(['customer_id', 'event_dttm', 'event', 'Morning', 'Evening', 'Midday',
       'product_0', 'product_1', 'product_2', 'product_3', 'product_4',
       'product_5', 'product_6', 'gender_cd', 'age', 'marital_status_cd',
       'children_cnt', 'first_session_dttm', 'job_position_cd', 'job_title',
       'Duty Free', 'Авиабилеты', 'Автоуслуги', 'Аптеки', 'Аренда авто',
       'Дом, Ремонт', 'Другое', 'Ж/д билеты', 'Животные', 'Искусство', 'Кино',
       'Книги', 'Красота', 'Музыка', 'Одежда, Обувь, Ювелирные изделия и часы',
       'Развлечения', 'Рестораны', 'Спорттовары', 'Сувениры', 'Супермаркеты',
       'Топливо', 'Транспорт', 'Фаст фуд', 'Фото, Видео', 'Цветы', 'life_time',
       'автоуслуги', 'аптеки', 'аренда', 'дом', 'другое', 'жд', 'животные',
       'искусство', 'кино', 'книги', 'красота', 'музыка', 'одежда',
       'развлечения', 'рестораны', 'спорттовары', 'сувениры', 'супермаркеты',
       'топливо', 'транспорт', 'фаст', 'фото', 'цветы', 'importance', 'sum'],
      dtyp

In [269]:
reaction.reset_index(inplace=True)
reaction_te.reset_index(inplace=True)

In [270]:
reaction.head()

Unnamed: 0,story_id,customer_id,event_dttm,event,Morning,Evening,Midday,product_0,product_1,product_2,...,Спорттовары,Сувениры,Супермаркеты,Топливо,Транспорт,Фаст фуд,"Фото, Видео",Цветы,life_time,importance
0,123,563,2018-07-09 18:43:41,view,0,0,1,,UTL,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1274,
1,123,650,2018-07-17 01:25:08,view,1,0,0,,UTL,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,
2,123,1724,2018-06-10 09:05:06,skip,1,0,0,UTL,UTL,,...,0.0,0.0,14500.0,0.0,0.0,0.0,0.0,0.0,1255,0.861215
3,123,2200,2018-06-30 06:40:20,skip,1,0,0,,UTL,,...,0.0,0.0,8500.0,11500.0,1500.0,0.0,0.0,0.0,2187,0.859311
4,123,2917,2018-07-25 04:46:13,view,1,0,0,,UTL,,...,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,1088,0.847094
