# Pipeline for final model

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import pandas as pd
from datetime import datetime, date, time, timedelta
from sklearn.preprocessing import OneHotEncoder
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
import numpy as np

In [73]:
model_rf = pickle.load(open('finalized_model.sav', 'rb'))
train = pd.read_csv('data/X.csv', index_col='Unnamed: 0')


In [74]:
submission = pd.read_csv('data/sample_submission.csv')

In [75]:
X = pd.read_csv('data/test.csv')
test_id = X.TransactionId

X['TransactionStartTime'] = pd.to_datetime(X['TransactionStartTime'], format='%Y-%m-%dT%H:%M:%SZ')
X['Hour'] = X['TransactionStartTime'].dt.hour

X.loc[X['Amount'] >= 0, 'DirectionOfMoney'] = 0
X.loc[X['Amount'] < 0, 'DirectionOfMoney'] = 1

cat_var = ['PricingStrategy', 'ProviderId', 'ProductId', 'ChannelId', 'ProductCategory', 'Hour']
con_variables = ['Value', 'DirectionOfMoney']
features_cat = pd.get_dummies(X[cat_var], columns=cat_var)
features_cat
data = features_cat.merge(X[con_variables], left_index=True, right_index=True, how='inner')
data.columns

Index(['PricingStrategy_0', 'PricingStrategy_1', 'PricingStrategy_2',
       'PricingStrategy_4', 'ProviderId_ProviderId_1',
       'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3',
       'ProviderId_ProviderId_4', 'ProviderId_ProviderId_5',
       'ProviderId_ProviderId_6', 'ProductId_ProductId_1',
       'ProductId_ProductId_10', 'ProductId_ProductId_11',
       'ProductId_ProductId_13', 'ProductId_ProductId_14',
       'ProductId_ProductId_15', 'ProductId_ProductId_16',
       'ProductId_ProductId_17', 'ProductId_ProductId_18',
       'ProductId_ProductId_19', 'ProductId_ProductId_2',
       'ProductId_ProductId_20', 'ProductId_ProductId_21',
       'ProductId_ProductId_22', 'ProductId_ProductId_23',
       'ProductId_ProductId_24', 'ProductId_ProductId_25',
       'ProductId_ProductId_26', 'ProductId_ProductId_27',
       'ProductId_ProductId_3', 'ProductId_ProductId_4',
       'ProductId_ProductId_5', 'ProductId_ProductId_6',
       'ProductId_ProductId_7', 'ProductId_Product

In [84]:
print(data.columns== train.columns)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False]


In [76]:
train.columns

Index(['PricingStrategy_0', 'PricingStrategy_1', 'PricingStrategy_2',
       'PricingStrategy_4', 'ProviderId_ProviderId_1',
       'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3',
       'ProviderId_ProviderId_4', 'ProviderId_ProviderId_5',
       'ProviderId_ProviderId_6', 'ProductId_ProductId_1',
       'ProductId_ProductId_10', 'ProductId_ProductId_11',
       'ProductId_ProductId_13', 'ProductId_ProductId_14',
       'ProductId_ProductId_15', 'ProductId_ProductId_16',
       'ProductId_ProductId_19', 'ProductId_ProductId_2',
       'ProductId_ProductId_20', 'ProductId_ProductId_21',
       'ProductId_ProductId_22', 'ProductId_ProductId_23',
       'ProductId_ProductId_24', 'ProductId_ProductId_27',
       'ProductId_ProductId_3', 'ProductId_ProductId_4',
       'ProductId_ProductId_5', 'ProductId_ProductId_6',
       'ProductId_ProductId_7', 'ProductId_ProductId_8',
       'ProductId_ProductId_9', 'ChannelId_ChannelId_1',
       'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3

In [82]:
predictions = model_rf.predict(data)

Feature names must be in the same order as they were in fit.

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.2s finished


In [78]:
results = pd.DataFrame({'TransactionId': test_id,
'FraudResult': predictions})


In [79]:
results.to_csv('result.csv', index=False)

In [46]:
np.savetxt("predictions.txt", delimiter=',')

TypeError: _savetxt_dispatcher() missing 1 required positional argument: 'X'

In [83]:
sum(predictions)

0

In [31]:
train.columns

Index(['Value', 'PricingStrategy', 'Hour', 'DirectionOfMoney',
       'ProviderId_ProviderId_1', 'ProviderId_ProviderId_2',
       'ProviderId_ProviderId_3', 'ProviderId_ProviderId_4',
       'ProviderId_ProviderId_5', 'ProviderId_ProviderId_6',
       'ProductId_ProductId_1', 'ProductId_ProductId_10',
       'ProductId_ProductId_11', 'ProductId_ProductId_13',
       'ProductId_ProductId_14', 'ProductId_ProductId_15',
       'ProductId_ProductId_16', 'ProductId_ProductId_19',
       'ProductId_ProductId_2', 'ProductId_ProductId_20',
       'ProductId_ProductId_21', 'ProductId_ProductId_22',
       'ProductId_ProductId_23', 'ProductId_ProductId_24',
       'ProductId_ProductId_27', 'ProductId_ProductId_3',
       'ProductId_ProductId_4', 'ProductId_ProductId_5',
       'ProductId_ProductId_6', 'ProductId_ProductId_7',
       'ProductId_ProductId_8', 'ProductId_ProductId_9',
       'ChannelId_ChannelId_1', 'ChannelId_ChannelId_2',
       'ChannelId_ChannelId_3', 'ChannelId_ChannelId_5',


In [32]:
train.columns

Index(['Value', 'PricingStrategy', 'Hour', 'DirectionOfMoney',
       'ProviderId_ProviderId_1', 'ProviderId_ProviderId_2',
       'ProviderId_ProviderId_3', 'ProviderId_ProviderId_4',
       'ProviderId_ProviderId_5', 'ProviderId_ProviderId_6',
       'ProductId_ProductId_1', 'ProductId_ProductId_10',
       'ProductId_ProductId_11', 'ProductId_ProductId_13',
       'ProductId_ProductId_14', 'ProductId_ProductId_15',
       'ProductId_ProductId_16', 'ProductId_ProductId_19',
       'ProductId_ProductId_2', 'ProductId_ProductId_20',
       'ProductId_ProductId_21', 'ProductId_ProductId_22',
       'ProductId_ProductId_23', 'ProductId_ProductId_24',
       'ProductId_ProductId_27', 'ProductId_ProductId_3',
       'ProductId_ProductId_4', 'ProductId_ProductId_5',
       'ProductId_ProductId_6', 'ProductId_ProductId_7',
       'ProductId_ProductId_8', 'ProductId_ProductId_9',
       'ChannelId_ChannelId_1', 'ChannelId_ChannelId_2',
       'ChannelId_ChannelId_3', 'ChannelId_ChannelId_5',


In [28]:
data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')

In [58]:
X['TransactionStartTime'] = pd.to_datetime(X['TransactionStartTime'], format='%Y-%m-%dT%H:%M:%SZ')
X['Hour'] = X['TransactionStartTime'].dt.hour

X.loc[X['Amount'] >= 0, 'DirectionOfMoney'] = 0
X.loc[X['Amount'] < 0, 'DirectionOfMoney'] = 1

In [67]:
cat_var = ['PricingStrategy', 'ProviderId', 'ProductId', 'ChannelId', 'ProductCategory', 'Hour', 'DirectionOfMoney']
num_var = ['Value']

In [60]:
num_pipeline = Pipeline([
    
    ('imputer_num', SimpleImputer(strategy='median'))])

cat_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
    ('1hot', OneHotEncoder(handle_unknown='ignore'))
])

In [61]:
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_var),
    ('cat', cat_pipeline, cat_var)
])

In [62]:

pipe_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('model_rf', model_rf)
])

In [101]:
y_train_predicted = cross_val_predict(pipe_rf, X, cv=5)

TypeError: unhashable type: 'list'