In [8]:
import numpy               as np
import matplotlib.pyplot   as plt
import matplotlib.pyplot   as pp # we will merge these
import pandas              as pd
import seaborn             as sb

from datetime              import timedelta 

from sklearn               import metrics
from sklearn.decomposition import PCA
from sklearn.cluster       import KMeans
from sklearn.mixture       import GaussianMixture
from sklearn.linear_model  import LinearRegression

from matplotlib.colors     import LogNorm

from IPython.display       import HTML, Markdown

from sklearn.metrics       import mean_squared_error

%matplotlib inline

# Seed NumPy's global RNG; feature_play() draws its train/hold-out mask from it.
np.random.seed(0)

import os
# List the data directory to confirm which input files are available.
print(os.listdir("./input"))

['historical_transactions.csv', '.ipynb_checkpoints', 'new_merchant_transactions.csv', '.~lock.Data_Dictionary.xlsx#', 'merchants.csv', 'sample_submission.csv', 'test.csv', 'Data_Dictionary.xlsx', '.gitignore', 'train.csv']


In [2]:
# Card-level tables: parse the activation month as a date and key rows by card_id.
train = pd.read_csv('input/train.csv', parse_dates=['first_active_month'])
train = train.set_index('card_id')
test = pd.read_csv('input/test.csv', parse_dates=['first_active_month'])
test = test.set_index('card_id')

# Merchant metadata, keyed by merchant_id.
merchants = pd.read_csv('input/merchants.csv')
merchants = merchants.set_index('merchant_id')

# Transaction logs keep their default integer index; purchase_date is parsed as a datetime.
transactions_old = pd.read_csv('input/historical_transactions.csv', parse_dates=['purchase_date'])
transactions_new = pd.read_csv('input/new_merchant_transactions.csv', parse_dates=['purchase_date'])

# Show this single test card, then remove it from the test set.
print(test.loc['C_ID_c27b4f80f7'])
test = test.drop(index='C_ID_c27b4f80f7')

print(train.shape)



first_active_month    NaT
feature_1               5
feature_2               2
feature_3               1
Name: C_ID_c27b4f80f7, dtype: object
(201917, 5)


## Convert dates to numeric

In [3]:
def first_active_month_to_numeric():
    """Add a ``first_active_delta_days`` column to the global ``train`` and ``test`` frames.

    The reference date is one day after the latest ``first_active_month`` found
    in either frame. Each row's value is the whole number of days between that
    reference date and the row's own ``first_active_month``.

    Reads and modifies the module-level ``train`` and ``test``; returns nothing.
    """
    global train, test
    newest_month = max(frame['first_active_month'].max() for frame in (train, test))
    reference_date = newest_month + timedelta(days=1)
    for frame in (train, test):
        age = reference_date - frame['first_active_month']
        frame['first_active_delta_days'] = age.dt.days

# Add the day-count column to both frames, then preview them.
first_active_month_to_numeric()
display(train.head(), test.head())



        


Unnamed: 0_level_0,first_active_month,feature_1,feature_2,feature_3,target,first_active_delta_days
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C_ID_92a2005557,2017-06-01,5,2,1,-0.820283,246
C_ID_3d0044924f,2017-01-01,4,1,0,0.392913,397
C_ID_d639edf6cd,2016-08-01,2,2,0,0.688056,550
C_ID_186d6a6901,2017-09-01,4,3,0,0.142495,154
C_ID_cdbd2c0db2,2017-11-01,1,3,0,-0.159749,93


Unnamed: 0_level_0,first_active_month,feature_1,feature_2,feature_3,first_active_delta_days
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C_ID_0ab67a22ab,2017-04-01,3,3,1,307
C_ID_130fd0cbdd,2017-01-01,2,3,0,397
C_ID_b709037bc5,2017-08-01,5,1,1,185
C_ID_d27d835a9f,2017-12-01,2,1,0,63
C_ID_2b5e3df5c2,2015-12-01,5,1,1,794


## Aggregate History

In [11]:
# Peek at the first rows of the historical transactions before aggregating them per card.
transactions_old.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [12]:
def aggregate_data():
    """Attach per-card transaction aggregates to the global ``train`` and ``test`` frames.

    For both ``transactions_old`` and ``transactions_new`` the transactions are
    grouped by ``card_id`` and summarised (sum/mean of ``purchase_amount`` and the
    number of distinct subsectors, merchant categories, merchants, cities and
    states). The resulting columns are prefixed with ``old_`` / ``new_`` and
    left-joined onto ``train`` and ``test`` via ``card_id``.

    Cards with no matching transactions get ``0`` in the aggregate columns. Only
    the aggregate columns are filled, so missing values in other columns are
    left as they are.

    The function is safe to re-run: aggregate columns left over from a previous
    call are dropped before merging, so no ``_x`` / ``_y`` duplicates appear.

    Reads the module-level ``train``, ``test``, ``transactions_old`` and
    ``transactions_new``; rebinds ``train`` and ``test``; returns nothing.
    """
    global train, test

    # Applied identically to the old and the new transactions.
    # Add 'median', 'max', 'min', 'std' to purchase_amount to try richer statistics.
    aggregation = {
        'purchase_amount': ['sum', 'mean'],
        'subsector_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'merchant_id': ['nunique'],
        'city_id': ['nunique'],
        'state_id': ['nunique'],
    }

    def _aggregate(transactions, prefix):
        # One row per card_id; flatten the (column, statistic) pairs into prefixed names.
        aggregated = transactions.groupby(['card_id']).agg(aggregation)
        aggregated.columns = [prefix + '_'.join(col).strip() for col in aggregated.columns.values]
        return aggregated

    def _attach(frame, aggregated):
        # Left-join the aggregates onto frame and zero-fill only the joined columns.
        new_columns = list(aggregated.columns)
        # Drop results of an earlier run so the merge cannot create suffixed duplicates.
        stale = [col for col in new_columns if col in frame.columns]
        merged = frame.drop(columns=stale).merge(right=aggregated, how='left', on='card_id')
        merged[new_columns] = merged[new_columns].fillna(0)
        return merged

    for aggregated in (_aggregate(transactions_old, 'old_'),
                       _aggregate(transactions_new, 'new_')):
        train = _attach(train, aggregated)
        test = _attach(test, aggregated)

    
# Merge the per-card transaction aggregates into train/test, then preview the widened frames.
aggregate_data()
display(train.head(), test.head())

Unnamed: 0_level_0,first_active_month,feature_1,feature_2,feature_3,target,first_active_delta_days,old_purchase_amount_sum,old_purchase_amount_mean,old_subsector_id_nunique,old_merchant_category_id_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_subsector_id_nunique,new_merchant_category_id_nunique,old_merchant_id_nunique,new_merchant_id_nunique,old_city_id_nunique,old_state_id_nunique,new_city_id_nunique,new_state_id_nunique
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
C_ID_92a2005557,2017-06-01,5,2,1,-0.820283,246,-165.968739,-0.638341,21,41,-13.244202,-0.575835,10.0,14.0,94,23.0,7,3,3.0,1.0
C_ID_3d0044924f,2017-01-01,4,1,0,0.392913,397,-210.006336,-0.600018,24,57,-4.355735,-0.725956,4.0,5.0,142,6.0,9,3,1.0,1.0
C_ID_d639edf6cd,2016-08-01,2,2,0,0.688056,550,-29.167391,-0.678311,7,8,-0.700326,-0.700326,1.0,1.0,13,1.0,5,2,1.0,1.0
C_ID_186d6a6901,2017-09-01,4,3,0,0.142495,154,-49.491364,-0.642745,13,25,-4.654372,-0.66491,5.0,6.0,50,7.0,7,5,2.0,2.0
C_ID_cdbd2c0db2,2017-11-01,1,3,0,-0.159749,93,-48.687656,-0.366073,17,26,-19.926237,-0.553507,10.0,17.0,66,36.0,6,6,5.0,5.0


Unnamed: 0_level_0,first_active_month,feature_1,feature_2,feature_3,first_active_delta_days,old_purchase_amount_sum,old_purchase_amount_mean,old_subsector_id_nunique,old_merchant_category_id_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_subsector_id_nunique,new_merchant_category_id_nunique,old_merchant_id_nunique,new_merchant_id_nunique,old_city_id_nunique,old_state_id_nunique,new_city_id_nunique,new_state_id_nunique
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C_ID_0ab67a22ab,2017-04-01,3,3,1,307,-40.733733,-0.599025,12,16,-1.777156,-0.592385,3.0,3.0,24,3.0,7,3,3.0,1.0
C_ID_130fd0cbdd,2017-01-01,2,3,0,397,-49.136513,-0.629955,12,16,-5.944698,-0.660522,6.0,8.0,27,9.0,4,3,2.0,2.0
C_ID_b709037bc5,2017-08-01,5,1,1,185,4.52884,0.348372,6,8,0.180138,0.090069,2.0,2.0,9,2.0,4,4,2.0,2.0
C_ID_d27d835a9f,2017-12-01,2,1,0,63,-13.690715,-0.526566,11,18,-5.743674,-0.574367,8.0,10.0,23,10.0,1,1,3.0,3.0
C_ID_2b5e3df5c2,2015-12-01,5,1,1,794,25.139384,0.22854,15,31,12.064997,2.010833,4.0,5.0,47,6.0,5,4,2.0,2.0


In [13]:
def feature_play(feature_columns=None, train_fraction=0.8):
    """Fit a linear regression on a random split of ``train`` and report hold-out MSE.

    This is a scratch experiment for trying out engineered features: pass a
    different ``feature_columns`` list to compare feature sets without editing
    the function body.

    Args:
        feature_columns: Names of the ``train`` columns used as model inputs.
            Defaults to the three raw card features, ``first_active_delta_days``
            and the ``old_*`` transaction aggregates (excluding the purchase sum).
        train_fraction: Probability that a row is assigned to the fitting set;
            the remaining rows form the hold-out set. Must be strictly between
            0 and 1.

    Returns:
        The mean squared error of the fitted model on the hold-out rows.

    Raises:
        ValueError: If ``train_fraction`` is not strictly between 0 and 1.

    The split is drawn from NumPy's global RNG (``np.random.rand``), so the result
    depends on the RNG state at call time. Reads the module-level ``train`` frame.
    """
    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    if feature_columns is None:
        feature_columns = [
            'feature_1', 'feature_2', 'feature_3',
            'first_active_delta_days', 'old_purchase_amount_mean',
            'old_subsector_id_nunique', 'old_merchant_category_id_nunique',
            'old_merchant_id_nunique', 'old_city_id_nunique', 'old_state_id_nunique',
        ]

    features = train[list(feature_columns)].values
    target = train['target'].values

    # Row-wise random assignment: True -> fitting set, False -> hold-out set.
    in_fit = np.random.rand(len(features)) < train_fraction

    reg = LinearRegression()
    reg.fit(features[in_fit], target[in_fit])

    y_pred_reg = reg.predict(features[~in_fit])
    mse = mean_squared_error(target[~in_fit], y_pred_reg)

    print("After doing feature enginnering, the Mean Squared Error is: {:0.3f}".format(mse))

    return mse


# Run the experiment and keep its hold-out MSE for reference.
play_mse = feature_play()

After doing feature enginnering, the Mean Squared Error is: 15.118


In [None]:
# Try using pandas and then convert to ndarray.
# The raw CSVs are loaded under their own names (train_df / test_df): the cells
# below read those names, and the engineered `train` / `test` frames must not be
# overwritten by this re-read.
train_df = pd.read_csv('./input/train.csv', sep=',')
test_df = pd.read_csv('./input/test.csv', sep=',')

In [None]:
# Histogram of the target column in train, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(train.target.values, bins=50)
ax.set_xlabel('Target')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Preview the first rows of train_df and test_df.
display(train_df.head(), test_df.head())

In [None]:
# Remove non-feature data for training: keep the three feature columns as inputs
# and the target column as labels.
training_df = train_df[['feature_1', 'feature_2', 'feature_3']]
labels_df = train_df[['target']]

display(training_df.head())
display(labels_df.head())

# Ordered 80/20 split: the first 80% of rows are used for fitting, the rest are held out.
data_len = len(training_df)
split_at = int(data_len * .8)
train_data = training_df.values[:split_at]
train_labels = labels_df.values.flatten()[:split_at]
test_data = training_df.values[split_at:]
test_labels = labels_df.values.flatten()[split_at:]

# Sanity check: the two parts together must account for every row (last line == first line).
print(data_len)
print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)
print(test_labels.shape)
print(train_data.shape[0] + test_data.shape[0])

In [None]:

# Scratch check of the fractional slicing used for the train/hold-out split.
# Uses its own name so the `test` DataFrame is not replaced by a list.
split_demo = [1, 2, 3, 4, 5]
cut = int(len(split_demo) * .4)
print(split_demo[:cut])
print(split_demo[cut:])

