In [1]:
import pandas as pd

In [2]:
import lightgbm as lgbm

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
import numpy as np
import sklearn.ensemble as ensemble

In [4]:
item_data = pd.read_csv("processed_item_date.csv")

In [5]:
demographics = pd.read_csv('Processed_demographics_data.csv')

In [6]:
campaign_data = pd.read_csv("data/train_AUpWtIz/campaign_data.csv")
coupon_item_mapping = pd.read_csv("data/train_AUpWtIz/coupon_item_mapping.csv")
customer_transaction_data = pd.read_csv("data/train_AUpWtIz/customer_transaction_data.csv")

In [7]:
train = pd.read_csv("data/train_AUpWtIz/train.csv")
test = pd.read_csv('data/test_QyjYwdj.csv')

In [8]:
train_test = train.drop(['redemption_status'],axis=1).append(test,ignore_index=True)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78369 entries, 0 to 78368
Data columns (total 5 columns):
id                   78369 non-null int64
campaign_id          78369 non-null int64
coupon_id            78369 non-null int64
customer_id          78369 non-null int64
redemption_status    78369 non-null int64
dtypes: int64(5)
memory usage: 3.0 MB


In [10]:
train_test.loc[78364:78368]

Unnamed: 0,id,campaign_id,coupon_id,customer_id
78364,128587,8,71,1523
78365,128589,30,547,937
78366,128590,8,754,1004
78367,128592,13,134,71
78368,128595,13,681,623


In [11]:
train.tail(5)

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
78364,128587,8,71,1523,0
78365,128589,30,547,937,0
78366,128590,8,754,1004,0
78367,128592,13,134,71,0
78368,128595,13,681,623,0


In [12]:
test.head(5)

Unnamed: 0,id,campaign_id,coupon_id,customer_id
0,3,22,869,967
1,4,20,389,1566
2,5,22,981,510
3,8,25,1069,361
4,10,17,498,811


In [13]:
train_test.loc[78369:78373]

Unnamed: 0,id,campaign_id,coupon_id,customer_id
78369,3,22,869,967
78370,4,20,389,1566
78371,5,22,981,510
78372,8,25,1069,361
78373,10,17,498,811


In [14]:
train_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128595 entries, 0 to 128594
Data columns (total 4 columns):
id             128595 non-null int64
campaign_id    128595 non-null int64
coupon_id      128595 non-null int64
customer_id    128595 non-null int64
dtypes: int64(4)
memory usage: 3.9 MB


In [15]:
data_train_test = train_test.merge(campaign_data,on='campaign_id')

In [16]:
data_train_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 128594
Data columns (total 7 columns):
id               128595 non-null int64
campaign_id      128595 non-null int64
coupon_id        128595 non-null int64
customer_id      128595 non-null int64
campaign_type    128595 non-null object
start_date       128595 non-null object
end_date         128595 non-null object
dtypes: int64(4), object(3)
memory usage: 7.8+ MB


In [17]:
coupon_item_mapping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92663 entries, 0 to 92662
Data columns (total 2 columns):
coupon_id    92663 non-null int64
item_id      92663 non-null int64
dtypes: int64(2)
memory usage: 1.4 MB


In [18]:
item_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74066 entries, 0 to 74065
Data columns (total 24 columns):
item_id                   74066 non-null int64
brand                     74066 non-null int64
brand_type                74066 non-null int64
item_count                74066 non-null int64
category_count            74066 non-null int64
Alcohol                   74066 non-null int64
Bakery                    74066 non-null int64
Dairy, Juices & Snacks    74066 non-null int64
Flowers & Plants          74066 non-null int64
Fuel                      74066 non-null int64
Garden                    74066 non-null int64
Grocery                   74066 non-null int64
Meat                      74066 non-null int64
Miscellaneous             74066 non-null int64
Natural Products          74066 non-null int64
Packaged Meat             74066 non-null int64
Pharmaceutical            74066 non-null int64
Prepared Food             74066 non-null int64
Restauarant               74066 non-null int6

In [19]:
data_train_test.start_date = pd.to_datetime(data_train_test.start_date)

In [20]:
data_train_test.end_date = pd.to_datetime(data_train_test.end_date)

In [21]:
train_test.shape

(128595, 4)

In [22]:
data_train_test.start_date.nunique()

25

In [30]:
campaign_train = campaign_data.loc[campaign_data.campaign_id.isin(train.campaign_id.unique())].sort_values('campaign_id')

In [35]:
campaign_train

Unnamed: 0,campaign_id,campaign_type,start_date,end_date
21,1,Y,12/12/12,18/01/13
22,2,Y,17/12/12,18/01/13
18,3,Y,22/12/12,16/02/13
20,4,Y,07/01/13,08/02/13
19,5,Y,12/01/13,15/02/13
17,6,Y,28/01/13,01/03/13
16,7,Y,02/02/13,08/03/13
15,8,X,16/02/13,05/04/13
14,9,Y,11/03/13,12/04/13
13,10,Y,08/04/13,10/05/13


In [43]:
pd.to_datetime(customer_transaction_data.head().date)

0   2012-01-02
1   2012-01-02
2   2012-01-02
3   2012-01-02
4   2012-01-02
Name: date, dtype: datetime64[ns]

In [51]:
customer_transaction_data['date'] = pd.to_datetime(customer_transaction_data['date'])

In [52]:
customer_transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324566 entries, 0 to 1324565
Data columns (total 7 columns):
date               1324566 non-null datetime64[ns]
customer_id        1324566 non-null int64
item_id            1324566 non-null int64
quantity           1324566 non-null int64
selling_price      1324566 non-null float64
other_discount     1324566 non-null float64
coupon_discount    1324566 non-null float64
dtypes: datetime64[ns](1), float64(3), int64(3)
memory usage: 70.7 MB


In [78]:
coupon_item_mapping.item_id.nunique()

36289

In [80]:
coupon_item_mapping.shape

(92663, 2)

In [None]:
customer_transaction_data.loc[(pd.to_datetime(customer_transaction_data.date) <=  end_date)&
                                                    (pd.to_datetime(customer_transaction_data.date) >= start_date)]

In [68]:
campaign_train.loc[(campaign_train.campaign_id==camp_id)].end_date.str.replace('/','-').values[0]

'18-01-13'

In [None]:
product_data_multivariate = []
for i in range(product_data.shape[0]):
    a = list(product_data[i])
    b = np.array([a[:22],a[22:44],a[44:66],a[66:88],a[88:110]])
    b.shape
    product_data_multivariate.append(b.T)

In [None]:
for cust_id in list(train.customer_id.nunique()):
    a = train.loc[train.customer_id==]

In [99]:
train.groupby('customer_id',as_index=False).agg({
    'campaign_id':'nunique'
}).max()

customer_id    1582
campaign_id      11
dtype: int64

In [109]:
item_data.head()

Unnamed: 0,item_id,brand,brand_type,item_count,category_count,Alcohol,Bakery,"Dairy, Juices & Snacks",Flowers & Plants,Fuel,...,Natural Products,Packaged Meat,Pharmaceutical,Prepared Food,Restauarant,Salads,Seafood,Skin & Hair Care,Travel,Vegetables (cut)
0,1,1,1,1091,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,1,1091,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13,1,1,1091,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,629,1,1,1091,14,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,632,1,1,1091,14,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [111]:
item_data.columns

Index(['item_id', 'brand', 'brand_type', 'item_count', 'category_count',
       'Alcohol', 'Bakery', 'Dairy, Juices & Snacks', 'Flowers & Plants',
       'Fuel', 'Garden', 'Grocery', 'Meat', 'Miscellaneous',
       'Natural Products', 'Packaged Meat', 'Pharmaceutical', 'Prepared Food',
       'Restauarant', 'Salads', 'Seafood', 'Skin & Hair Care', 'Travel',
       'Vegetables (cut)'],
      dtype='object')

In [120]:
coupon_item_agg = coupon_item_mapping.merge(item_data,on='item_id').groupby('coupon_id',as_index=False).agg({
    'item_id':'nunique',
    'brand':'nunique',
    'brand_type':'nunique',
    'item_count':'max',
    'category_count':'max',
    'Alcohol':'sum',
    'Bakery':'sum',
    'Dairy, Juices & Snacks':'sum',
    'Flowers & Plants':'sum',
    'Fuel':'sum',
    'Garden':'sum',
    'Grocery':'sum',
    'Meat':'sum',
    'Miscellaneous':'sum',
    'Natural Products':'sum',
    'Packaged Meat':'sum',
    'Pharmaceutical':'sum',
    'Prepared Food':'sum',
    'Restauarant':'sum',
    'Salads':'sum',
    'Seafood':'sum',
    'Skin & Hair Care':'sum',
    'Travel':'sum',
    'Vegetables (cut)':'sum'
})

In [137]:
customer_transaction_data.columns

Index(['date', 'customer_id', 'item_id', 'quantity', 'selling_price',
       'other_discount', 'coupon_discount'],
      dtype='object')

In [143]:
customer_transaction_data['month'] = customer_transaction_data['date'].dt.month
customer_transaction_data['dayofweek'] = customer_transaction_data['date'].dt.dayofweek
customer_transaction_data['day'] = customer_transaction_data['date'].dt.day
customer_transaction_data['year'] = customer_transaction_data['date'].dt.year

In [145]:
customer_transaction_data.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,month,day,year,dayofweek
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,1,2,2012,0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0,1,2,2012,0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0,1,2,2012,0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0,1,2,2012,0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0,1,2,2012,0


In [147]:
import scipy

In [146]:
customer_transaction_data.columns

Index(['date', 'customer_id', 'item_id', 'quantity', 'selling_price',
       'other_discount', 'coupon_discount', 'month', 'day', 'year',
       'dayofweek'],
      dtype='object')

In [153]:
customer_transaction_data_agg = customer_transaction_data.groupby('customer_id',as_index=False).agg({
    #'item_id':  lambda x: scipy.stats.mode(x)[0][0],
    'quantity': 'mean',
    'selling_price': 'mean',
    'other_discount': 'mean',
    'coupon_discount': 'mean',
    'month': lambda x: scipy.stats.mode(x)[0][0],
    'day': lambda x: scipy.stats.mode(x)[0][0],
    'year': lambda x: scipy.stats.mode(x)[0][0],
    'dayofweek': lambda x: scipy.stats.mode(x)[0][0]
})

In [179]:
customer_transaction_data_item = customer_transaction_data.merge(coupon_item_mapping,on='item_id').groupby('coupon_id',as_index=False).agg({
    'quantity': 'mean',
    'selling_price': 'mean',
    'other_discount': 'mean',
    'coupon_discount': 'max'
})

In [123]:
train_test.customer_id.nunique()

1582

In [124]:
crude_train_test = train_test.merge(demographics,on='customer_id',how='left').fillna(999).merge(campaign_data,on='campaign_id').drop(
    ['start_date','end_date'],axis=1)

In [125]:
demographics.customer_id.nunique()

760

In [126]:
crude_train_test1 = pd.concat([crude_train_test,pd.get_dummies(crude_train_test['campaign_type'])],axis=1).drop('campaign_type',axis=1)

In [128]:
train_test.shape

(128595, 4)

In [129]:
somewhat_crude_train_test = crude_train_test1.merge(coupon_item_agg,on='coupon_id')

In [155]:
somewhat_crude_train_test1 = somewhat_crude_train_test.merge(customer_transaction_data_agg,on='customer_id')

In [180]:
somewhat_crude_train_test2 = somewhat_crude_train_test1.merge(customer_transaction_data_item,on='coupon_id')

In [188]:
somewhat_crude_train_test2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 128594
Data columns (total 57 columns):
id                        128595 non-null int64
campaign_id               128595 non-null int64
coupon_id                 128595 non-null int64
customer_id               128595 non-null int64
rented                    128595 non-null float64
family_size               128595 non-null float64
income_bracket            128595 non-null float64
18-25                     128595 non-null float64
26-35                     128595 non-null float64
36-45                     128595 non-null float64
46-55                     128595 non-null float64
56-70                     128595 non-null float64
70+                       128595 non-null float64
single_child              128595 non-null float64
two_children              128595 non-null float64
more_than_two_children    128595 non-null float64
Married                   128595 non-null float64
Single                    128595 non-null float

In [192]:
y.min()

0

In [189]:
X = somewhat_crude_train_test2[0:78369]
y= train['redemption_status']
X_val = somewhat_crude_train_test2[78369:]

In [190]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)

In [193]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.metrics import classification_report


In [194]:
def train_predict_print_report(est):
    est.fit(X_train, y_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, est.predict(X_test))
    print('AUC: {}'.format(auc(false_positive_rate, true_positive_rate)))
    print('ROC: {}'.format(roc_auc_score(y_test, est.predict(X_test))))
    print(classification_report(y_test, est.predict(X_test)))

In [203]:
lgbmc = lgbm.LGBMClassifier(n_estimators=60,learning_rate=0.01,num_leaves=60)
bgc = BaggingClassifier()
abc = AdaBoostClassifier()
gbc = GradientBoostingClassifier()
rfc = RandomForestClassifier()
etc = ExtraTreesClassifier()
model_list = [lgbmc, bgc, abc, gbc, rfc, etc]

In [205]:
import seaborn as sns
import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from keras.utils import plot_model

Using TensorFlow backend.


In [212]:
y.shape

(78369,)

In [213]:
t_model = Sequential()
t_model.add(Dense(100, activation="relu", input_shape=(X.shape[1],)))
t_model.add(Dense(50, activation="relu"))
t_model.add(Dense(1))
t_model.compile(
    loss="mean_squared_error",
    optimizer=Adam(lr=0.001),
    metrics=[metrics.mae])

In [None]:
epochs = 50
batch = 128

#cols = list(train.columns)
#cols.remove(label_column)
history = t_model.fit(
    X_train, y_train,
    batch_size=batch,
    epochs=epochs,
    shuffle=True,
    verbose=1
)


In [221]:
score = t_model.evaluate(X_test, y_test, verbose=0)

In [222]:
score

[0.00809372097383836, 0.014941401580005351]

In [223]:
def keras_predict_print(est):
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, est.predict(X_test))
    print('AUC: {}'.format(auc(false_positive_rate, true_positive_rate)))
    print('ROC: {}'.format(roc_auc_score(y_test, est.predict(X_test))))
   #print(classification_report(y_test, est.predict(X_test)))

In [None]:
keras_predict_print(t_model)

In [204]:

for clf in model_list:
    print(clf)
    train_predict_print_report(clf)
    print()
    print()

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.01, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=60, n_jobs=-1, num_leaves=60, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
AUC: 0.5
ROC: 0.5
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     25651
           1       0.00      0.00      0.00       211

   micro avg       0.99      0.99      0.99     25862
   macro avg       0.50      0.50      0.50     25862
weighted avg       0.98      0.99      0.99     25862



BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_star

  'precision', 'predicted', average, warn_for)


AUC: 0.49986355307785274
ROC: 0.49986355307785274
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     25651
           1       0.00      0.00      0.00       211

   micro avg       0.99      0.99      0.99     25862
   macro avg       0.50      0.50      0.50     25862
weighted avg       0.98      0.99      0.99     25862



AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
AUC: 0.5
ROC: 0.5


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.99      1.00      1.00     25651
           1       0.00      0.00      0.00       211

   micro avg       0.99      0.99      0.99     25862
   macro avg       0.50      0.50      0.50     25862
weighted avg       0.98      0.99      0.99     25862



GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
AUC: 0.5016094639659106
ROC: 0.5016094639659106
              precision    recall  f1-score   support

           0       0.99



AUC: 0.49986355307785274
ROC: 0.49986355307785274
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     25651
           1       0.00      0.00      0.00       211

   micro avg       0.99      0.99      0.99     25862
   macro avg       0.50      0.50      0.50     25862
weighted avg       0.98      0.99      0.99     25862



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)




AUC: 0.499649136485907
ROC: 0.499649136485907
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     25651
           1       0.00      0.00      0.00       211

   micro avg       0.99      0.99      0.99     25862
   macro avg       0.50      0.50      0.50     25862
weighted avg       0.98      0.99      0.99     25862





In [187]:
lgbmc.fit(X,y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [164]:
pd.read_csv('data/sample_submission_Byiv0dS.csv').head(3)

Unnamed: 0,id,redemption_status
0,3,0
1,4,0
2,5,0


In [165]:
pd.concat([test['id'],pd.DataFrame(lgbmc.predict_proba(X_val))[1]],axis=1).rename(columns={1:'redemption_status'
                                                                                          }).to_csv('Submissions/submission_somewhat_crude_join.csv',index=False)