In [1]:
#james chartouni
import lightgbm as lgb
import pandas as pd
import numpy as np
from numpy import inf
from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.metrics import log_loss

In [2]:
training_data = pd.read_csv("cleaned_input/train_consolidated.csv")
training_data = training_data.drop(['msno'], axis=1)

In [3]:
#split data 
y = training_data["is_churn"].values
X = training_data.drop(["is_churn"], axis=1).values
#replaced infinite values with zero. MAKE SURE THIS IS the right thing to do 
X[X == -inf] = 0
X[X == inf] = 0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, shuffle=True)

In [4]:
#logistic regression 
logistic_pipe = make_pipeline(LogisticRegression(solver='liblinear', n_jobs=-1))
scores = cross_val_score(logistic_pipe, X_train, y_train, cv=5, scoring="neg_log_loss")
print("Accuracy: %0.2f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

#logistic regression w/ polynomial features 2   
logistic_pipe = make_pipeline(PolynomialFeatures(degree=2), LogisticRegression(solver='liblinear', n_jobs=-1))
scores = cross_val_score(logistic_pipe, X_train, y_train, cv=5, scoring="neg_log_loss")
print("Accuracy: %0.2f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

#logistic regression w/ polynomial features 3  
logistic_pipe = make_pipeline(PolynomialFeatures(degree=3), LogisticRegression(solver='liblinear', n_jobs=-1))
scores = cross_val_score(logistic_pipe, X_train, y_train, cv=5, scoring="neg_log_loss")
print("Accuracy: %0.2f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

#logistic regression w/th PCA, Polynomial features 
logistic_pipe = make_pipeline(PCA(n_components=.95), PolynomialFeatures(degree=3), LogisticRegression(solver='liblinear', n_jobs=-1))
scores = cross_val_score(logistic_pipe, X_train, y_train, cv=5, scoring="neg_log_loss")
print("Accuracy: %0.2f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

#logistic regression 
logistic_pipe = make_pipeline(LinearDiscriminantAnalysis(), LogisticRegression(solver='liblinear', n_jobs=-1))
scores = cross_val_score(logistic_pipe, X_train, y_train, cv=5, scoring="neg_log_loss")
print("Accuracy: %0.2f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

#logistic regression 
logistic_pipe = make_pipeline(QuadraticDiscriminantAnalysis(), LogisticRegression(solver='liblinear', n_jobs=-1))
scores = cross_val_score(logistic_pipe, X_train, y_train, cv=5, scoring="neg_log_loss")
print("Accuracy: %0.2f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))




Accuracy: -0.56 (+/- 0.5453)


In [40]:
#Light GBM 
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,free_raw_data=False)




# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'njobs': -1
}
# generate a feature name
feature_name = X_train.dtype.names

#cross validation
num_round = 50
scores = lgb.cv(params, lgb_train, num_round, nfold=5, verbose_eval=True)




[1]	cv_agg's binary_logloss: 0.644426 + 2.18444e-06
[2]	cv_agg's binary_logloss: 0.600342 + 4.42969e-06
[3]	cv_agg's binary_logloss: 0.56026 + 6.61193e-06
[4]	cv_agg's binary_logloss: 0.523668 + 8.77079e-06
[5]	cv_agg's binary_logloss: 0.490143 + 8.83851e-06
[6]	cv_agg's binary_logloss: 0.45933 + 1.00787e-05
[7]	cv_agg's binary_logloss: 0.433173 + 1.1764e-05
[8]	cv_agg's binary_logloss: 0.409016 + 1.76675e-05
[9]	cv_agg's binary_logloss: 0.384395 + 1.68195e-05
[10]	cv_agg's binary_logloss: 0.361561 + 1.68567e-05
[11]	cv_agg's binary_logloss: 0.340343 + 1.6456e-05
[12]	cv_agg's binary_logloss: 0.320596 + 1.71062e-05
[13]	cv_agg's binary_logloss: 0.302189 + 1.76867e-05
[14]	cv_agg's binary_logloss: 0.285009 + 1.86381e-05
[15]	cv_agg's binary_logloss: 0.26895 + 1.86471e-05
[16]	cv_agg's binary_logloss: 0.253926 + 1.97934e-05
[17]	cv_agg's binary_logloss: 0.239855 + 2.10112e-05
[18]	cv_agg's binary_logloss: 0.226661 + 2.19625e-05
[19]	cv_agg's binary_logloss: 0.214279 + 2.29504e-05
[20]	cv

In [34]:
print('Start training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=50,
                valid_sets=lgb_train,  # eval training data
              )




Start training...
[1]	training's binary_logloss: 0.644421
[2]	training's binary_logloss: 0.600332
[3]	training's binary_logloss: 0.560244
[4]	training's binary_logloss: 0.523647
[5]	training's binary_logloss: 0.490118
[6]	training's binary_logloss: 0.459299
[7]	training's binary_logloss: 0.433141
[8]	training's binary_logloss: 0.408971
[9]	training's binary_logloss: 0.384348
[10]	training's binary_logloss: 0.361511
[11]	training's binary_logloss: 0.340291
[12]	training's binary_logloss: 0.320542
[13]	training's binary_logloss: 0.302131
[14]	training's binary_logloss: 0.284948
[15]	training's binary_logloss: 0.268888
[16]	training's binary_logloss: 0.25386
[17]	training's binary_logloss: 0.239787
[18]	training's binary_logloss: 0.226588
[19]	training's binary_logloss: 0.214203
[20]	training's binary_logloss: 0.202571
[21]	training's binary_logloss: 0.191637
[22]	training's binary_logloss: 0.181352
[23]	training's binary_logloss: 0.171672
[24]	training's binary_logloss: 0.162554
[25]	tra

In [43]:
print("--------------------------------------------------------------")
ypred = gbm.predict(X_test)
print(log_loss(y_test, ypred))
# save model to file
gbm.save_model('model.txt')

# feature names
print('Feature names:', gbm.feature_name())

# feature importances
print('Feature importances:', list(gbm.feature_importance()))

--------------------------------------------------------------
0.0422455120945
Feature names: ['Column_0', 'Column_1', 'Column_2', 'Column_3', 'Column_4', 'Column_5', 'Column_6', 'Column_7', 'Column_8', 'Column_9', 'Column_10', 'Column_11', 'Column_12', 'Column_13', 'Column_14', 'Column_15', 'Column_16', 'Column_17', 'Column_18', 'Column_19', 'Column_20', 'Column_21', 'Column_22', 'Column_23', 'Column_24', 'Column_25', 'Column_26', 'Column_27', 'Column_28', 'Column_29', 'Column_30', 'Column_31', 'Column_32', 'Column_33', 'Column_34', 'Column_35', 'Column_36', 'Column_37', 'Column_38', 'Column_39', 'Column_40', 'Column_41']
Feature importances: [94, 103, 92, 56, 62, 27, 36, 49, 24, 156, 38, 1, 17, 4, 63, 23, 15, 7, 21, 39, 20, 26, 0, 0, 32, 98, 26, 101, 17, 46, 3, 72, 0, 4, 5, 6, 27, 4, 31, 27, 14, 14]
