In [11]:
#james chartouni
import lightgbm as lgb
import xgboost as xgb
import pandas as pd
import numpy as np
from numpy import inf
from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.metrics import log_loss

In [12]:
training_data = pd.read_csv("cleaned_input/train_consolidated.csv")
training_data = training_data.drop(['msno'], axis=1)

In [None]:
test_data = pd.read_csv("cleaned_input/test_consolidated.csv")
#test_data = test_data.drop(['msno'], axis=1)

In [None]:
#split data 
y = training_data["is_churn"].values
X = training_data.drop(["is_churn"], axis=1).values
#replaced infinite values with zero. MAKE SURE THIS IS the right thing to do 
X[X == -inf] = 0
X[X == inf] = 0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, shuffle=True)

In [None]:
#logistic regression 
logistic_pipe = make_pipeline(LogisticRegression(solver='liblinear'))
scores = cross_val_score(logistic_pipe, X_train, y_train, cv=5, scoring="neg_log_loss")
print("Accuracy: %0.2f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

#logistic regression w/ polynomial features 2   
logistic_pipe = make_pipeline(PolynomialFeatures(degree=2), LogisticRegression(solver='liblinear'))
scores = cross_val_score(logistic_pipe, X_train, y_train, cv=5, scoring="neg_log_loss")
print("Accuracy: %0.2f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

#logistic regression w/th PCA, Polynomial features 
logistic_pipe = make_pipeline(PCA(n_components=.95), PolynomialFeatures(degree=2), LogisticRegression(solver='liblinear'))
scores = cross_val_score(logistic_pipe, X_train, y_train, cv=5, scoring="neg_log_loss")
print("Accuracy: %0.2f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))


Accuracy: -0.19 (+/- 0.1852)


In [None]:
#Light GBM 
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,free_raw_data=False)


# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'njobs': -1
}
# generate a feature name
feature_name = X_train.dtype.names

#cross validation
num_round = 50
scores = lgb.cv(params, lgb_train, num_round, nfold=5, verbose_eval=True)

In [None]:
feature_name


In [None]:
print('Start training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=50,
                valid_sets=lgb_train,  # eval training data
              )

In [None]:
print("--------------------------------------------------------------")
ypred = gbm.predict(X_test)
print(log_loss(y_test, ypred))
# save model to file
gbm.save_model('model.txt')

# feature names
print('Feature names:', gbm.feature_name())

# feature importances
print('Feature importances:', list(gbm.feature_importance()))

In [None]:
from sklearn import metrics

def xgb_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'log_loss', metrics.log_loss(labels, preds)

fold = 1
for i in range(fold):
    params = {
        'eta': 0.02, #use 0.002
        'max_depth': 7,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'seed': i,
        'silent': False
    }
    watchlist = [(xgb.DMatrix(X_train, y_train), 'train'), (xgb.DMatrix(X_test, y_test), 'valid')]
    model = xgb.train(params, xgb.DMatrix(X_train, y_train), 150,  watchlist, feval=xgb_score, maximize=False, verbose_eval=50, early_stopping_rounds=50) #use 1500
    if i != 0:
        pred += model.predict(xgb.DMatrix(test[cols]), ntree_limit=model.best_ntree_limit)
    else:
        pred = model.predict(xgb.DMatrix(test[cols]), ntree_limit=model.best_ntree_limit)
pred /= fold
test['is_churn'] = pred.clip(0.0000001, 0.999999)
test[['msno','is_churn']].to_csv('submission3.csv.gz', index=False, compression='gzip')