In [171]:
import warnings
warnings.filterwarnings('ignore')

In [174]:
%run Functions.py

## Add ICA, PCA, Random Projections Features, and TruncatedSVD

In [26]:
train = pd.read_csv('data/train_c.csv')
test = pd.read_csv('data/test_c.csv')

In [65]:
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD

In [29]:
n_comp = 12

In [30]:
# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)
# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)
# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=42)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)
# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=42)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)


In [31]:
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]
    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]
    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]
    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]

## Create Magic Features

In [32]:
cat = ['X0','X1','X2','X3','X4','X5','X6','X8']
for i in range(len(cat)):
    var = cat[i]
    magic_mat = train[['ID',var,'y']]
    magic_mat = magic_mat.groupby([var])['y'].mean()
    magic_mat = pd.DataFrame({var:magic_mat.index,var+'_magic':list(magic_mat)})
    mean_magic = magic_mat[var+'_magic'].mean()
    train = train.merge(magic_mat,on=var,how='left')
    test = test.merge(magic_mat,on=var,how = 'left')
    test[var+'_magic'] = test[var+'_magic'].fillna(mean_magic)

In [33]:
for i in range(len(cat)):
    var = cat[i]
    magic_mat = train[['ID',var,'y']]
    magic_mat = magic_mat.groupby([var])['y'].std()
    magic_mat = pd.DataFrame({var:magic_mat.index,var+'_magic_std':list(magic_mat)})
    mean_magic = magic_mat[var+'_magic_std'].std()
    train = train.merge(magic_mat,on=var,how='left')
    test = test.merge(magic_mat,on=var,how = 'left')
    test[var+'_magic_std'] = test[var+'_magic_std'].fillna(mean_magic)

## Do a feature selection through xgboost

In [48]:
y_mean = np.mean(train['y'])
params = {
    'n_trees': 520, 
    'eta': 0.005,
    'max_depth': 2,
    'subsample': 0.95,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}

In [49]:
col = list(test.columns) ## Incorporating ID
label = train['y']
dtrain = xgb.DMatrix(train[col],label)
dtest = xgb.DMatrix(test[col])

In [50]:
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=2000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=True,seed=42)
xgb_cvalid[['train-rmse-mean', 'test-rmse-mean']].plot()
print('Performance does not improve from '+str(len(xgb_cvalid))+' rounds')

[0]	train-rmse:12.6387+0.305907	test-rmse:12.6282+0.598708
[50]	train-rmse:11.1523+0.32754	test-rmse:11.1405+0.673195
[100]	train-rmse:10.1435+0.347891	test-rmse:10.1321+0.72863
[150]	train-rmse:9.47972+0.364346	test-rmse:9.47044+0.765941
[200]	train-rmse:9.05075+0.378378	test-rmse:9.04694+0.785669
[250]	train-rmse:8.7762+0.38862	test-rmse:8.78385+0.793339
[300]	train-rmse:8.59701+0.398049	test-rmse:8.62441+0.798385
[350]	train-rmse:8.4738+0.406016	test-rmse:8.52402+0.798797
[400]	train-rmse:8.38387+0.410101	test-rmse:8.45537+0.799337
[450]	train-rmse:8.31582+0.412146	test-rmse:8.40832+0.799615
[500]	train-rmse:8.26299+0.413489	test-rmse:8.37494+0.801243
[550]	train-rmse:8.219+0.413384	test-rmse:8.3503+0.803029
[600]	train-rmse:8.18064+0.41274	test-rmse:8.3317+0.803265
[650]	train-rmse:8.14536+0.410891	test-rmse:8.3177+0.802247
[700]	train-rmse:8.11283+0.40837	test-rmse:8.30732+0.800963
[750]	train-rmse:8.08005+0.404617	test-rmse:8.30036+0.798889
[800]	train-rmse:8.05163+0.401973	test-

**Check Feature Importance**

In [52]:
model = xgb.train(params,dtrain,num_boost_round=987)

In [53]:
feature_score = get_feature_importance(model)

In [54]:
magic_related = [each for each in feature_score['Feature'] if 'magic' in each]

In [55]:
magic_score = feature_score[feature_score['Feature'].isin(magic_related)]

In [56]:
magic_score

Unnamed: 0,Feature,Score
24,X0_magic,1222
41,X5_magic,215
33,X0_magic_std,74
49,X2_magic,65
15,X1_magic_std,60
56,X8_magic_std,16
44,X6_magic_std,4
23,X5_magic_std,3
51,X6_magic,3
25,X3_magic_std,2


**Looks All OK. Do a Five-Fold CV Before Submit**

In [57]:
train = train.sample(frac=1,random_state=42)

In [58]:
Stack_model_6,Stack_pred_6,ID = get_xgb_stack_data(params,987,train,col,train['y'],test)

Training 1 Fold
R2 Scored of Fold 1 is 0.577613273712
RMSE of Fold 1 is 8.58192493189
Training 2 Fold
R2 Scored of Fold 2 is 0.480205599565
RMSE of Fold 2 is 9.6639529229
Training 3 Fold
R2 Scored of Fold 3 is 0.585716460473
RMSE of Fold 3 is 7.9691471779
Training 4 Fold
R2 Scored of Fold 4 is 0.656030001715
RMSE of Fold 4 is 7.09585999462
Training 5 Fold
R2 Scored of Fold 5 is 0.560108010106
RMSE of Fold 5 is 8.09931184328
Start Training
Calculating In-Bag R2 Score
0.596103833789
Calculating Out-Bag R2 Score
0.571934669114
Calculating In-Bag RMSE
8.05714794804
Calculating Out-Bag RMSE
8.28203937412


**If we remove useless magic features?**

In [59]:
removed = ['X2_magic_std','X3_magic_std','X6_magic','X5_magic_std','X6_magic_std']
col = [each for each in col if each not in removed]

In [61]:
label = train['y']
dtrain = xgb.DMatrix(train[col],label)
dtest = xgb.DMatrix(test[col])

In [62]:
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=2000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=True,seed=42)
xgb_cvalid[['train-rmse-mean', 'test-rmse-mean']].plot()
print('Performance does not improve from '+str(len(xgb_cvalid))+' rounds')

[0]	train-rmse:12.6391+0.276832	test-rmse:12.6304+0.546643
[50]	train-rmse:11.1522+0.317986	test-rmse:11.1443+0.606617
[100]	train-rmse:10.1435+0.352709	test-rmse:10.1368+0.662468
[150]	train-rmse:9.47843+0.380511	test-rmse:9.4737+0.709968
[200]	train-rmse:9.04946+0.400885	test-rmse:9.04767+0.747286
[250]	train-rmse:8.7754+0.414321	test-rmse:8.77931+0.775119
[300]	train-rmse:8.59615+0.423047	test-rmse:8.61553+0.798065
[350]	train-rmse:8.47243+0.428886	test-rmse:8.51216+0.81589
[400]	train-rmse:8.38364+0.431971	test-rmse:8.44365+0.829934
[450]	train-rmse:8.31712+0.433668	test-rmse:8.39611+0.839687
[500]	train-rmse:8.263+0.432146	test-rmse:8.36425+0.847174
[550]	train-rmse:8.21856+0.431113	test-rmse:8.33665+0.8522
[600]	train-rmse:8.17721+0.427156	test-rmse:8.3183+0.854338
[650]	train-rmse:8.14115+0.424744	test-rmse:8.30163+0.857693
[700]	train-rmse:8.10779+0.422011	test-rmse:8.28939+0.859386
[750]	train-rmse:8.07516+0.417538	test-rmse:8.27981+0.861671
[800]	train-rmse:8.04428+0.412098	t

In [63]:
Stack_model_6_2,Stack_pred_6_2,ID = get_xgb_stack_data(params,912,train,col,train['y'],test)

Training 1 Fold
R2 Scored of Fold 1 is 0.577576527585
RMSE of Fold 1 is 8.58229822208
Training 2 Fold
R2 Scored of Fold 2 is 0.480345625252
RMSE of Fold 2 is 9.66265116514
Training 3 Fold
R2 Scored of Fold 3 is 0.585072373084
RMSE of Fold 3 is 7.97533959596
Training 4 Fold
R2 Scored of Fold 4 is 0.656487966464
RMSE of Fold 4 is 7.09113467536
Training 5 Fold
R2 Scored of Fold 5 is 0.5608005359
RMSE of Fold 5 is 8.09293392347
Start Training
Calculating In-Bag R2 Score
0.593298786702
Calculating Out-Bag R2 Score
0.572056605657
Calculating In-Bag RMSE
8.0850778906
Calculating Out-Bag RMSE
8.2808715164


In [64]:
save_results(Stack_model_6_2,Stack_pred_6_2,'xgb_depth2_Rounds638_0.5721_0.5932_All_Magic_Project_ncomp12_ID_correct.csv')

## Add Truncated SVD and Original Magic Feature

In [79]:
train = pd.read_csv('data/train_c.csv')
test = pd.read_csv('data/test_c.csv')

In [80]:
n_comp = 12

In [81]:
# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)
# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)
# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=42)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)
# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=42)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)
# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=42)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

In [82]:
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]
    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]
    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]
    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]

In [83]:
magic_mat = train[['ID','X0','y']]
magic_mat = magic_mat.groupby(['X0'])['y'].mean()
magic_mat = pd.DataFrame({'X0':magic_mat.index,'magic':list(magic_mat)})
mean_magic = magic_mat['magic'].mean()
train = train.merge(magic_mat,on='X0',how='left')
test = test.merge(magic_mat,on='X0',how = 'left')
test['magic'] = test['magic'].fillna(mean_magic)

In [84]:
y_mean = np.mean(train.y)
params = {
    'n_trees': 520, 
    'eta': 0.005,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}

In [85]:
dtrain = xgb.DMatrix(train.drop('y', axis=1), train.y)
dtest = xgb.DMatrix(test)

In [86]:
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=2000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=True,seed=42)
xgb_cvalid[['train-rmse-mean', 'test-rmse-mean']].plot()
print('Performance does not improve from '+str(len(xgb_cvalid))+' rounds')

[0]	train-rmse:12.6395+0.179267	test-rmse:12.6371+0.352664
[50]	train-rmse:11.0716+0.217425	test-rmse:11.1468+0.412565
[100]	train-rmse:9.97346+0.245023	test-rmse:10.1388+0.467845
[150]	train-rmse:9.20947+0.25722	test-rmse:9.4792+0.517929
[200]	train-rmse:8.67615+0.253117	test-rmse:9.06343+0.555271
[250]	train-rmse:8.3063+0.245479	test-rmse:8.80711+0.584699
[300]	train-rmse:8.04632+0.237834	test-rmse:8.64871+0.60595
[350]	train-rmse:7.84561+0.22981	test-rmse:8.55732+0.620868
[400]	train-rmse:7.67191+0.221857	test-rmse:8.50526+0.631731
[450]	train-rmse:7.53252+0.21492	test-rmse:8.47599+0.636819
[500]	train-rmse:7.41873+0.209994	test-rmse:8.46139+0.64221
[550]	train-rmse:7.3249+0.202667	test-rmse:8.45713+0.645432
Performance does not improve from 581 rounds


**Stacking Data Preparation**

In [90]:
train = train.sample(frac=1,random_state=42)
col = test.columns
Stack_model_7,Stack_pred_7,ID = get_xgb_stack_data(params,581,train,col,train['y'],test)

Training 1 Fold
R2 Scored of Fold 1 is 0.569528086067
RMSE of Fold 1 is 8.6636717841
Training 2 Fold
R2 Scored of Fold 2 is 0.476111372606
RMSE of Fold 2 is 9.70193795064
Training 3 Fold
R2 Scored of Fold 3 is 0.570448436583
RMSE of Fold 3 is 8.11466621832
Training 4 Fold
R2 Scored of Fold 4 is 0.593275499618
RMSE of Fold 4 is 7.71604804341
Training 5 Fold
R2 Scored of Fold 5 is 0.553510999943
RMSE of Fold 5 is 8.15981806571
Start Training
Calculating In-Bag R2 Score
0.637825872726
Calculating Out-Bag R2 Score
0.552574878963
Calculating In-Bag RMSE
7.62966001101
Calculating Out-Bag RMSE
8.47122841244


In [91]:
save_results(Stack_model_7,Stack_pred_7,'xgb_depth4_581Rounds_All_Decomp_magic_features_0.5526_0.6378.csv')

## Try LightGBM Model

Now we can see that, features of five decomposition algorithm as well as magic feature (Only X0) and ID will be the useful feature in this case. We need to think about stacking. How about lightGBM, the algorithm that really kick-ass in seberbank regression competition?

In [92]:
import lightgbm as lgb

In [143]:
def get_lgb_stack_data(params,rounds,train,col,label,test):
    ID = []
    train = train.reset_index(drop=True)
    kf = KFold(n_splits=5,shuffle=False)
    i=0
    R2_Score = []
    RMSE = []
    for train_index, test_index in kf.split(train):
        print("Training "+str(i+1)+' Fold')
        X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
        y_train, y_test = label.iloc[train_index],label.iloc[test_index]
        train_lgb=lgb.Dataset(X_train[col],y_train)
        model = lgb.train(params,train_lgb,num_boost_round=rounds)
        pred = model.predict(X_test[col])
        X_test['label'] = list(y_test)
        X_test['predicted'] = pred
        r2 = r2_score(y_test,pred)
        rmse = MSE(y_test,pred)**0.5
        print('R2 Scored of Fold '+str(i+1)+' is '+str(r2))
        R2_Score.append(r2)
        RMSE.append(rmse)
        print('RMSE of Fold '+str(i+1)+' is '+str(rmse))
        ID.append(X_test['ID'])
        if i==0:
            Final = X_test
        else:
            Final = Final.append(X_test,ignore_index=True)
        i+=1
    lgb_train_ = lgb.Dataset(train[col],label)
    print('Start Training')
    model_ = lgb.train(params,lgb_train_,num_boost_round=rounds)
    Final_pred = model_.predict(test[col])
    Final_pred = pd.DataFrame({'ID':test['ID'],'y':Final_pred})
    print('Calculating In-Bag R2 Score')
    print(r2_score(label, model.predict(train[col])))
    print('Calculating Out-Bag R2 Score')
    print(np.mean(R2_Score))
    print('Calculating In-Bag RMSE')
    print(MSE(label, model.predict(train[col]))**0.5)
    print('Calculating Out-Bag RMSE')
    print(np.mean(RMSE))
    return Final,Final_pred,ID

In [144]:
params = {
        'objective': 'regression',
            'metric': 'rmse',
            'boosting': 'gbdt',
            'learning_rate': 0.0045 , #small learn rate, large number of iterations
            'verbose': 0,
            'num_iterations': 550,
            'bagging_fraction': 0.95,
            'bagging_freq': 1,
            'bagging_seed': 42,
            'feature_fraction': 0.95,
            'feature_fraction_seed': 42,
            'max_bin': 100,
            'max_depth': 4,
            'num_rounds': 1200
}

In [145]:
Srack_model_8,Stack_pred_8,ID = get_lgb_stack_data(params,1200,train,col,label,test)

Training 1 Fold
R2 Scored of Fold 1 is 0.576111386684
RMSE of Fold 1 is 8.5971688349
Training 2 Fold
R2 Scored of Fold 2 is 0.472315126057
RMSE of Fold 2 is 9.73702600687
Training 3 Fold
R2 Scored of Fold 3 is 0.571908964102
RMSE of Fold 3 is 8.10085904615
Training 4 Fold
R2 Scored of Fold 4 is 0.639713487108
RMSE of Fold 4 is 7.26220925047
Training 5 Fold
R2 Scored of Fold 5 is 0.550442569431
RMSE of Fold 5 is 8.18780863736
Start Training
Calculating In-Bag R2 Score
0.630834826665
Calculating Out-Bag R2 Score
0.562098306676
Calculating In-Bag RMSE
7.70294551685
Calculating Out-Bag RMSE
8.37701435515


In [147]:
save_results(Srack_model_8,Stack_pred_8,'lgb_depth4_0.56_0.63_All_decomp_Magic_X0_ID.csv')

## Final Sub Today: Stack LightGBM and Two xgb by using xgb

In [148]:
stack1,pred1 = read_data('subxgb_depth4_581Rounds_All_Decomp_magic_features_0.5526_0.6378.csv')

In [149]:
stack2,pred2 = read_data('subxbg_depth2_701Rounds_0.5634_0.6640_All_Magics.csv')

In [150]:
stack3,pred3 = read_data('sublgb_depth4_0.56_0.63_All_decomp_Magic_X0_ID.csv')

In [153]:
Stack1 = stack1[['ID','predicted']]
Stack2 = stack2[['ID','predicted']]
Stack3 = stack3[['ID','predicted','label','magic']]

In [154]:
Stack1.columns = ['ID','model1']
Stack2.columns = ['ID','model2']
Stack3.columns = ['ID','model3','label','magic']

In [155]:
Stack_train = Stack1.merge(Stack2,on='ID',how='left')
Stack_train = Stack_train.merge(Stack3,on='ID',how='left')

In [157]:
pred1.columns = ['ID','model1']
pred2.columns = ['ID','model2']
pred3.columns = ['ID','model3']

In [159]:
Stack_pred = pred1.merge(pred2,on='ID',how='left')
Stack_pred = Stack_pred.merge(pred3,on='ID',how='left')

In [162]:
## Create Magic Feature for pred_dataset
Stack_pred = Stack_pred.merge(test[['ID','X0']],on='ID',how='left')

In [163]:
Stack_pred = Stack_pred.merge(magic_mat,on='X0',how = 'left')
Stack_pred['magic'] = Stack_pred['magic'].fillna(mean_magic)

In [164]:
del Stack_pred['X0']

**Try First Linear Regression**

In [None]:
%run Functions.py

In [166]:
from sklearn.linear_model import LinearRegression
from sklearn.

In [175]:
aa,bb,cc = get_sklearn_stack_data(LinearRegression(),Stack_train,Stack_pred.columns,Stack_train['label'],Stack_pred)

R2 Scored of Fold 1 is 0.581889346373
RMSE of Fold 1 is 8.5383744583
R2 Scored of Fold 2 is 0.484759725093
RMSE of Fold 2 is 9.62152491824
R2 Scored of Fold 3 is 0.580119416192
RMSE of Fold 3 is 8.02279887253
R2 Scored of Fold 4 is 0.664346891921
RMSE of Fold 4 is 7.00954926486
R2 Scored of Fold 5 is 0.586514498755
RMSE of Fold 5 is 7.85245102854
Start Training
Calculating In-Bag R2 Score
0.577370731933
Calculating Out-Bag R2 Score
0.579525975667
Calculating In-Bag RMSE
8.24187928331
Calculating Out-Bag RMSE
8.20893970849


In [176]:
bb.to_csv('submission/Stacking_Attemp1_two_xgb_one_lgbt_plus_Linear_0.5774_0.5795.csv',index=False)