In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


from sklearn.decomposition import PCA, FastICA
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import pickle

import torch
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [3]:
def data_with_val(ratio=0.7, seed=None,
                  train_path='../data/train.csv',
                  test_path='../data/test.csv'):
    """Load the train/test CSVs, shuffle the training rows and split off a validation set.

    Parameters
    ----------
    ratio : float, default 0.7
        Fraction of the shuffled training rows kept for training; the
        remaining rows become the validation set.
    seed : int or None, default None
        When given, the shuffle uses a private ``RandomState(seed)`` so the
        split is reproducible. ``None`` keeps using NumPy's global RNG.
    train_path, test_path : str
        Locations of the CSV files. The training file must contain a ``y``
        column; every non-numeric column must exist in both files.

    Returns
    -------
    X_train, y_train, X_val, y_val, X_test
        Feature frames with non-numeric columns label-encoded, plus the
        ``y`` series for the training and validation parts.
    """
    print('generating data......')
    # read datasets (author's note: (4209, 378) / (4209, 377) for the original files)
    train_total = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)

    # Shuffle rows. The row count comes from the data instead of a hard-coded
    # 4209, so files of any size work.
    n_rows = len(train_total)
    order = np.arange(n_rows)
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(order)
    train_total = train_total.iloc[order]

    # split data
    threshold = int(ratio * n_rows)
    train_part = train_total.iloc[:threshold]
    val_part = train_total.iloc[threshold:]
    y_train = train_part['y']
    X_train = train_part.drop('y', axis=1)
    y_val = val_part['y']
    X_val = val_part.drop('y', axis=1)

    # Label-encode every non-numeric feature. The encoder is fitted on the
    # union of train and test values so categories that only appear in the
    # test file still receive a code. Checking "not numeric" rather than
    # `== 'object'` also covers pandas' dedicated string dtype.
    for c in X_train.columns:
        if not pd.api.types.is_numeric_dtype(X_train[c]):
            lbl = LabelEncoder()
            lbl.fit(list(train_total[c].values) + list(X_test[c].values))
            X_train[c] = lbl.transform(list(X_train[c].values))
            X_val[c] = lbl.transform(list(X_val[c].values))
            X_test[c] = lbl.transform(list(X_test[c].values))

    # shape
    print('Shape X_train:', X_train.shape)
    print('Shape X_test:', X_test.shape)
    print('Shape X_val:', X_val.shape )
    return X_train, y_train, X_val, y_val, X_test

def data(seed=None, train_path='../data/train.csv', test_path='../data/test.csv'):
    """Load the train/test CSVs, shuffle the training rows and label-encode the features.

    Parameters
    ----------
    seed : int or None, default None
        When given, the shuffle uses a private ``RandomState(seed)`` so the
        row order is reproducible. ``None`` keeps using NumPy's global RNG.
    train_path, test_path : str
        Locations of the CSV files. The training file must contain a ``y``
        column; every non-numeric column must exist in both files.

    Returns
    -------
    X_train, y_train, X_test
        Feature frames with non-numeric columns label-encoded, plus the
        ``y`` series aligned with ``X_train``.
    """
    print('generating data......')
    # read datasets (author's note: (4209, 378) / (4209, 377) for the original files)
    train_total = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)

    # Shuffle rows. The row count comes from the data instead of a hard-coded
    # 4209, so files of any size work.
    order = np.arange(len(train_total))
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(order)
    train_total = train_total.iloc[order]

    y_train = train_total['y']
    X_train = train_total.drop('y', axis=1)

    # Label-encode every non-numeric feature. The encoder is fitted on the
    # union of train and test values so categories that only appear in the
    # test file still receive a code. Checking "not numeric" rather than
    # `== 'object'` also covers pandas' dedicated string dtype.
    for c in X_train.columns:
        if not pd.api.types.is_numeric_dtype(X_train[c]):
            lbl = LabelEncoder()
            lbl.fit(list(X_train[c].values) + list(X_test[c].values))
            X_train[c] = lbl.transform(list(X_train[c].values))
            X_test[c] = lbl.transform(list(X_test[c].values))

    # shape
    print('Shape X_train:', X_train.shape)
    print('Shape X_test:', X_test.shape)
    return X_train, y_train, X_test


def turn():
    """Run a randomized hyper-parameter search for an XGBRegressor.

    Fits on the notebook-level globals ``X_train`` / ``y_train`` with 3-fold
    cross-validation, prints the best score and the winning parameters,
    pickles those parameters to ``bestpara.p`` and returns them.

    Returns
    -------
    dict
        The search's ``best_params_``.
    """
    boost = xgb.XGBRegressor()
    print('trunning model.....')
    parameters = {'learning_rate': [0.005],
                  'gamma': [0, 0.5],
                  'max_depth': [4, 9],
                  'min_child_weight': [1, 5],
                  'subsample': [0.6, 1],
                  'colsample_bytree': [0.6, 1],
                  }
    reg = RandomizedSearchCV(boost, parameters, n_jobs=8, cv=3, verbose=1)
    reg.fit(X_train, y_train)

    # `grid_scores_` no longer exists on scikit-learn search objects;
    # `best_score_` / `best_params_` expose the same "best mean CV score and
    # its parameters" that the old max(...) over grid_scores_ computed.
    best_parameters = reg.best_params_
    print(reg.best_score_)
    for param_name in sorted(best_parameters):
        print("%s: %r" % (param_name, best_parameters[param_name]))

    # Context manager so the file handle is flushed and closed deterministically.
    with open("bestpara.p", "wb") as fh:
        pickle.dump(best_parameters, fh)
    return best_parameters
    
def gradient_boost(trun = False, cv = False):
    """Train an XGBRegressor on the global training data, report a score and pickle it.

    Reads the notebook-level globals ``X_train`` and ``y_train``; when
    ``cv`` is false it additionally needs ``X_val`` and ``y_val`` (produced
    by ``data_with_val``).

    Parameters
    ----------
    trun : bool, default False
        When true, the default parameters below are replaced wholesale by
        the result of ``turn()``.
    cv : bool, default False
        When true, print the square root of the mean 3-fold cross-validated
        MSE; otherwise print the MSE on the validation set.

    Returns
    -------
    xgb.XGBRegressor
        The model fitted on the full ``X_train`` (also written to ``boost.p``).
    """
    print('trainning model......')
    xgb_params = {'n_estimators': 600,
                  'learning_rate': 0.05,
                  'max_depth': 4,
                  'subsample': 0.95,
                  # NOTE(review): newer XGBoost releases appear to call this
                  # objective 'reg:squarederror' - confirm against the
                  # installed version before renaming.
                  'objective': 'reg:linear',
                  # Start boosting from the mean target value.
                  'base_score': np.mean(y_train),
                  }
    if trun:
        xgb_params = turn()
    boost = xgb.XGBRegressor(**xgb_params)
    boost.fit(X_train, y_train)
    print('model trained, computing score.....')
    if cv:
        # The scorer returns negated MSE per fold; negate back before the root.
        scores = cross_val_score(boost, X_train, y_train, cv=3, scoring='neg_mean_squared_error')
        print("CV score:", np.sqrt(np.mean(-scores)))
    else:
        # Arguments in the documented (y_true, y_pred) order; MSE is symmetric,
        # so the printed value is unchanged.
        print('mse:', mean_squared_error(y_val, boost.predict(X_val)))

    # Context manager so the file handle is flushed and closed deterministically.
    with open("boost.p", "wb") as fh:
        pickle.dump(boost, fh)
    return boost

def NN():
    """Train a small fully connected regression network on the global training data.

    Reads the notebook-level globals ``X_train`` and ``y_train``. Every
    column of ``X_train`` is one-hot encoded, and the target is divided by
    its maximum before training, so the returned network predicts on that
    scaled target (the scale factor is not returned).

    Returns
    -------
    torch.nn.Sequential
        The trained network (input -> 30 -> 20 -> 1 with ReLU activations).
    """
    enc = OneHotEncoder(dtype = 'float32')
    enc.fit(X_train)
    X = enc.transform(X_train)

    y = y_train / np.max(y_train)
    # `Series.as_matrix()` was removed from pandas; `.values` is available in
    # every version. The target is shaped (N, 1) to match the network output:
    # a flat (N,) target would broadcast against the (N, 1) predictions
    # inside MSELoss and average over an N x N matrix instead of N residuals.
    y = y.values.astype(np.float32).reshape(-1, 1)

    X = Variable(torch.from_numpy(X.toarray()))
    y = Variable(torch.from_numpy(y))

    n0 = X.size(1)          # number of one-hot input features
    n1, n2, n3 = 30, 20, 1  # widths of the two hidden layers and the output
    net = torch.nn.Sequential(
        torch.nn.Linear(n0, n1),
        torch.nn.ReLU(),
        torch.nn.Linear(n1, n2),
        torch.nn.ReLU(),
        torch.nn.Linear(n2, n3),
    )

    optimizer = torch.optim.SGD(net.parameters(), lr=0.05)
    loss_func = torch.nn.MSELoss()  # mean squared error for regression

    # Full-batch gradient descent: every step uses the whole training set.
    for t in range(1000):
        prediction = net(X)
        loss = loss_func(prediction, y)  # (network output, target)
        if t % 50 == 0:
            print(loss)
        optimizer.zero_grad()   # clear gradients left from the previous step
        loss.backward()         # backpropagate
        optimizer.step()        # apply the update

    return net

def output(model=None, path='submition.csv'):
    """Predict on the global ``X_test`` and write the submission CSV.

    Parameters
    ----------
    model : object with ``predict``, optional
        Fitted estimator to use. When omitted, a notebook-level global named
        ``model`` is looked up, as the original implementation did.
    path : str, default 'submition.csv'
        Destination of the CSV file (columns ``id`` and ``y``).

    Raises
    ------
    NameError
        If no model is passed and no global ``model`` exists.
    """
    if model is None:
        try:
            model = globals()['model']
        except KeyError:
            raise NameError("output() needs a fitted model: pass it as "
                            "output(model) or define a global named 'model'")
    print('generating output file......')
    y_pred = model.predict(X_test)
    # Local name differs from the function name so the function is not shadowed.
    submission = pd.DataFrame({'id': X_test['ID'].astype(np.int32), 'y': y_pred})
    submission.to_csv(path, index=False)
    print('File upgraded!')
     

In [4]:
# ----------------------------------------------------------------------
# Driver cell.
# The helper functions read these notebook-level globals:
#   X_train, y_train, X_test   - always
#   X_val, y_val               - only for the hold-out (cv=False) scoring
#   model                      - looked up by output()
# NOTE: the fitted booster is bound to `model1` below, so assign
# `model = model1` before calling output().
# ----------------------------------------------------------------------

# Hold-out variant (70/30 split); use instead of data() to score with cv=False:
# X_train, y_train, X_val, y_val, X_test = data_with_val()
X_train, y_train, X_test = data()

# Optional target scaling:
# y_train = y_train / np.max(y_train)
model1 = gradient_boost(cv=True)
# model2 = NN()
# output()

generating data......
Shape X_train: (4209, 377)
Shape X_test: (4209, 377)
trainning model......
model trained, computing score.....
CV score: 8.79750964363


In [55]:
# Reload the booster that gradient_boost() pickled to boost.p.
# Unpickling executes arbitrary code: only load files you created yourself.
# The context manager closes the file handle once the object is loaded.
with open("boost.p", "rb") as fh:
    boost = pickle.load(fh)