In [12]:
import numpy as np
import pandas as pd
import random
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor


def random_forest():
    """Tune a random forest by random search and write test-set predictions.

    Reads 'train_data.csv', where column 0 is a string whose part before 'T'
    parses as an integer, column 1 is the target and the remaining columns
    are features. Twenty hyper-parameter combinations are drawn at random and
    each is scored by the summed squared error over 5 folds made of
    contiguous row blocks. The best combination is refit on every training
    row, applied to 'test_data.csv', and the predictions are written to
    'results_randomForest.csv' under the column 'price'.
    """
    raw_train = np.array(pd.read_csv('train_data.csv'))
    n_folders = 5

    # The target is scaled down by 1e5 for fitting and scaled back at the end.
    targets_all = raw_train[:, 1] / 100000
    # The integer parsed from column 0 becomes the first feature column.
    train_stamps = np.asarray([int(stamp.split('T')[0]) for stamp in raw_train[:, 0]])
    features_all = np.hstack((train_stamps[:, np.newaxis], raw_train[:, 2:]))
    n_rows = features_all.shape[0]

    best_depth, best_trees, best_feature_frac = -1, -1, -1
    lowest_cost = float("inf")

    for attempt in range(20):
        # Draw one candidate configuration (depth, tree count, feature fraction).
        depth = random.randint(1, 30)
        trees = random.randint(50, 250)
        feature_frac = random.uniform(0.1, 1)
        total_sq_error = 0

        for fold in range(n_folders):
            # Rows [lo, hi) are held out; every other row is used for fitting.
            lo = int(fold * n_rows / n_folders)
            hi = int((fold + 1) * n_rows / n_folders)
            held_out = list(range(lo, hi))
            kept = list(range(lo)) + list(range(hi, n_rows))

            fold_features = features_all[kept, :]
            fold_targets = targets_all[kept]
            check_features = features_all[held_out, :]
            check_targets = targets_all[held_out]

            model = RandomForestRegressor(
                max_depth=depth,
                n_estimators=trees,
                max_features=feature_frac,
                min_samples_split=25,
            )
            model.fit(fold_features, fold_targets)
            residuals = check_targets - model.predict(check_features)
            total_sq_error += sum(residuals * residuals)

        print('attempt {0}'.format(attempt))

        if total_sq_error < lowest_cost:
            print('attempt {0}, average error {1}'.format(attempt, math.sqrt(total_sq_error / n_rows)))
            lowest_cost = total_sq_error
            best_depth, best_trees, best_feature_frac = depth, trees, feature_frac
            print('parameters: max_depth {0}, n_trees {1}, number of features per split {2}'.format(
                best_depth, best_trees, int(best_feature_frac * features_all.shape[1])))

    print('finished')

    # Refit the winning configuration on the full training set.
    final_model = RandomForestRegressor(
        max_depth=best_depth,
        n_estimators=best_trees,
        max_features=best_feature_frac,
        min_samples_split=25,
    )
    final_model.fit(features_all, targets_all)

    # The test file has no target column: features start at column 1.
    raw_test = np.array(pd.read_csv('test_data.csv'))
    test_stamps = np.asarray([int(stamp.split('T')[0]) for stamp in raw_test[:, 0]])
    test_features = np.hstack((test_stamps[:, np.newaxis], raw_test[:, 1:]))

    predicted = final_model.predict(test_features) * 100000
    pd.DataFrame(predicted, columns=['price']).to_csv('results_randomForest.csv')

    
def neural_networks():
    """Tune an MLP regressor by random search and write test-set predictions.

    Reads 'train_data.csv', where column 0 is a string whose part before 'T'
    parses as an integer, column 1 is the target and the remaining columns
    are features. The features are standardized with the training mean and
    standard deviation. Twenty hyper-parameter combinations are drawn at
    random and each is scored by the summed squared error over 5 folds made
    of contiguous row blocks. The best combination is refit on every
    training row, applied to the equally standardized 'test_data.csv', and
    the predictions are written to 'results_deepLearning.csv' under the
    column 'price'.

    Progress is printed each time an attempt improves on the best cost.
    """
    train_data = pd.read_csv('train_data.csv')
    data_complete = np.array(train_data)
    n_folders = 5

    # Column 0 holds strings, so the converted frame is presumably an
    # object-dtype array; cast explicitly so all later arithmetic runs on
    # float64 values rather than Python objects.
    targets_all = (data_complete[:, 1] / 100000).astype(np.float64)
    dates = np.asarray([int(i.split('T')[0]) for i in data_complete[:, 0]])
    features_all = np.hstack((dates[:, np.newaxis], data_complete[:, 2:])).astype(np.float64)
    n_rows = features_all.shape[0]

    tmean = np.mean(features_all, axis=0)
    tstd = np.std(features_all, axis=0, dtype=np.float64)
    # A constant column has zero standard deviation; dividing by it would
    # inject NaN/inf into the features. Dividing by 1 instead leaves such a
    # column at all zeros after centering.
    tstd = np.where(tstd == 0, 1.0, tstd)
    features_all = (features_all - tmean) / tstd

    hidden_layer_size1_best = -1
    hidden_layer_size2_best = -1
    hidden_layer_size3_best = -1
    max_iter_best = -1
    alpha_best = -1
    learning_rate_best = -1
    min_cost = float("inf")

    for i in range(20):
        hidden_layer_size1_cur = random.randint(10, 30)
        hidden_layer_size2_cur = random.randint(10, 30)
        hidden_layer_size3_cur = random.randint(10, 30)
        max_iter_cur = random.randint(100, 1000)
        # Exponents are drawn uniformly, so alpha and the learning rate are
        # sampled on a log scale (1e-6..1e-2 and 1e-6..1e-3 respectively).
        alpha_cur = 10 ** random.uniform(-2, -6)
        learning_rate_cur = 10 ** random.uniform(-3, -6)
        cur_cost = 0.0

        for j in range(n_folders):
            # Rows [lo, hi) are held out; every other row is used for fitting.
            lo = j * n_rows // n_folders
            hi = (j + 1) * n_rows // n_folders

            data_test = features_all[lo:hi]
            targets_test = targets_all[lo:hi]
            data_train = np.concatenate((features_all[:lo], features_all[hi:]))
            targets_train = np.concatenate((targets_all[:lo], targets_all[hi:]))

            # NOTE(review): per the scikit-learn docs, validation_fraction is
            # only used when early_stopping=True, which is not set here --
            # confirm whether early stopping was intended.
            regr_1 = MLPRegressor(
                hidden_layer_sizes=(hidden_layer_size1_cur, hidden_layer_size2_cur, hidden_layer_size3_cur),
                batch_size=100,
                max_iter=max_iter_cur,
                verbose=False,
                validation_fraction=0.1,
                alpha=alpha_cur,
                learning_rate_init=learning_rate_cur,
            )
            regr_1.fit(data_train, targets_train)
            residuals = targets_test - regr_1.predict(data_test)
            cur_cost += float(np.sum(residuals * residuals))

        if cur_cost < min_cost:
            print('attempt {0}, average error {1}'.format(i, math.sqrt(cur_cost / n_rows)))
            min_cost = cur_cost
            hidden_layer_size1_best = hidden_layer_size1_cur
            hidden_layer_size2_best = hidden_layer_size2_cur
            hidden_layer_size3_best = hidden_layer_size3_cur
            max_iter_best = max_iter_cur
            alpha_best = alpha_cur
            learning_rate_best = learning_rate_cur
            print('parameters: layers {0}-{1}-{2}, epochs {3}, L2 {4}, learning_rate {5}'.format(
                hidden_layer_size1_best, hidden_layer_size2_best,
                hidden_layer_size3_best, max_iter_best, alpha_best, learning_rate_best))

    # Refit the winning configuration on the full training set.
    regr_1 = MLPRegressor(
        hidden_layer_sizes=(hidden_layer_size1_best, hidden_layer_size2_best, hidden_layer_size3_best),
        batch_size=100,
        max_iter=max_iter_best,
        verbose=False,
        validation_fraction=0.1,
        alpha=alpha_best,
        learning_rate_init=learning_rate_best,
    )
    regr_1.fit(features_all, targets_all)

    # The test file has no target column: features start at column 1.
    test_data = pd.read_csv('test_data.csv')
    data_test_complete = np.array(test_data)
    test_dates = np.asarray([int(i.split('T')[0]) for i in data_test_complete[:, 0]])
    data_test = np.hstack((test_dates[:, np.newaxis], data_test_complete[:, 1:])).astype(np.float64)
    # Apply the training-set statistics so test features share the same scale.
    data_test = (data_test - tmean) / tstd

    y_pred = regr_1.predict(data_test) * 100000
    df = pd.DataFrame(y_pred, columns=['price'])
    df.to_csv('results_deepLearning.csv')
    
    
# Run both experiments back to back. Each call reads 'train_data.csv' and
# 'test_data.csv' and writes its own results CSV.
random_forest()
neural_networks()

attempt 0
attempt 0, average error 1.5466368797475483
parameters: max_depth 10, n_trees 244, number of features per split 3
attempt 1
attempt 2
attempt 2, average error 1.390881003538186
parameters: max_depth 16, n_trees 237, number of features per split 6
attempt 3
attempt 4
attempt 5
attempt 5, average error 1.3651359519472008
parameters: max_depth 12, n_trees 195, number of features per split 16
attempt 6
attempt 6, average error 1.3573334826502863
parameters: max_depth 26, n_trees 159, number of features per split 9
attempt 7
attempt 8
attempt 9
attempt 10
attempt 11
attempt 12
attempt 13
attempt 14
attempt 15
attempt 16
attempt 17
attempt 17, average error 1.3542418696405762
parameters: max_depth 17, n_trees 219, number of features per split 9
attempt 18
attempt 19
attempt 19, average error 1.3539234672479161
parameters: max_depth 25, n_trees 168, number of features per split 10
finished
attempt 0, average error 1.2970889908013004
parameters: layers 25-17-24, epochs 980, L2 8.5926



attempt 4, average error 1.2640874623100375
parameters: layers 22-17-10, epochs 680, L2 0.00027453992211832156, learning_rate 0.0008431443403465931




attempt 7, average error 1.2627302273679135
parameters: layers 27-30-30, epochs 571, L2 0.0037701118194159713, learning_rate 0.0002118305675106496
attempt 8, average error 1.246605076011862
parameters: layers 29-26-11, epochs 688, L2 2.8023084192813437e-06, learning_rate 0.0001256440405645265


