In [1]:
using Gurobi, StatsBase, CSV, DataFrames, 
    JuMP, LinearAlgebra, Distributions, Random,
    GLMNet, ScikitLearn, MLBase, CategoricalArrays, Plots,
    Dates, Clustering, Distances, StatsPlots, ProgressMeter

In [2]:
using Statistics

In [3]:
ProgressMeter.ijulia_behavior(:clear);

In [4]:
ENV["COLUMNS"] = 100;

In [5]:
const GUROBI_ENV = Gurobi.Env();

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


# Loading DataSets

In [6]:
#data scaled in Excel.
#Both scaled and original data divided into training/validation/test sets using Python and read here

In [7]:
train = CSV.read("seed 40/train3.csv", DataFrame);
train_o = CSV.read("seed 40/train_o3.csv", DataFrame);
test = CSV.read("seed 40/test3.csv", DataFrame);
test_o = CSV.read("seed 40/test_o3.csv", DataFrame);

## Working with scaled data 

In [8]:
#Working with training data
data_1 = copy(train);
data_2 = copy(train);

In [9]:
#data_1: scaled data
names(train)

11-element Vector{String}:
 "Column1"
 "neigborhood_cat"
 "room_type"
 "minimum_nights"
 "number_of_reviews"
 "reviews_per_month"
 "calculated_host_listings_count"
 "availability_365"
 "number_of_reviews_ltm"
 "days_since_last_review"
 "price"

In [10]:
#Removing extra columns
data_1 = select!(data_1, Not([:Column1,:number_of_reviews_ltm]));
data_2 = select!(data_2, Not([:Column1,:number_of_reviews_ltm]));

In [11]:
function one_hot_encoding(data_copy)
    data_copy[!,:Pvt_room]=zeros(size(data_copy)[1]);
    data_copy[!,:Hotel]=zeros(size(data_copy)[1]);
    data_copy[!,:Shd_room]=zeros(size(data_copy)[1]);
    data_copy[!,:High]=zeros(size(data_copy)[1]);
    data_copy[!,:Mid_High]=zeros(size(data_copy)[1]);
    data_copy[!,:Middle]=zeros(size(data_copy)[1]);
    data_copy[!,:Mid_Low]=zeros(size(data_copy)[1]);
    n= size(data_copy)[1]
    for i =1:n
        if data_copy[i,:room_type]=="Private_room"
            data_copy[i,:Pvt_room]=1
        elseif data_copy[i,:room_type]=="Hotel room"
            data_copy[i,:Hotel]=1
        elseif data_copy[i,:room_type]=="Shared room"
             data_copy[i,:Shd_room]=1
        end

        if data_copy[i,:neigborhood_cat]=="High"
            data_copy[i,:High]=1
        elseif data_copy[i,:neigborhood_cat]=="Mid-Low"
            data_copy[i,:Mid_Low]=1
        elseif data_copy[i,:neigborhood_cat]=="Middle"
            data_copy[i,:Middle]=1
        elseif data_copy[i,:neigborhood_cat]=="Mid-High"
            data_copy[i,:Mid_High]=1
        end
    end


    return data_copy
end


one_hot_encoding (generic function with 1 method)

# Holistic Regression on entire data set

In [12]:
#Fitting holistic regression on each dataset
function holistic_regression(X, y, lambda, k, M; solver_output=0)
    n,p = size(X)
       
    # Build model
    model = Model(Gurobi.Optimizer)
    set_optimizer_attribute(model, "OutputFlag", solver_output) 
    
    # Insert variables
    @variable(model, beta[j=1:p])
    @variable(model, z[j=1:p], Bin)
    @variable(model, t>=0)
    @variable(model, a[j=1:p]>=0)
    
    #Insert constraints
    #Linearize norm 1
    @constraint(model,[j=1:p], beta[j]<=a[j])
    @constraint(model,[j=1:p], -beta[j]<=a[j])
    
    #The residual term
    @constraint(model, sum((y[i]-sum(X[i,j]*beta[j] for j=1:p))^2 for i=1:n) <= t)
    
    #Binary constraints
    @constraint(model,[j=1:p], beta[j]<= M*z[j])
    @constraint(model,[j=1:p], beta[j]>= (-1)*M*z[j])
    
    #Sparsity Constraint
    @constraint(model, sum(z[j] for j=1:p)<=k)

    #Group Sparsity
    #@constraint(model, [j=1:Int64(floor(p/4))], sum(z[l] for l=(4(j-1)+1):(4(j-1)+4)) <=1) 
    
    
    #Multi-collinearity
    #@constraint(model, [l=1:size(HC_i)[1]], (z[HC_i[l]] + z[HC_j[l]])<=1)
    
    #Objective
    @objective(model, Min, t + lambda*sum(a[j] for j=1:p))
    
    # Optimize
    optimize!(model)
    
    # Return estimated betas
    return (value.(beta))
end

holistic_regression (generic function with 1 method)

In [13]:
function compute_mse(X, y, beta)
    n,p = size(X)
    return sum((X*beta .- y).^2)/n
end

compute_mse (generic function with 1 method)

In [14]:
function compute_R2(X, y, beta)
    n,p = size(X)
    y_bar=mean(y)
    SSE = sum((X*beta - y).^2)
    SST = sum((y .- y_bar).^2)
    return (1-(SSE/SST))
    
end

compute_R2 (generic function with 1 method)

In [15]:
#Hypertuning the parameter lambda

function hypertuning(lambda_vals,support,X,y; method=holistic_regression, solver_output=0)
    
    
    #we create an array to hold the results
    errors = zeros(length(lambda_vals),length(support))
    
    
    for (i,lambda) in enumerate(lambda_vals)
        for (j,k) in enumerate(support)
            
            #get the beta coefficients from the Lasso or Ridge regression
            beta = method(X,y,lambda,k,30,solver_output=solver_output)
            #compute the MSE with the optimal beta we just found
            errors[i,j] = compute_mse(Matrix(X), y, beta)
        end
    end
    
    #get the best performing lambda
    i_best = findmin(errors)[2][1]
    j_best = findmin(errors)[2][2]
    #beta_best = method(X,y,lambda_vals[i_best])
    return lambda_vals[i_best], support[j_best]
end

hypertuning (generic function with 1 method)

In [16]:
lambda_vals = [.1,.2,.25,.4,.5,.6,.75,.8,.9,1]
support = 4:14;

In [17]:
holistic_train_data= one_hot_encoding(data_1);

In [18]:
holistic_train_data= select!(holistic_train_data, Not([:neigborhood_cat,:room_type]));

In [19]:
best_lambda, best_support = hypertuning(lambda_vals,support,select(holistic_train_data, Not([:price])),holistic_train_data[:,:price])

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18

(0.1, 12)

In [20]:
beta_entire_holistic = holistic_regression(select(holistic_train_data, Not([:price])),holistic_train_data[:,:price],
    best_lambda,best_support,20);

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [21]:
entire_train_R2 = compute_R2(Matrix(select(holistic_train_data, Not([:price]))),
    holistic_train_data[:,:price],beta_entire_holistic)

0.17146118803864774

In [22]:
#Fitting holistic on test data
holistic_test_data = copy(test);
holistic_test_data= select!(holistic_test_data, Not([:Column1,:number_of_reviews_ltm]));


In [23]:
names(holistic_test_data)

9-element Vector{String}:
 "neigborhood_cat"
 "room_type"
 "minimum_nights"
 "number_of_reviews"
 "reviews_per_month"
 "calculated_host_listings_count"
 "availability_365"
 "days_since_last_review"
 "price"

In [24]:
holistic_test_data= one_hot_encoding(holistic_test_data);

In [25]:
holistic_test_data= select!(holistic_test_data, Not([:neigborhood_cat,:room_type]));

In [26]:
entire_test_R2 = compute_R2(Matrix(select(holistic_test_data, Not([:price]))),
    holistic_test_data[:,:price],beta_entire_holistic)

0.18262770366615833

# Clustering the Data

We observe very different scales on the numerical data, so for those we can apply a min-max scaling

$$\frac{x_i-\min x_i}{\max x_i-\min x_i}$$

In [27]:
#Segregating numeric and categorical data
cat_features = [1,2]
num_features = [3,4,5,6,7,8,9];

In [28]:
num_data = data_2[:,num_features]
cat_data = data_2[:,cat_features];

## Distance Matrix

In [29]:
#Let k=13 number of cluster
k=13;

In [30]:
#select k random centroids
Random.seed!(15095)
n=size(train)[1]
random_index = rand(k);

In [31]:
random_index = Int64.(ceil.(random_index.*n));

In [32]:
centroids_data = data_2[random_index,:];

In [33]:
function distance_matrix(centroids_data,k,alpha)
    #Calculating distance matrix nXk for numerical features as euclidean distance
    n=size(data_2)[1]
    distance_num = zeros(n,k);
    for j=1:k
        for i=1:n
           for p=1:size(num_features)[1]
                distance_num[i,j] = distance_num[i,j]+ 
                (num_data[i,p]-centroids_data[j,num_features[p]])^2
            end
            distance_num[i,j]=sqrt(distance_num[i,j])
        end
    end
    #Calculating distance matrix nXk for categorical features as indicator function
    #Distance (point i and centroid k) = Number of categorical variables with same value between i and k
    distance_cat = zeros(n,k);
    for j=1:k
        for i=1:n
           for p=1:size(cat_features)[1]
                if cat_data[i,cat_features[p]]!=centroids_data[j,cat_features[p]]
                    distance_cat[i,j]+=1
                end
            end

        end
    end
    #Calculating final distance matrix as alpha*distance_num + (1-alpha)*distance_cat
    #Ideally use cross validation to find alpha but for now use alpha = 0.5
    distance_mat = alpha.*distance_num + (1-alpha).*distance_cat
    return (distance_mat)
end


distance_matrix (generic function with 1 method)

## Clustering

In [34]:
#We solve clustering as an optimization problem
#We say that clustering stops when the total distance between points and their cluster centroids cannot be
#reduced further. Tolerance = 1e-2

In [35]:
tolerance = 1e-6;
dist=0;
alpha = 0.5;

In [36]:
clusters = Int64.(zeros(n));

In [37]:
train_data_with_clusters = copy(data_2);

In [38]:
#Adding clusters column to the dataframe
train_data_with_clusters[!, :Clusters] = clusters;

In [39]:
names(train_data_with_clusters)

10-element Vector{String}:
 "neigborhood_cat"
 "room_type"
 "minimum_nights"
 "number_of_reviews"
 "reviews_per_month"
 "calculated_host_listings_count"
 "availability_365"
 "days_since_last_review"
 "price"
 "Clusters"

In [40]:
dist_prev_iter = 1000000
dist= 999999
iterations = 0
while (dist_prev_iter-dist)>tolerance   #while not convergence
    
    dist_prev_iter = dist
    
    #Get distances between points and centroids
    distance = distance_matrix(centroids_data,k,alpha)
    
    #assign point to nearest cluster
    for i=1:n
        clusters[i]=findmin(distance[i,:])[2]
    end
    
    #updating assignments in dataframe
    train_data_with_clusters[!,:Clusters]=clusters
    
    #Calculating total distance based on cluster assignment
    dist=0
    for i=1:n
        dist = dist + distance[i,clusters[i]]
    end
    dist = dist/n
    #UPDATE CENTROIDS
    #we average numerical features for all points in the cluster 
    #For categorical variable, we find the mode within the cluster
    
    for j=1:k
        
        #sum of numerical features for points in cluster j
        temp_sum = zeros(size(num_features)[1])
        for p=1:size(num_features)[1]
           temp_sum[p]=sum(train_data_with_clusters[train_data_with_clusters[:,:Clusters].==j,num_features[p]])
        end
        #count the number of points in cluster j
        count = sum(train_data_with_clusters[:,:Clusters].==j)
    
        #updating centroid j for each numerical feature with average
        for p=1:size(num_features)[1]
            centroids_data[j,num_features[p]]=temp_sum[p]/count
        end
        
        #updating centroid j for each categorical feature with mode
        for c=1:size(cat_features)[1]
            temp_mode = StatsBase.mode(train_data_with_clusters[train_data_with_clusters[:,:Clusters].==j,cat_features[c]])
            centroids_data[j,cat_features[c]]=temp_mode
        end
        
    end
    iterations+=1
end

    
    

In [41]:
iterations

27

In [158]:
#Number of points in each cluster
for j=1:k
    count = sum(train_data_with_clusters[:,:Clusters].==j)
    print("\nCluster ", j, " : ", count)
end


Cluster 1 : 2948
Cluster 2 : 1571
Cluster 3 : 2073
Cluster 4 : 5043
Cluster 5 : 5131
Cluster 6 : 2659
Cluster 7 : 1491
Cluster 8 : 3073
Cluster 9 : 3171
Cluster 10 : 1923
Cluster 11 : 1151
Cluster 12 : 4568
Cluster 13 : 5752

## Fitting an OCT to the cluster assignments

data_oct = vcat(train_o,val_o);

size(data_oct)

typeof(clusters)

data_oct_use = select!(data_oct, Not([:Column1,:number_of_reviews_ltm]));

#data_oct_use is original unscaled data

names(data_oct_use)

data_oct_use[!,:room_type] = CategoricalArray(data_oct_use[:,:room_type]);
data_oct_use[!,:neigborhood_cat] = CategoricalArray(new_data_oct_use[:,:neigborhood_cat]);

lnr = IAI.OptimalTreeClassifier(random_seed=1234)
grid_oct = IAI.GridSearch(lnr,
    max_depth=1:3,
    minbucket=2500
)
IAI.fit!(grid_oct,data_oct_use,clusters);


lnr_oct = IAI.get_learner(grid_oct)

# Fitting regression model on each cluster

In [42]:
names(train_data_with_clusters)

10-element Vector{String}:
 "neigborhood_cat"
 "room_type"
 "minimum_nights"
 "number_of_reviews"
 "reviews_per_month"
 "calculated_host_listings_count"
 "availability_365"
 "days_since_last_review"
 "price"
 "Clusters"

In [43]:
#Segregating data for each cluster

In [44]:
data_copy = copy(train_data_with_clusters);

In [45]:
data_copy = one_hot_encoding(data_copy);

In [46]:
names(data_copy)

17-element Vector{String}:
 "neigborhood_cat"
 "room_type"
 "minimum_nights"
 "number_of_reviews"
 "reviews_per_month"
 "calculated_host_listings_count"
 "availability_365"
 "days_since_last_review"
 "price"
 "Clusters"
 "Pvt_room"
 "Hotel"
 "Shd_room"
 "High"
 "Mid_High"
 "Middle"
 "Mid_Low"

In [47]:
data_copy = select!(data_copy, Not([:neigborhood_cat,:room_type]));

In [48]:
#segregating xgb_train_cluster
function segregate_clusters(data,i)
    cluster_data = data[data[:,:Clusters].==i,:];
    return cluster_data
end

segregate_clusters (generic function with 1 method)

In [49]:
#Segregating cluster data
cluster_1 = segregate_clusters(data_copy,1)
cluster_2 = segregate_clusters(data_copy,2)
cluster_3 = segregate_clusters(data_copy,3)
cluster_4 = segregate_clusters(data_copy,4)
cluster_5 = segregate_clusters(data_copy,5)
cluster_6 = segregate_clusters(data_copy,6)
cluster_7 = segregate_clusters(data_copy,7)
cluster_8 = segregate_clusters(data_copy,8)
cluster_9 = segregate_clusters(data_copy,9)
cluster_10 = segregate_clusters(data_copy,10)
cluster_11 = segregate_clusters(data_copy,11)
cluster_12 = segregate_clusters(data_copy,12)
cluster_13 = segregate_clusters(data_copy,13);



In [50]:
holistic_cluster_R2=zeros(k);
i=1;
a=1;

In [51]:
#regression for cluster 1
beta_1 = holistic_regression(select(cluster_1, Not([:Clusters,:price])),cluster_1[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_1, Not([:price,:Clusters]))),
    cluster_1[:,:price],beta_1)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [52]:
#regression for cluster 2
beta_2 = holistic_regression(select(cluster_2, Not([:Clusters,:price])),cluster_2[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_2, Not([:price,:Clusters]))),
    cluster_2[:,:price],beta_2)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [53]:
#regression for cluster 3
beta_3 = holistic_regression(select(cluster_3, Not([:Clusters,:price])),cluster_3[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_3, Not([:price,:Clusters]))),
    cluster_3[:,:price],beta_3)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [54]:
#regression for cluster 4
beta_4 = holistic_regression(select(cluster_4, Not([:Clusters,:price])),cluster_4[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_4, Not([:price,:Clusters]))),
    cluster_4[:,:price],beta_4)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [55]:
#regression for cluster 5
beta_5 = holistic_regression(select(cluster_5, Not([:Clusters,:price])),cluster_5[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_5, Not([:price,:Clusters]))),
    cluster_5[:,:price],beta_5)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [56]:
#regression for cluster 6
beta_6 = holistic_regression(select(cluster_6, Not([:Clusters,:price])),cluster_6[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_6, Not([:price,:Clusters]))),
    cluster_6[:,:price],beta_6)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [57]:
#regression for cluster 7
beta_7 = holistic_regression(select(cluster_7, Not([:Clusters,:price])),cluster_7[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_7, Not([:price,:Clusters]))),
    cluster_7[:,:price],beta_7)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [58]:
#regression for cluster 8
beta_8 = holistic_regression(select(cluster_8, Not([:Clusters,:price])),cluster_8[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_8, Not([:price,:Clusters]))),
    cluster_8[:,:price],beta_8)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [59]:
#regression for cluster 9
beta_9 = holistic_regression(select(cluster_9, Not([:Clusters,:price])),cluster_9[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_9, Not([:price,:Clusters]))),
    cluster_9[:,:price],beta_9)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [60]:
#regression for cluster 10
beta_10 = holistic_regression(select(cluster_10, Not([:Clusters,:price])),cluster_10[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_10, Not([:price,:Clusters]))),
    cluster_10[:,:price],beta_10)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [61]:
#regression for cluster 11
beta_11 = holistic_regression(select(cluster_11, Not([:Clusters,:price])),cluster_11[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_11, Not([:price,:Clusters]))),
    cluster_11[:,:price],beta_11)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [62]:
#regression for cluster 12
beta_12 = holistic_regression(select(cluster_12, Not([:Clusters,:price])),cluster_12[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_12, Not([:price,:Clusters]))),
    cluster_12[:,:price],beta_12)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [63]:
#regression for cluster 13
beta_13 = holistic_regression(select(cluster_13, Not([:Clusters,:price])),cluster_13[:,:price],best_lambda,best_support,20);
holistic_cluster_R2[i]=compute_R2(Matrix(select(cluster_13, Not([:price,:Clusters]))),
    cluster_13[:,:price],beta_13)
i=i+a;

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [64]:
holistic_cluster_R2

13-element Vector{Float64}:
 0.0766697657774329
 0.05279239641338496
 0.06521945731720868
 0.037219779802967645
 0.14392706745320483
 0.0802073499066891
 0.07980649573078558
 0.20720899334898435
 0.0801659523434235
 0.019414841851183695
 0.6165562151886559
 0.05587503145732453
 0.10094953334059187

## Test set predictions and evaluation

Since the training predictions are very poor, we don't evaluate on test data

# Fitting XGBoost on training data and testing data

## XGBoost Function

In [65]:
function xgb_training(X,y)

    grid = IAI.GridSearch(
        IAI.XGBoostRegressor(
            random_seed=1,
        ),
        max_depth=range(2, 6),
        num_round=[20, 50, 100],
    )

    IAI.fit_cv!(grid,X,y,n_folds=5)
    return grid
end



xgb_training (generic function with 1 method)

## Fitting XGBoost on entire data

In [66]:
xgb_train = copy(train);

In [67]:
xgb_train = select!(xgb_train, Not([:Column1,:number_of_reviews_ltm]));

In [68]:
xgb_train = one_hot_encoding(xgb_train);

In [69]:
names(xgb_train)

16-element Vector{String}:
 "neigborhood_cat"
 "room_type"
 "minimum_nights"
 "number_of_reviews"
 "reviews_per_month"
 "calculated_host_listings_count"
 "availability_365"
 "days_since_last_review"
 "price"
 "Pvt_room"
 "Hotel"
 "Shd_room"
 "High"
 "Mid_High"
 "Middle"
 "Mid_Low"

In [70]:
xgb_train= select!(xgb_train, Not([:neigborhood_cat,:room_type]));

In [71]:
#Fitting XGB model
xgb_grid = xgb_training(select(xgb_train, Not([:price])),xgb_train[:,:price]);

LoadError: UndefVarError: IAI not defined

In [72]:
#Looking at important variables
xgb_lnr = IAI.get_learner(xgb_grid)
IAI.variable_importance(xgb_lnr)

LoadError: UndefVarError: IAI not defined

In [251]:
#Calculating training set R2
train_R2 = IAI.score(xgb_lnr,select(xgb_train, Not([:price])),xgb_train[:,:price], criterion=:mse)

0.49233486704011376

In [254]:
#Making predictions on test set
xgb_test = copy(test)
xgb_test = select!(xgb_test, Not([:Column1,:number_of_reviews_ltm]))
xgb_test = one_hot_encoding(xgb_test)
xgb_test= select!(xgb_test, Not([:neigborhood_cat,:room_type]));

In [255]:
#Test set R2
xgb_test_R2 = IAI.score(xgb_lnr,select(xgb_test, Not([:price])),xgb_test[:,:price], criterion=:mse)

0.3723888174337785

## Predicting using XGBoost on clusters

## Segregating training data into clusters and training XGBoost for each 

In [259]:
cluster_train_R2 = zeros(k)
cluster_test_R2 = zeros(k);

In [260]:
i=1;

In [261]:
#Cluster 1
xgb_train_1 = copy(cluster_1)
xgb_grid_1 = xgb_training(select(xgb_train_1, Not([:price,:Clusters])),xgb_train_1[:,:price]);
xgb_lnr_1 = IAI.get_learner(xgb_grid_1)
R2 = IAI.score(xgb_lnr_1,select(xgb_train_1, Not([:price,:Clusters])),xgb_train_1[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [262]:
#Cluster 2
xgb_train_2 = copy(cluster_2)
xgb_grid_2 = xgb_training(select(xgb_train_2, Not([:price,:Clusters])),xgb_train_2[:,:price]);
xgb_lnr_2 = IAI.get_learner(xgb_grid_2)
R2 = IAI.score(xgb_lnr_2,select(xgb_train_2, Not([:price,:Clusters])),xgb_train_2[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [263]:
#Cluster 3
xgb_train_3 = copy(cluster_3)
xgb_grid_3 = xgb_training(select(xgb_train_3, Not([:price,:Clusters])),xgb_train_3[:,:price]);
xgb_lnr_3 = IAI.get_learner(xgb_grid_3)
R2 = IAI.score(xgb_lnr_3,select(xgb_train_3, Not([:price,:Clusters])),xgb_train_3[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [264]:
#Cluster 4
xgb_train_4 = copy(cluster_4)
xgb_grid_4 = xgb_training(select(xgb_train_4, Not([:price,:Clusters])),xgb_train_4[:,:price]);
xgb_lnr_4 = IAI.get_learner(xgb_grid_4)
R2 = IAI.score(xgb_lnr_4,select(xgb_train_4, Not([:price,:Clusters])),xgb_train_4[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [265]:
#Cluster 5
xgb_train_5 = copy(cluster_5)
xgb_grid_5 = xgb_training(select(xgb_train_5, Not([:price,:Clusters])),xgb_train_5[:,:price]);
xgb_lnr_5 = IAI.get_learner(xgb_grid_5)
R2 = IAI.score(xgb_lnr_5,select(xgb_train_5, Not([:price,:Clusters])),xgb_train_5[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [266]:
#Cluster 6
xgb_train_6 = copy(cluster_6)
xgb_grid_6 = xgb_training(select(xgb_train_6, Not([:price,:Clusters])),xgb_train_6[:,:price]);
xgb_lnr_6 = IAI.get_learner(xgb_grid_6)
R2 = IAI.score(xgb_lnr_6,select(xgb_train_6, Not([:price,:Clusters])),xgb_train_6[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [267]:
#Cluster 7
xgb_train_7 = copy(cluster_7)
xgb_grid_7 = xgb_training(select(xgb_train_7, Not([:price,:Clusters])),xgb_train_7[:,:price]);
xgb_lnr_7 = IAI.get_learner(xgb_grid_7)
R2 = IAI.score(xgb_lnr_7,select(xgb_train_7, Not([:price,:Clusters])),xgb_train_7[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [268]:
#Cluster 8
xgb_train_8 = copy(cluster_8)
xgb_grid_8 = xgb_training(select(xgb_train_8, Not([:price,:Clusters])),xgb_train_8[:,:price]);
xgb_lnr_8 = IAI.get_learner(xgb_grid_8)
R2 = IAI.score(xgb_lnr_8,select(xgb_train_8, Not([:price,:Clusters])),xgb_train_8[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [269]:
#Cluster 9
xgb_train_9 = copy(cluster_9)
xgb_grid_9 = xgb_training(select(xgb_train_9, Not([:price,:Clusters])),xgb_train_9[:,:price]);
xgb_lnr_9 = IAI.get_learner(xgb_grid_9)
R2 = IAI.score(xgb_lnr_9,select(xgb_train_9, Not([:price,:Clusters])),xgb_train_9[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [270]:
#Cluster 10
xgb_train_10 = copy(cluster_10)
xgb_grid_10 = xgb_training(select(xgb_train_10, Not([:price,:Clusters])),xgb_train_10[:,:price]);
xgb_lnr_10 = IAI.get_learner(xgb_grid_10)
R2 = IAI.score(xgb_lnr_10,select(xgb_train_10, Not([:price,:Clusters])),xgb_train_10[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [271]:
#Cluster 11
xgb_train_11 = copy(cluster_11)
xgb_grid_11 = xgb_training(select(xgb_train_11, Not([:price,:Clusters])),xgb_train_11[:,:price]);
xgb_lnr_11 = IAI.get_learner(xgb_grid_11)
R2 = IAI.score(xgb_lnr_11,select(xgb_train_11, Not([:price,:Clusters])),xgb_train_11[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [272]:
#Cluster 12
xgb_train_12 = copy(cluster_12)
xgb_grid_12 = xgb_training(select(xgb_train_12, Not([:price,:Clusters])),xgb_train_12[:,:price]);
xgb_lnr_12 = IAI.get_learner(xgb_grid_12)
R2 = IAI.score(xgb_lnr_12,select(xgb_train_12, Not([:price,:Clusters])),xgb_train_12[:,:price], criterion=:mse)
cluster_train_R2[i]=R2
i+=1;

In [273]:
#Cluster 13
xgb_train_13 = copy(cluster_13)
xgb_grid_13 = xgb_training(select(xgb_train_13, Not([:price,:Clusters])),xgb_train_13[:,:price]);
xgb_lnr_13 = IAI.get_learner(xgb_grid_13)
R2 = IAI.score(xgb_lnr_13,select(xgb_train_13, Not([:price,:Clusters])),xgb_train_13[:,:price], criterion=:mse)
cluster_train_R2[i]=R2;


In [274]:
#Viewing training R2
cluster_train_R2

13-element Vector{Float64}:
 0.4948000798711403
 0.6394512097129152
 0.2287582863147628
 0.3027928689369206
 0.5693012736913479
 0.6685306191625906
 0.37456247789119446
 0.2795041533197209
 0.2691698063999177
 0.5048811233409722
 0.2965799464994775
 0.34372920810174434
 0.5694313664606441

## Calculating centroids


To make predictions on test set, we first need to assign each test set to the cluster nearest to it and then apply the corresponding trained XGBoost model for the cluster on the test data.

In [254]:

#xgb_train_i: training data belonging to cluster i, with Clusters column , 15 columns

In [277]:
function get_centroid(cluster_data,i)
    centroid = zeros(size(cluster_data)[2])
    #centroid is average for numerical columns
    centroid = mean.(eachcol(cluster_data))
    #centroid is mode for categorical columns
    centroid[8:14].=0
    roomtype =[sum(cluster_data[:,8]),sum(cluster_data[:,9]),sum(cluster_data[:,10])]
    neigh_cat = [sum(cluster_data[:,11]),sum(cluster_data[:,12]),sum(cluster_data[:,13]),sum(cluster_data[:,14])]
    rtype = findmax(roomtype)[2]
    ncat =  findmax(neigh_cat)[2]   
    centroid[7+rtype]=1
    centroid[10+ncat]=1
    
    return centroid
end
    

get_centroid (generic function with 1 method)

In [278]:
names(xgb_train_1)

15-element Vector{String}:
 "minimum_nights"
 "number_of_reviews"
 "reviews_per_month"
 "calculated_host_listings_count"
 "availability_365"
 "days_since_last_review"
 "price"
 "Clusters"
 "Pvt_room"
 "Hotel"
 "Shd_room"
 "High"
 "Mid_High"
 "Middle"
 "Mid_Low"

In [279]:
#creating a dataframe of centroids
centroid_df= zeros(k,size(xgb_train_1)[2]-1)
size(centroid_df)

(13, 14)

In [280]:
a=1;
I=size(centroid_df)[2]; #14

In [281]:
#Centroid 1
centroid = get_centroid(select(xgb_train_1,Not([:Clusters])),1)
for i=a:I
    centroid_df[1,i]=centroid[i]
end

In [282]:
#Centroid 2
centroid = get_centroid(select(xgb_train_2,Not([:Clusters])),2)
for i=a:I
    centroid_df[2,i]=centroid[i]
end

In [283]:
#Centroid 3
centroid = get_centroid(select(xgb_train_3,Not([:Clusters])),3)
for i=a:I
    centroid_df[3,i]=centroid[i]
end

In [284]:
#Centroid 4
centroid = get_centroid(select(xgb_train_4,Not([:Clusters])),4)
for i=a:I
    centroid_df[4,i]=centroid[i]
end

In [285]:
#Centroid 5
centroid = get_centroid(select(xgb_train_5,Not([:Clusters])),5)
for i=a:I
    centroid_df[5,i]=centroid[i]
end

In [286]:
#Centroid 6
centroid = get_centroid(select(xgb_train_6,Not([:Clusters])),6)
for i=a:I
    centroid_df[6,i]=centroid[i]
end

In [287]:
#Centroid 7
centroid = get_centroid(select(xgb_train_7,Not([:Clusters])),7)
for i=a:I
    centroid_df[7,i]=centroid[i]
end

In [288]:
#Centroid 8
centroid = get_centroid(select(xgb_train_8,Not([:Clusters])),8)
for i=a:I
    centroid_df[8,i]=centroid[i]
end

In [289]:
#Centroid 9
centroid = get_centroid(select(xgb_train_9,Not([:Clusters])),9)
for i=a:I
    centroid_df[9,i]=centroid[i]
end

In [290]:
#Centroid 10
centroid = get_centroid(select(xgb_train_10,Not([:Clusters])),10)
for i=a:I
    centroid_df[10,i]=centroid[i]
end

In [291]:
#Centroid 11
centroid = get_centroid(select(xgb_train_11,Not([:Clusters])),11)
for i=a:I
    centroid_df[11,i]=centroid[i]
end

In [292]:
#Centroid 12
centroid = get_centroid(select(xgb_train_12,Not([:Clusters])),12)
for i=a:I
    centroid_df[12,i]=centroid[i]
end

In [293]:
#Centroid 13
centroid = get_centroid(select(xgb_train_13,Not([:Clusters])),13)
for i=a:I
    if i==8
        centroid_df[13,i]= floor(centroid[i])
    else 
        centroid_df[13,i]=centroid[i]
    end
end

## Assigning each point in test set to nearest cluster

In [294]:
size(centroid_df)

(13, 14)

In [295]:
centroid_no_price = centroid_df[1:end, 1:end .!= 7];

In [296]:
size(centroid_no_price)

(13, 13)

In [297]:
names(xgb_test)

14-element Vector{String}:
 "minimum_nights"
 "number_of_reviews"
 "reviews_per_month"
 "calculated_host_listings_count"
 "availability_365"
 "days_since_last_review"
 "price"
 "Pvt_room"
 "Hotel"
 "Shd_room"
 "High"
 "Mid_High"
 "Middle"
 "Mid_Low"

In [298]:
xgb_test_data=select(xgb_test,Not([:price]))
names(xgb_test_data)

13-element Vector{String}:
 "minimum_nights"
 "number_of_reviews"
 "reviews_per_month"
 "calculated_host_listings_count"
 "availability_365"
 "days_since_last_review"
 "Pvt_room"
 "Hotel"
 "Shd_room"
 "High"
 "Mid_High"
 "Middle"
 "Mid_Low"

In [299]:
size(xgb_test_data)

(27036, 13)

In [300]:
n=size(xgb_test_data)[1]
m=size(xgb_test_data)[2]
test_cluster = zeros(n);

In [301]:
#Assigning nearest cluster to each point in test dataset
distance_mat = zeros(n,k)
dist=0

for i=1:n #data points
    for j=1:k #Clusters
        dist=0
        for p=1:m #features in test data
            dist = dist + (xgb_test_data[i,p]- centroid_no_price[j,p])^2
        end
        distance_mat[i,j] = dist/m
    end
    
    test_cluster[i]=findmin(distance_mat[i,:])[2]
end



In [302]:
test_cluster = Int64.(test_cluster);

In [303]:
xgb_test_cluster = xgb_test
xgb_test_cluster[!,:Clusters]= test_cluster;
names(xgb_test_cluster)

15-element Vector{String}:
 "minimum_nights"
 "number_of_reviews"
 "reviews_per_month"
 "calculated_host_listings_count"
 "availability_365"
 "days_since_last_review"
 "price"
 "Pvt_room"
 "Hotel"
 "Shd_room"
 "High"
 "Mid_High"
 "Middle"
 "Mid_Low"
 "Clusters"

In [304]:
#Segregating test data into clusters and evaluating R2 on each
i=1;
cluster_test_R2 = zeros(k);

In [306]:
function compute_R2_2(y, y_pred)
    
    y_bar=mean(y)
    SSE = sum((y_pred - y).^2)
    SST = sum((y .- y_bar).^2)
    return (1-(SSE/SST))
    
end


compute_R2_2 (generic function with 1 method)

In [307]:
#Cluster 1
xgb_test_1 = segregate_clusters(xgb_test_cluster,1)
xgb_test_1_price = IAI.predict(xgb_lnr_1,select(xgb_test_1,Not([:Clusters,:price])))
R2 = compute_R2_2(xgb_test_1[:,:price],xgb_test_1_price)
cluster_test_R2[i]=R2
i+=1;

In [308]:
R2

0.013392675716019764

In [309]:
#Cluster 2
xgb_test_2 = segregate_clusters(xgb_test_cluster,2)
xgb_test_2_price = IAI.predict(xgb_lnr_2,select(xgb_test_2,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_2[:,:price],xgb_test_2_price)
cluster_test_R2[i]=R2
i+=1;

In [310]:
#Cluster 3
xgb_test_3 = segregate_clusters(xgb_test_cluster,3)
xgb_test_3_price = IAI.predict(xgb_lnr_3,select(xgb_test_3,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_3[:,:price],xgb_test_3_price)
cluster_test_R2[i]=R2
i+=1;

In [311]:
#Cluster 4
xgb_test_4 = segregate_clusters(xgb_test_cluster,4)
xgb_test_4_price = IAI.predict(xgb_lnr_4,select(xgb_test_4,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_4[:,:price],xgb_test_4_price)
cluster_test_R2[i]=R2
i+=1;

In [312]:
#Cluster 5
xgb_test_5 = segregate_clusters(xgb_test_cluster,5)
xgb_test_5_price = IAI.predict(xgb_lnr_5,select(xgb_test_5,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_5[:,:price],xgb_test_5_price)
cluster_test_R2[i]=R2
i+=1;

In [313]:
#Cluster 6
xgb_test_6 = segregate_clusters(xgb_test_cluster,6)
xgb_test_6_price = IAI.predict(xgb_lnr_6,select(xgb_test_6,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_6[:,:price],xgb_test_6_price)
cluster_test_R2[i]=R2
i+=1;

In [314]:
#Cluster 7
xgb_test_7 = segregate_clusters(xgb_test_cluster,7)
xgb_test_7_price = IAI.predict(xgb_lnr_7,select(xgb_test_7,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_7[:,:price],xgb_test_7_price)
cluster_test_R2[i]=R2
i+=1;

In [315]:
#Cluster 8
xgb_test_8 = segregate_clusters(xgb_test_cluster,8)
xgb_test_8_price = IAI.predict(xgb_lnr_8,select(xgb_test_8,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_8[:,:price],xgb_test_8_price)
cluster_test_R2[i]=R2
i+=1;

In [316]:
#Cluster 9
xgb_test_9 = segregate_clusters(xgb_test_cluster,9)
xgb_test_9_price = IAI.predict(xgb_lnr_9,select(xgb_test_9,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_9[:,:price],xgb_test_9_price)
cluster_test_R2[i]=R2
i+=1;

In [317]:
#Cluster 10
xgb_test_10 = segregate_clusters(xgb_test_cluster,10)
xgb_test_10_price = IAI.predict(xgb_lnr_10,select(xgb_test_10,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_10[:,:price],xgb_test_10_price)
cluster_test_R2[i]=R2
i+=1;

In [318]:
#Cluster 11
xgb_test_11 = segregate_clusters(xgb_test_cluster,11)
xgb_test_11_price = IAI.predict(xgb_lnr_11,select(xgb_test_11,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_11[:,:price],xgb_test_11_price)
cluster_test_R2[i]=R2
i+=1;

In [319]:
#Cluster 12
xgb_test_12 = segregate_clusters(xgb_test_cluster,12)
xgb_test_12_price = IAI.predict(xgb_lnr_12,select(xgb_test_12,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_12[:,:price],xgb_test_12_price)
cluster_test_R2[i]=R2
i+=1;

In [320]:
#Cluster 13
xgb_test_13 = segregate_clusters(xgb_test_cluster,13)
xgb_test_13_price = IAI.predict(xgb_lnr_13,select(xgb_test_13,Not([:price,:Clusters])))
R2 = compute_R2_2(xgb_test_13[:,:price],xgb_test_13_price)
cluster_test_R2[i]=R2
i+=1;

In [321]:
#Cluster wise test set R2
cluster_test_R2

13-element Vector{Float64}:
  0.013392675716019764
 -0.17389332739783492
 -0.08539618279155303
 -0.02623372085358877
  0.15195689058137263
 -0.2990085736126067
  0.07175362146091069
 -0.08080225402439178
  0.06870905575500186
 -0.02867758650628316
 -0.3373816797515461
 -0.3104650269566325
 -0.3251057599472511