In [1]:
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, HuberRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

from lib import data_generation as dg
from lib import feature_processing as fp
from lib import models

## 1. Data
### 1.0 Loading Data
Here we load the raw data

In [2]:
# Read the raw listings CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('input/AB_NYC_2019.csv')
# Show the first rows as the cell output for a quick look at the raw columns.
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


### 1.1 Processing Data
The data is processed as explained in the data processing notebook of this project. The process function in feature_processing library is written based on that notebook.

In [3]:
# Run the project's feature-processing routine (lib.feature_processing.process) on the raw frame.
# NOTE(review): the saved output of this cell shows a pandas SettingWithCopyWarning raised during
# this call - presumably from a chained assignment inside fp.process; confirm and fix it there.
processed_data = fp.process(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [4]:
# Preview the processed frame; `price` is still a column at this point.
processed_data.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,all_year_avail,low_avail,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,40.64749,-73.97237,5.010635,1,9,2762,0.21,6,True,False,...,0,0,0,0,0,0,0,0,1,0
1,40.75362,-73.98377,5.420535,1,45,2976,0.38,2,True,False,...,0,0,0,0,0,0,0,1,0,0
2,40.80902,-73.9419,5.01728,3,0,0,0.0,1,True,False,...,0,0,0,0,0,0,0,0,1,0
3,40.68514,-73.95976,4.49981,1,270,3021,4.64,1,False,False,...,0,0,0,0,0,0,0,1,0,0
4,40.79851,-73.94399,4.394449,10,9,2793,0.1,1,False,True,...,0,0,0,0,0,0,0,1,0,0


### 1.2 Split Data
We split the data 80-20 into training and testing data

In [5]:
# Separate the target from the features WITHOUT rebinding `processed_data`.
# Overwriting the frame with the `price` column removed made this cell fail on
# a second run (no `price` left to read), so it was not safe to re-execute.
y = np.asarray(processed_data['price']).ravel()
X = np.asarray(processed_data.drop(columns=['price']))

# Fixed seed keeps the 80/20 split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training Dataset: {}".format(X_train.shape))
print("Testing Dataset: {}".format(X_test.shape))

Training Dataset: (39014, 239)
Testing Dataset: (9754, 239)


### 1.3 Scaling Data
The final step in data processing is scaling all features with the robust scaler. However, the scaler is applied only after the new data has been generated, so here we just create it.

In [6]:
# Single notebook-wide scaler instance. It is only created here; getMinority, testModel and
# the final cells each refit it (fit_transform) on whatever training data they are given.
scaler = RobustScaler()

## 2. ML Models
In this project we tested multiple regressors at varying parameters. We concluded that the ideal model parameters were as follows for the top 3 regressors:

1) Random Forest: Number of estimators = 50  
2) Ridge Regression: Alpha = 5  
3) Huber Regression: Alpha = 10, Epsilon = 3  

As such, we will define our final models:

In [7]:
# Final hyper-parameters chosen for the three regressors listed above.
# The forest is seeded so repeated runs of the notebook give the same scores;
# Ridge and Huber are constructed exactly as before.
randomForest_final = RandomForestRegressor(n_estimators=50, random_state=42)
ridge_final = Ridge(alpha=5)
huber_final = HuberRegressor(alpha=10, epsilon=3)

In [8]:
# Build the neural-network regressor. `models` is imported together with the
# other project modules in the notebook's import cell.
# NOTE(review): X_train is passed unscaled - presumably only to size the input
# layer; confirm in lib.models.buildNN.
nn_model = models.buildNN(X_train)

### 2.1 Unit Testing
We perform unit testing to check which samples perform the worst.

In [9]:
def unitTest(model, sampleX, sampleY):
    """Return the absolute prediction error of ``model`` on the given sample(s).

    Parameters
    ----------
    model : object exposing ``predict``
        Already-fitted estimator.
    sampleX : features handed straight to ``model.predict``.
    sampleY : true target value(s) for ``sampleX``.

    Returns
    -------
    ``abs(sampleY - model.predict(sampleX))``.
    """
    prediction = model.predict(sampleX)
    residual = sampleY - prediction
    return abs(residual)

In [10]:
def getMinority(model, X, y, numFolds = 5, percent = 0.1):
    """Collect the samples that ``model`` predicts worst under K-fold cross-validation.

    For every fold the model is fitted on the remaining folds (after scaling
    with the notebook-level ``scaler``) and the held-out samples with the
    largest absolute error are kept.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Refitted once per fold; on return it is left fitted on the last fold.
    X : ndarray of shape (n_samples, n_features)
        Unscaled features.
    y : ndarray of shape (n_samples,)
        Targets.
    numFolds : int, default 5
        Number of cross-validation folds.
    percent : float, default 0.1
        Fraction of ``X`` to return in total; each fold contributes
        ``int(percent * n_samples / numFolds)`` samples.

    Returns
    -------
    X_minority, y_minority : ndarray
        Unscaled rows of ``X`` and matching entries of ``y``, concatenated
        fold by fold, each fold ordered by increasing error.

    Notes
    -----
    * The module-level ``scaler`` is refitted on every fold as a side effect.
    * When ``percent`` is small enough that the per-fold count rounds down to
      zero, nothing is selected and empty arrays are returned. The previous
      ``[-0:]`` slice silently selected every held-out sample in that case.
    """
    # shuffle / random_state are passed by keyword: current scikit-learn
    # releases no longer accept them positionally.
    cv = KFold(n_splits=numFolds, shuffle=True, random_state=1)

    # How many of the worst-predicted samples to keep from each fold.
    numSamples = int((percent * X.shape[0]) / numFolds)

    X_parts = []
    y_parts = []

    # Test the model on each chunk after training on the other chunks.
    for train, test in cv.split(X):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

        # Scale with statistics from the training folds only.
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model.fit(X_train_scaled, y_train)

        # One batched predict per fold instead of one call per sample.
        # Flattening guards against models that return a column vector,
        # which would otherwise broadcast against the 1-D targets.
        y_pred = np.asarray(model.predict(X_test_scaled)).reshape(-1)
        abs_err = np.abs(y_test - y_pred)

        # Stable ascending sort, then keep the tail (largest errors). The
        # explicit start index makes a zero count select nothing.
        order = np.argsort(abs_err, kind="stable")
        start = max(len(order) - numSamples, 0)
        highest_diff = order[start:]

        # Store the unscaled rows so callers get data in the original units.
        X_parts.append(X_test[highest_diff])
        y_parts.append(y_test[highest_diff])

    X_minority = np.concatenate(X_parts, axis=0)
    y_minority = np.concatenate(y_parts, axis=0)
    return X_minority, y_minority

In [11]:
# X_m, y_m = getMinority(randomForest_final, X_train, y_train, percent=0.01)
# print("Minority data acquired! X-shape = {}  ,  y-shape = {} ".format(X_m.shape, y_m.shape))

## 3. Generating Additional Data
We use the Synthetic Minority Oversampling Technique (SMOTE) to generate additional data from the training set.

In [12]:
# minority_data = np.concatenate( (X_m, y_m.reshape(-1,1)), axis = 1)
# print("Minority Data Shape: {}".format(minority_data.shape))

In [13]:
def genData(minority_data, N, k=2):
    """Generate ``N`` batches of synthetic samples from ``minority_data``.

    Each batch is one call to ``dg.SMOTE(minority_data, 100, k)`` and the
    batches are stacked row-wise.

    Parameters
    ----------
    minority_data : ndarray
        Rows to oversample (in this notebook: features with the target
        appended as the last column).
    N : int
        Number of batches. At least one batch is always produced, which
        matches the previous behaviour for ``N <= 1``.
    k : int, default 2
        Forwarded to ``dg.SMOTE`` as its third argument for every batch.
        Previously the first batch ignored ``k`` and always used 2.

    Returns
    -------
    ndarray
        All generated batches concatenated along axis 0.

    Notes
    -----
    The MSE / R2 / MAE values that used to be computed for every batch were
    never used (their print statements were commented out) and are no longer
    computed.
    NOTE(review): the literal 100 is presumably SMOTE's oversampling
    percentage and ``k`` its neighbour count - confirm in lib.data_generation.
    """
    rounds = max(N, 1)
    batches = [dg.SMOTE(minority_data, 100, k) for _ in range(rounds)]
    return np.concatenate(batches, axis=0)

In [14]:
# newData = genData(minority_data, 20, k= 32)

As we can see from above, we created a new dataset that is similar to the minority set!

In [15]:
def combineData(X_train, y_train, newData):
    """Append generated samples to an existing training set.

    Parameters
    ----------
    X_train : ndarray of shape (n_samples, n_features)
        Existing training features.
    y_train : ndarray of shape (n_samples,)
        Existing training targets.
    newData : ndarray of shape (n_new, n_features + 1)
        Generated rows whose last column is the target.

    Returns
    -------
    combined_X_train, combined_y_train : ndarray
        The original data followed by the generated rows.
    """
    # The last column carries the target; everything before it is a feature.
    synthetic_X = newData[:, :-1]
    synthetic_y = newData[:, -1]

    combined_X_train = np.concatenate([X_train, synthetic_X], axis=0)
    combined_y_train = np.concatenate([y_train, synthetic_y], axis=0)
    return combined_X_train, combined_y_train

In [16]:
# X_train_new, y_train_new = combineData(X_train, y_train, newData)

## 4. ML Models With New Data

### 4.2 Creating Test Function

In [17]:
def testModel(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on scaled training data and report its test-set MSE.

    The notebook-level ``scaler`` is refitted on ``X_train`` and then applied
    to ``X_test``. The score is printed and also returned.

    Returns
    -------
    The value of ``mean_squared_error(y_test, predictions)``.
    """
    # Scaling statistics come from the training data only.
    train_features = scaler.fit_transform(X_train)
    test_features = scaler.transform(X_test)

    model.fit(train_features, y_train)
    predictions = model.predict(test_features)

    result = mean_squared_error(y_test, predictions)
    print("Result: {}".format(result))
    return result

In [18]:
# result = testModel( nn_model , X_train_new, y_train_new, X_test, y_test)

The results are better with more minority data in the training set!

### 4.3 Testing data generation with different parameters
Previously, we generated 3 synthetic sets similar to the 10% minority (the 10% of training samples with the worst prediction performance). Now we repeat the experiment with different minority percentages and different numbers of synthetic sets (instead of 10% and 3, respectively).

In [19]:
# percentages = [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]
# N_values = [1,2,3,5]
# K_values = [2,4,8,16]
# model = randomForest_final

# mse_final = []
# p_final = []
# n_final = []
# k_final = []

# for p in percentages:
#     for n in N_values:
#         for k in K_values:
#             print("P = {} , N = {} , K = {}".format(p,n,k))
#             X_m, y_m = getMinority(model, X_train, y_train, percent = p)
#             minority_data = np.concatenate( (X_m, y_m.reshape(-1,1)), axis = 1)
#             newData = genData(minority_data, n, k=k)
#             X_train_new, y_train_new = combineData(X_train, y_train, newData)


#             rf_result = testModel( randomForest_final , X_train_new, y_train_new, X_test, y_test)
#             p_final.append(p)
#             n_final.append(n)
#             k_final.append(k)
#             mse_final.append(rf_result)

In [20]:
# finalResult_df = pd.DataFrame({
#     'Percentage': p_final,
#     'Num of extra sets': n_final,
#     'Neighbors': k_final,
#     'MSE': mse_final,
# })

# finalResult_df.to_csv('output/results_dataGen_pnk_nn.csv', index = False, header=True)

In [21]:
# finalResult_df = finalResult_df.sort_values('MSE')

In [22]:
# finalResult_df.head(10)

### 4.4 Cross validating with new data
Here we will cross-validate our ML model, a Random Forest Regressor, with a varying number of estimators (the main parameter of this model).

In [23]:
# P_VALUES = [0.00001, 0.00005 ,0.0001, 0.0005, 0.001, 0.01]
# # NUM_ESTIMATORS = [10, 50, 100, 200]
# N_VALUES = [1,2,3]

# n_final = []
# estimators_final = []
# mse_final = []
# p_final = []

# for p in P_VALUES:
#     for n in N_VALUES:
#         model = RandomForestRegressor(n_estimators=50)

#         X_m, y_m = getMinority(model, X_train, y_train, percent = p)
#         minority_data = np.concatenate( (X_m, y_m.reshape(-1,1)), axis = 1)
#         newData = genData(minority_data, n)
#         X_train_new, y_train_new = combineData(X_train, y_train, newData)

#         X_train_scaled = scaler.fit_transform(X_train_new)
#         X_test_scaled = scaler.transform(X_test)

#         model.fit(X_train_scaled,y_train_new)
#         out_y1 = model.predict(X_test_scaled)
#         mse = mean_squared_error(y_test, out_y1)
#         print("N = {} , MSE = {}".format(n, mse))

#         mse_final.append(mse)
#         n_final.append(n)
#         p_final.append(p)



N = 1 , MSE = 0.17399852388525613




N = 2 , MSE = 0.17666396908206664




N = 3 , MSE = 0.17814632948167702




N = 5 , MSE = 0.18196545935213781




N = 1 , MSE = 0.1744761397771328




N = 2 , MSE = 0.1766864093994036




N = 3 , MSE = 0.17856671387313133




N = 5 , MSE = 0.1819717142366814




N = 1 , MSE = 0.1746616426712798




KeyboardInterrupt: 

In [None]:
# df = pd.DataFrame({
#     'Estimators': n_final,
#     'Percent': p_final,
#     'MSE': mse_final,
# })

# df.to_csv('output/results_rf_cv_SMOTE.csv', index = False, header=True)

## 5. Final Model

Here we will assemble everything to make our final model. We will generate data, train our model, test it, and show the results.

In [None]:
# Seed the forest: the MSE values logged earlier in this notebook differ only
# in the third decimal between settings, so an unseeded forest could hide or
# fake an improvement through run-to-run noise.
model = RandomForestRegressor(n_estimators=50, random_state=42)
p = 0.005  # fraction of the training set to flag as worst-predicted
n = 2      # number of synthetic batches generated from that minority set

# 1) find the hardest samples, 2) synthesise look-alikes, 3) append them to the training set.
X_m, y_m = getMinority(model, X_train, y_train, percent = p)
minority_data = np.concatenate( (X_m, y_m.reshape(-1,1)), axis = 1)
newData = genData(minority_data, n)
X_train_new, y_train_new = combineData(X_train, y_train, newData)

# Fit the scaler on the augmented training set; the test set is only transformed.
X_train_scaled = scaler.fit_transform(X_train_new)
X_test_scaled = scaler.transform(X_test)

model.fit(X_train_scaled,y_train_new)

out_y1 = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, out_y1)
print("MSE = {}".format(mse))



Now we compare the above results to the same model but without generating new data!

In [None]:
# Baseline for comparison: refit the same `model` object on the original
# (un-augmented) training set and score it on the same test set.
# The shared scaler is refitted here, so after this cell it reflects X_train only.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# fit() retrains from scratch; X_train_scaled, X_test_scaled, out_y1 and mse
# from the previous cell are overwritten below.
model.fit(X_train_scaled,y_train)

out_y1 = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, out_y1)
print("MSE = {}".format(mse))