In [1]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

from lib import data_generation as dg

## 1. Data

In [2]:
# Load the processed NYC dataset; the first CSV column is used as the row index.
data = pd.read_csv('input/processed_data_nyc.csv', index_col = 0)
# Preview the first rows (last expression, so the notebook renders it as a table).
data.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,all_year_avail,low_avail,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,40.64749,-73.97237,5.010635,1,9,2762,0.21,6,True,False,...,0,0,0,0,0,0,0,0,1,0
1,40.75362,-73.98377,5.420535,1,45,2976,0.38,2,True,False,...,0,0,0,0,0,0,0,1,0,0
2,40.80902,-73.9419,5.01728,3,0,0,0.0,1,True,False,...,0,0,0,0,0,0,0,0,1,0
3,40.68514,-73.95976,4.49981,1,270,3021,4.64,1,False,False,...,0,0,0,0,0,0,0,1,0,0
4,40.79851,-73.94399,4.394449,10,9,2793,0.1,1,False,True,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# Separate the target (`price`) from the predictors WITHOUT overwriting `data`.
# Reassigning `data` to the price-less frame made this cell non-idempotent:
# running it a second time failed because `price` had already been dropped.
features = data.drop(columns=['price'])

# Feature matrix as float32 and the target as a flat 1-D array.
X = np.asarray(features).astype(np.float32)
y = np.asarray(data['price']).ravel()

In [4]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each feature matrix, one per line.
print(
    "Training Dataset: {}".format(X_train.shape),
    "Testing Dataset: {}".format(X_test.shape),
    sep="\n",
)

Training Dataset: (39014, 239)
Testing Dataset: (9754, 239)


## 2. Generating New Data

In [5]:
# Generate synthetic data from the training features with the project's SMOTE helper.
# NOTE(review): the meaning of the positional arguments 10 and 2 is not visible in this
# notebook; the sweep further down varies the third argument as the number of
# neighbors, so 2 is presumably k here -- confirm against lib.data_generation.
# NOTE(review): no random seed is set anywhere in this notebook; if SMOTE samples
# randomly, the reported metrics will differ between runs -- confirm.
newData = dg.SMOTE(X_train, 10, 2)

In [6]:
# Display the shape of the generated data; it is compared against X_train element-wise below.
newData.shape

(39014, 239)

In [7]:
# Display the shape of the original training features for comparison with the generated data.
X_train.shape

(39014, 239)

In [8]:
# Score the generated matrix against the original training matrix, treating
# X_train as the reference ("true") values and newData as the "predictions".
# The metrics are evaluated in the order MSE, R2, MAE.
mse, r2, mae = (
    metric(X_train, newData)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

We can test the efficacy of our data generation by comparing the generated data with the original training data using the Mean-Squared Error (MSE), Mean-Absolute Error (MAE), and R2 score. Based on the values below, we can conclude that SMOTE is effective for data generation in this case.

In [9]:
# Print the three similarity metrics computed for the generated data on a single line.
print("MSE = {} , R2 = {} , MAE = {}".format(mse,r2,mae))

MSE = 0.05417200204154522 , R2 = 0.5278008263371884 , MAE = 0.011376256305893532


### 2.1 Testing different neighbors
We test the MSE, MAE, and R2 of SMOTE performed with varying numbers of neighbors to find the ideal value for this parameter.

In [10]:
# Neighbor counts to sweep. The four parallel lists collect one entry per k
# and are later assembled into the results table.
K_VALUES = [2,5,10,20,50,100,500]
mse_final, mae_final, r2_final, time_final = [], [], [], []

for k in K_VALUES:
    print("K value = {}".format(k))

    # Time only the generation step. perf_counter() is a monotonic,
    # high-resolution clock intended for measuring intervals, whereas
    # time.time() follows the wall clock and can jump if the system time is
    # adjusted during these long-running calls.
    start = time.perf_counter()
    newData = dg.SMOTE(X_train, 10, k)
    elapsed = time.perf_counter() - start

    # Compare the generated data with the original training features.
    mse = mean_squared_error(X_train, newData)
    r2 = r2_score(X_train, newData)
    mae = mean_absolute_error(X_train, newData)

    print("{} NEIGHBORS:   MSE = {} , R2 = {} , MAE = {} , TIME = {}".format(k,mse,r2,mae, elapsed))
    time_final.append(elapsed)
    mse_final.append(mse)
    mae_final.append(mae)
    r2_final.append(r2)

K value = 2
2 NEIGHBORS:   MSE = 0.07284620260397871 , R2 = 0.5243498028269686 , MAE = 0.011478812130499493 , TIME = 30.797602891921997
K value = 5
5 NEIGHBORS:   MSE = 0.16490514105597112 , R2 = 0.479286713035328 , MAE = 0.01456963516293914 , TIME = 33.81038427352905
K value = 10
10 NEIGHBORS:   MSE = 0.1992074738638193 , R2 = 0.4562618966628654 , MAE = 0.017554076850472236 , TIME = 36.73320388793945
K value = 20
20 NEIGHBORS:   MSE = 0.23847795640794267 , R2 = 0.445788637757811 , MAE = 0.02071181933573219 , TIME = 39.570953130722046
K value = 50
50 NEIGHBORS:   MSE = 0.4507133177310279 , R2 = 0.4125399514967037 , MAE = 0.026514211997243214 , TIME = 44.598819732666016
K value = 100
100 NEIGHBORS:   MSE = 0.7514523944054016 , R2 = 0.4060930324504034 , MAE = 0.03219299347480549 , TIME = 49.80815529823303
K value = 500
500 NEIGHBORS:   MSE = 2.792690376450135 , R2 = 0.37991967319233616 , MAE = 0.05727190547830239 , TIME = 77.48762488365173


In [11]:
# Assemble the per-k metrics and timings from the neighbor sweep into one table,
# one row per K value.
results_df = pd.DataFrame({
    'K value': K_VALUES,
    'MSE': mse_final,
    'MAE': mae_final,
    'R2': r2_final,
    'Time': time_final
})

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists; otherwise the export fails on a fresh checkout after the
# long sweep has already run.
output_path = 'output/results_dataGen.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
results_df.to_csv(output_path, index = False, header=True)