In [1]:
from lib import data_generation as dg
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import time

## 1. Data

In [2]:
# Load the preprocessed NYC listings table; the CSV's first column is used as the row index.
# NOTE(review): the preview below still shows an 'Unnamed: 0' column. If that is a
# leftover row-id column, it is later passed to SMOTE as a feature -- confirm.
processed_csv = 'input/processed_data_nyc.csv'
data = pd.read_csv(processed_csv, index_col=0)
# Last expression of the cell: renders the first rows as a preview.
data.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,all_year_avail,low_avail,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,40.64749,-73.97237,5.010635,1,9,2762,0.21,6,True,False,...,0,0,0,0,0,0,0,0,1,0
1,40.75362,-73.98377,5.420535,1,45,2976,0.38,2,True,False,...,0,0,0,0,0,0,0,1,0,0
2,40.80902,-73.9419,5.01728,3,0,0,0.0,1,True,False,...,0,0,0,0,0,0,0,0,1,0
3,40.68514,-73.95976,4.49981,1,270,3021,4.64,1,False,False,...,0,0,0,0,0,0,0,1,0,0
4,40.79851,-73.94399,4.394449,10,9,2793,0.1,1,False,True,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# Convert the whole frame (every column, including the boolean flags) into a single
# float32 matrix, which is the input handed to dg.SMOTE in the following cells.
X = data.to_numpy(dtype=np.float32)

## 2. Generating New Data

In [4]:
# Baseline synthetic data set. The third argument is the neighbour count -- the same
# position is swept as `k` in section 2.1.
# NOTE(review): the meaning of the second argument (10) is defined in
# lib.data_generation and is not visible here -- confirm.
baseline_k = 2
newData = dg.SMOTE(X, 10, baseline_k)

In [5]:
# Display the shape of the generated data.
newData.shape

(48768, 240)

In [6]:
# Display the shape of the original feature matrix, for comparison with newData.shape.
X.shape

(48768, 240)

In [7]:
# Score the generated matrix against the original one. X is passed in the first
# (reference) position and newData in the second for every metric.
mse, r2, mae = (
    score(X, newData)
    for score in (mean_squared_error, r2_score, mean_absolute_error)
)

We can test the efficacy of our data generation by comparing the generated data against the original data using the Mean Squared Error (MSE), Mean Absolute Error (MAE), and R2 score. Based on the values below, we can conclude that SMOTE is effective for data generation in this case.

In [8]:
# Report the three baseline scores on a single line.
summary_template = "MSE = {} , R2 = {} , MAE = {}"
print(summary_template.format(mse, r2, mae))

MSE = 0.05238356860289287 , R2 = 0.5411800078836516 , MAE = 0.011673000430802582


### 2.1 Testing different neighbors
We compute the MSE, MAE, and R2 of SMOTE for varying numbers of neighbors to find the ideal value for this parameter.

In [9]:
# Neighbour counts to sweep, and one result list per recorded quantity.
# Entry i of every list corresponds to K_VALUES[i].
K_VALUES = [2,5,10,20,50,100,500]
mse_final, mae_final, r2_final, time_final = [], [], [], []

# NOTE(review): the recorded k=2 metrics from this sweep differ from the baseline
# k=2 run earlier in the notebook, so dg.SMOTE appears to be non-deterministic.
# Consider seeding it -- confirm which RNG lib.data_generation uses.
for k in K_VALUES:
    # Progress line printed before the slow generation call.
    print("K value = {}".format(k))

    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted mid-run.
    start = time.perf_counter()
    newData = dg.SMOTE(X, 10, k)
    elapsed = time.perf_counter() - start  # seconds spent in dg.SMOTE only

    # Score the generated data against the original matrix (scoring is not timed).
    mse = mean_squared_error(X, newData)
    r2 = r2_score(X, newData)
    mae = mean_absolute_error(X, newData)

    print("{} NEIGHBORS:   MSE = {} , R2 = {} , MAE = {} , TIME = {}".format(k, mse, r2, mae, elapsed))
    time_final.append(elapsed)
    mse_final.append(mse)
    mae_final.append(mae)
    r2_final.append(r2)

K value = 2
2 NEIGHBORS:   MSE = 0.08279696124358636 , R2 = 0.5255411067890734 , MAE = 0.011713642304607023 , TIME = 34.20147657394409
K value = 5
5 NEIGHBORS:   MSE = 0.12311483532973032 , R2 = 0.48837568467402953 , MAE = 0.014837647621124854 , TIME = 38.014232873916626
K value = 10
10 NEIGHBORS:   MSE = 0.12874681435420393 , R2 = 0.4605580609674046 , MAE = 0.01725334283321457 , TIME = 40.86274075508118
K value = 20
20 NEIGHBORS:   MSE = 0.24593446051024756 , R2 = 0.4164068974059054 , MAE = 0.020592160992420352 , TIME = 44.444109201431274
K value = 50
50 NEIGHBORS:   MSE = 0.4556967589849847 , R2 = 0.4115256471634011 , MAE = 0.025967329954145423 , TIME = 61.67448449134827
K value = 100
100 NEIGHBORS:   MSE = 0.645009834818471 , R2 = 0.40552103468205236 , MAE = 0.031080037540608767 , TIME = 56.05508589744568
K value = 500
500 NEIGHBORS:   MSE = 2.136651753433136 , R2 = 0.38584294495656896 , MAE = 0.053705073855490316 , TIME = 91.17248511314392


In [10]:
# Assemble the sweep results into one table: one row per neighbour count, with
# columns in the order listed here.
column_names = ['K value', 'MSE', 'MAE', 'R2', 'Time']
column_values = [K_VALUES, mse_final, mae_final, r2_final, time_final]
results_df = pd.DataFrame(dict(zip(column_names, column_values)))

# Persist the table with a header row and without the positional index.
results_df.to_csv('output/results_dataGen.csv', index=False, header=True)