# K-nearest-neighbors price predictions

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing
import utils as u
from scipy import stats
import copy

import matplotlib.pylab as plt
plt.style.use('ggplot')

def root_mean_squared_log_error(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, compared with ``y_true`` position by position.

    Returns
    -------
    float
        The RMSLE. The metric is symmetric in its two arguments.

    Notes
    -----
    Inputs are not validated: values below -1 yield NaN from ``log1p``.
    """
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    # assert (y_true >= 0).all()
    # assert (y_pred >= 0).all()

    # Drop any pandas index before subtracting. Two Series with different indexes
    # would otherwise be aligned by label, pairing the wrong rows or producing NaN
    # that the mean then silently skips.
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)

    log_error = np.log1p(predicted_values) - np.log1p(true_values)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5

# Execute the helper scripts inside this kernel so that the names they define
# become notebook globals.
# NOTE(review): polar_coordinates and KNN_groupKFold are called further down
# without a module prefix and are not defined in this notebook; presumably they
# come from one of these two scripts — confirm which.
%run utils.py
%run ../common_utils.py

In [46]:
# Load the metadata, training data and test data through the project helper
# module imported as `u`.
# NOTE(review): the return types are not visible here; later cells treat
# `train` and `test` as pandas DataFrames — confirm.
metaData = u.getAllMetadata()
train = u.getAllTrainData()
test = u.getAllTestData()

In [47]:
###### ZERO / TINY LIVING AREAS
# A living area below 1 is replaced by the row's total area scaled with the
# dataset-wide ratio mean(area_living) / mean(area_total).
# The ratio is computed once and applied column-wise; the previous version
# recomputed both column means inside a Python-level iterrows() loop for every
# replaced row.
# NOTE(review): a missing (NaN) area_living also fails the `>= 1` comparison and
# is therefore replaced here as well (unchanged behaviour). The regression fill
# below then presumably only sees rows whose area_total is missing too —
# confirm this is intended.
living_to_total_ratio = train["area_living"].mean() / train["area_total"].mean()
train["area_living"] = train["area_living"].where(
    train["area_living"] >= 1,
    train["area_total"] * living_to_total_ratio,
)

###### MISSING VALUES
# u.fillnaReg(frame, predictor_columns, target_column) is a project helper;
# judging by its name and the notes kept elsewhere in this notebook it fills
# missing target values by regressing on the predictors — TODO confirm.

# area_living
train = u.fillnaReg(train, ['area_total'], 'area_living')

# area_kitchen
train = u.fillnaReg(train, ['area_total', 'area_living'], 'area_kitchen')

# ceiling
train = u.fillnaReg(train, ['stories'], 'ceiling')

# train['area_total'] = np.log(train['area_total'])
# train['area_living'] = np.log(train['area_living'])
# train['area_kitchen'] = np.log(train['area_kitchen'])

In [48]:
############## SETTINGS
# Fraction of the training rows held out by train_test_split, and its seed.
test_size = 0.1
random_state_split = 42

# Columns selected as model inputs; the order is the column order of the
# feature frames built from this list.
features = [
    "area_total",
    "ceiling",
    "r",
    "theta",
    "floor",
    "condition",
    "bathrooms_shared",
    "parking",
    "stories",
    "rooms",
    "windows_street",
    "balconies",
    "elevator_without",
    "heating",
    "district",
    "building_id",
]


In [49]:
# Keep only the rows whose price is within one standard deviation of the mean
# price (|z-score| < 1).
train = train[np.abs(stats.zscore(train['price'])) < 1]

# Impute the remaining gaps with per-column means.
# numeric_only=True restricts the means to numeric columns: without it pandas
# 1.x warns on these lines when non-numeric columns are present (see the
# recorded cell output) and pandas >= 2.0 raises TypeError instead.
# Assigning the result rather than using inplace=True avoids mutating the
# filtered selection of the original frame in place.
train = train.fillna(train.mean(numeric_only=True))
test = test.fillna(test.mean(numeric_only=True))

# Project helper; the feature list uses columns "r" and "theta", which are
# presumably added here — TODO confirm.
train, test = polar_coordinates(train, test)

# Result is only referenced by the commented-out line below; the call is kept
# because the helper may also modify `train` — TODO confirm.
values = u.oneHotFeature(metaData, train, 'district')
# features = features + values

# Hold-out split stratified on the rounded natural log of the price.
train_data, test_data = train_test_split(
    train,
    test_size=test_size,
    random_state=random_state_split,
    stratify=np.log(train.price).round(),
)

  train.fillna(train.mean(), inplace = True)
  test.fillna(test.mean(), inplace = True)


In [50]:
# Feature frames and price targets for the hold-out split.
# The feature frames are explicit copies: they are modified in place by the
# scaling step (DataFrame.update), and mutating a bare column selection of
# train_data/test_data makes pandas emit SettingWithCopyWarning.
X_train, y_train = train_data[features].copy(), train_data['price']
X_test, y_test = test_data[features].copy(), test_data['price']

In [51]:
# def runKNearest(X_train, y_train, X_test, y_test, 
#     n_neighbors = 20, weights='uniform', algorithm='ball_tree', leaf_size=30, p=1, metric='minkowski', metric_params=None, n_jobs=None):
#     model = KNeighborsRegressor(
#         n_neighbors=n_neighbors, 
#         weights=weights, 
#         algorithm=algorithm, 
#         leaf_size=leaf_size, 
#         p=p, 
#         metric=metric, 
#         metric_params=metric_params, 
#         n_jobs=n_jobs
#     )
#     model.fit(X_train, y_train)
#     rf_prediction = model.predict(X_test)
#     rf_msle = root_mean_squared_log_error(rf_prediction, y_test)
#     return rf_msle

# def optimizeParam(X_train, y_train, X_test, y_test, variable, start, end, step):
#     x = []
#     y = []
#     for value in range(start, end, step):
#         x.append(value)
#         y.append(runKNearest(X_train, y_train, X_test, y_test, p=value))
#     plt.plot(x, y)
#     plt.xlabel('p parameter values')
#     plt.ylabel('RMLSE')

# def optimizeParam(X_train, y_train, X_test, y_test, variable, values):
#     x = []
#     y = []
#     for value in values:
#         x.append(value)
#         y.append(runKNearest(X_train, y_train, X_test, y_test, algorithm=value))
#     plt.plot(x, y)
#     plt.xlabel('algorithm parameter values')
#     plt.ylabel('RMLSE')

# optimizeParam(X_train, y_train, X_test, y_test, 'algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])

In [52]:
# Ask the project helper which columns are numeric and which are categorical,
# then keep only those that are part of `features`.
# Going through set() means the resulting column order is not tied to the order
# in `features`; the scaled values are written back by column label below, so
# the order does not matter for the update.
# `categorical` is computed but not used in this cell.
nonCategorical, categorical = u.getNonCategoricalAndCategoricalFeatures(metaData)
nonCategorical = list(set(nonCategorical).intersection(features))
categorical = list(set(categorical).intersection(features))

# Only normalize/scale the numerical data. Categorical data is kept as is.
X_train_nonCategorical = X_train[nonCategorical]
X_test_nonCategorical = X_test[nonCategorical]

# Fit the scaler on the training split only and reuse the same fitted scaler
# for the hold-out split.
# Despite the *_labels_scaled names, these arrays hold scaled feature columns,
# not target labels.
std_scale = preprocessing.StandardScaler().fit(X_train_nonCategorical)
train_labels_scaled = std_scale.transform(X_train_nonCategorical)
test_labels_scaled = std_scale.transform(X_test_nonCategorical)

# Wrap the scaled arrays in DataFrames carrying the original index and column
# labels, then overwrite the matching cells of X_train / X_test in place.
training_norm_col = pd.DataFrame(train_labels_scaled, index=X_train_nonCategorical.index, columns=X_train_nonCategorical.columns) 
X_train.update(training_norm_col)

testing_norm_col = pd.DataFrame(test_labels_scaled, index=X_test_nonCategorical.index, columns=X_test_nonCategorical.columns) 
X_test.update(testing_norm_col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [53]:
# Hyper-parameters of the K-nearest-neighbours regressor, gathered in one
# mapping and unpacked into the constructor.
knn_params = {
    'n_neighbors': 20,
    'weights': 'uniform',
    'algorithm': 'ball_tree',
    'leaf_size': 30,
    'p': 1,  # Minkowski metric with p=1
    'metric': 'minkowski',
    'metric_params': None,
    'n_jobs': None,
}
model = KNeighborsRegressor(**knn_params)

In [54]:
# model.fit(X_train, y_train)

# features_temp = copy.deepcopy(features)
# features_temp.append('price')
# score= u.KFoldValidation(model, 5, root_mean_squared_log_error, train_data[features_temp])
# print('Test RMSLE:', score)

# Evaluate the regressor with the project's KNN_groupKFold helper and print the
# per-fold scores together with their average. The leading 5 is presumably the
# number of folds — confirm against the helper.
# NOTE(review): the helper receives train[features] and train['price'], i.e.
# neither the standardised X_train/X_test frames nor only the train_data split.
# Unless KNN_groupKFold scales its inputs internally, the neighbour distances
# are computed on raw feature scales — confirm.
scores, average, best_model, best_index = KNN_groupKFold(5, model, train[features], train['price'])
print(scores, average)

[0.6829584938161837, 0.6882933848716933, 0.7135115193278416, 0.7261623939251126, 0.6291462561180032] 0.6880144096117669


In [55]:
# rf_prediction = model.predict(X_test)
# rf_mse = root_mean_squared_log_error(rf_prediction, y_test)
# print('Test RMSLE:', rf_mse)

In [56]:
# plt.scatter(prediction, y_test, alpha=0.2)
# plt.xlabel('KNN prediction')
# plt.ylabel('Ground Truth')

In [57]:
# ########## Create submission
# import json
# import utils
# # Fit model to the full dataset 

# # # ##### NEW POSITION
# # test_data_mod = test.copy()
# # test_data_mod["latitude"] = test_data_mod["latitude"] -  test_data_mod["latitude"].mean()
# # test_data_mod["longitude"] = test_data_mod["longitude"] -  test_data_mod["longitude"].mean()

# # # Convert to polar coordinates
# # test_data_mod["r"] =  np.sqrt(test_data_mod["latitude"]**2 + test_data_mod["longitude"]**2)
# # test_data_mod["theta"] = np.arctan(test_data_mod["longitude"]/test_data_mod["latitude"])
# # test = test_data_mod
# # test.columns


# ###### MISSING VALUES
# # area_living
# test = utils.fillnaReg(test, ['area_total'], 'area_living')
# # area_kitchen
# test = utils.fillnaReg(test, ['area_total', 'area_living'], 'area_kitchen')
# # ceiling
# test = utils.fillnaReg(test, ['stories'], 'ceiling')
# # # district
# # test = utils.fillnaReg(test, ['r', 'theta'], 'district')


# test.fillna(test.mean(), inplace = True)
# u.oneHotFeature(metaData, test, 'district')


# ############## Balancing prices
# # train = train.fillna(train.mean())
# # test = test.fillna(test.mean())
# # train = train[(np.abs(stats.zscore(train['price'])) < outliers_value)]

# # lower_quantile = 0.01
# # upper_quantile = 0.95
# # train_data_mid = train[(train['price']>=train['price'].quantile(lower_quantile)) & (train['price']<=train['price'].quantile(upper_quantile))]
# # train_data_outside = train[(train['price']<train['price'].quantile(lower_quantile)) | (train['price']>train['price'].quantile(upper_quantile))]

# # train_data_mid = train_data_mid.head(int(train_data_mid.shape[0] * 0.15))
# # train_data_temp = pd.concat([train_data_mid, train_data_outside])

# # train = train_data_temp


# X_train = train[features]
# y_train = train['price']
# print(f'Num nans in train {X_train.isna().any(axis=1).sum()}')
# model = KNeighborsRegressor(
#     n_neighbors=20, 
#     weights='uniform', 
#     algorithm='ball_tree', 
#     leaf_size=30, 
#     p=1, 
#     metric='minkowski', 
#     metric_params=None, 
#     n_jobs=None
# )
# model.fit(X_train, y_train)

# # Generate predictions for test set 
# X_test = test[features]
# X_test_nan = X_test.isna().any(axis=1)
# print(f'Num nans in test: {X_test_nan.sum()}')
# y_test_hat = model.predict(X_test[~X_test_nan])

# # Construct submission dataframe
# submission = pd.DataFrame()
# submission['id'] = test.id
# submission.loc[~X_test_nan, 'price_prediction'] = y_test_hat # Predict on non-nan entries
# submission['price_prediction'].fillna(y_train.mean(), inplace=True) # Fill missing entries with mean predictor
# print(f'Generated {len(submission)} predictions')

# # Export submission to csv with headers

# ############################################ CHANGE NAME !!!!!!!!!!
# submission.to_csv('sample_KNN_3.csv', index=False)
# settings = {
#     'train_MSE': score,
#     'features':features,
#     'nan_delete':'mean',
#     'test_size' : test_size,
#     'random_state_split' :  random_state_split,
#     'n_neighbors':20, 
#     'weights':'uniform', 
#     'algorithm':'ball_tree', 
#     'leaf_size':30, 
#     'p':1, 
#     'metric':'minkowski', 
#     'metric_params':None, 
#     'n_jobs':None,
#     'other':{
#         'Missing values' : {0:'Adding more trees!',
#             "Missing area_living": "reg on area_total", 
#             "Missing area_kitchen": "reg on area_total and area_living", 
#             "Missing ceiling": "reg on stories"
#         }    
#     }
# }

# with open('sample_KNN_3_settings.json', 'w') as fp:
#     json.dump(settings, fp)