In [1]:
# Attach Google Drive to this runtime; follow the on-screen instructions
# that the mount call shows.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# run this cell directly if the file is stored in your drive
# otherwise, rewrite the directory and/or file name
!cp /content/drive/MyDrive/0403.zip 0403.zip
!unzip /content/drive/MyDrive/0403.zip

Archive:  /content/drive/MyDrive/0403.zip
  inflating: 0403/5cd56b6ee2acfd2d33b5a247_test.csv  
  inflating: 0403/5cd56b7de2acfd2d33b5c14b_train.csv  
  inflating: 0403/5cd56b6ae2acfd2d33b59c90_test.csv  
  inflating: 0403/5cd56b7de2acfd2d33b5c14b_test.csv  
  inflating: 0403/5cd56b6ee2acfd2d33b5a247_train.csv  
  inflating: 0403/5cd56b6ae2acfd2d33b59c90_train.csv  
  inflating: 0403/5cd56b6fe2acfd2d33b5a386_test.csv  
  inflating: 0403/5cd56b6ae2acfd2d33b59ccc_test.csv  
  inflating: 0403/5cd56b5ae2acfd2d33b58546_train.csv  
  inflating: 0403/5cd56b5ae2acfd2d33b58549_test.csv  
  inflating: 0403/5cd56b5ae2acfd2d33b58549_train.csv  
  inflating: 0403/5cd56b6fe2acfd2d33b5a386_train.csv  
  inflating: 0403/5cd56b6ae2acfd2d33b59ccc_train.csv  
  inflating: 0403/5cd56b5ae2acfd2d33b58546_test.csv  
  inflating: 0403/5da1383b4db8ce0c98bc11ab_test.csv  
  inflating: 0403/5d27099f03f801723c32511d_test.csv  
  inflating: 0403/5d27096c03f801723c31e5e0_test.csv  
  inflating: 0403/5d27097f03f8017

In [10]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from pathlib import Path
import glob

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import psutil
import random
import os
import time
import sys
import math
from contextlib import contextmanager

# set random seed
SEED = 42

def set_seed(seed=42):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and exports ``PYTHONHASHSEED`` to the environment.

    This is a plain function: call it once, e.g. ``set_seed(SEED)``.

    Parameters
    ----------
    seed : int, default 42
        Value passed to every seeding call.

    Returns
    -------
    None
    """
    random.seed(seed)
    # Hash randomisation is fixed when an interpreter starts, so this only
    # takes effect for processes launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

# Evaluation metric: mean positioning error (not a squared error)
def comp_metric(xhat, yhat, fhat, x, y, f):
    """Return the mean per-sample positioning error.

    For each sample the error is the Euclidean distance between the predicted
    and true (x, y) position plus 15 times the absolute floor difference.
    The per-sample errors are summed and divided by ``xhat.shape[0]``.

    Parameters
    ----------
    xhat, yhat, fhat : array-like
        Predicted x, y and floor values.
    x, y, f : array-like
        True x, y and floor values.
    """
    dx_squared = np.power(xhat - x, 2)
    dy_squared = np.power(yhat - y, 2)
    planar_error = np.sqrt(dx_squared + dy_squared)
    floor_penalty = 15 * np.abs(fhat - f)
    per_sample_error = planar_error + floor_penalty
    n_samples = xhat.shape[0]
    return per_sample_error.sum() / n_samples

# Seed the global random number generators before anything else runs.
set_seed(SEED)

# Locate the per-building training and test CSVs.
# Both listings are sorted so their order is deterministic.
feature_dir = '/content/0403/'
train_pattern = os.path.join(feature_dir, '*_train.csv')
test_pattern = os.path.join(feature_dir, '*_test.csv')

train_files = glob.glob(train_pattern)
train_files.sort()
test_files = glob.glob(test_pattern)
test_files.sort()

# Baseline 1: Ridge Regression

In [13]:
# Training files are globbed as '<building id>_train.csv'.
TRAIN_SUFFIX = '_train.csv'

for file in train_files:
  # Building id = file name without its directory and without the train suffix.
  building_id = os.path.basename(file)[:-len(TRAIN_SUFFIX)]
  # Pair the test file by name, not by list position, so a missing file
  # raises here instead of silently evaluating on another building.
  test_file = os.path.join(os.path.dirname(file), building_id + '_test.csv')

  # read train and test set
  data = pd.read_csv(file, index_col=0)
  test_data = pd.read_csv(test_file, index_col=0)

  # Training features are every column except the last five;
  # the x, y and floor targets sit at positions -5, -4 and -3.
  wifi = data.iloc[:, :-5]
  x = data.iloc[:, -5]
  y = data.iloc[:, -4]
  floor = data.iloc[:, -3]

  # Test features use the same column layout.
  wifi_val = test_data.iloc[:, :-5]

  # fit and predict x
  ridge_model_x = Ridge(alpha=100, fit_intercept=True)
  ridge_model_x.fit(wifi, x)
  x_pred = ridge_model_x.predict(wifi_val)

  # fit and predict y
  ridge_model_y = Ridge(alpha=100, fit_intercept=True)
  ridge_model_y.fit(wifi, y)
  y_pred = ridge_model_y.predict(wifi_val)

  # fit and predict f
  # NOTE: floor is fitted as a regression target, so pred_f is continuous
  # and is stored without rounding.
  ridge_model_f = Ridge(alpha=100, fit_intercept=True)
  ridge_model_f.fit(wifi, floor)
  f_pred = ridge_model_f.predict(wifi_val)

  # column_num is taken BEFORE reset_index() adds the index as a column, so
  # the predictions are inserted just before the last original column.
  # The aggregation cell relies on this layout when it keeps the last
  # 8 columns of each saved file.
  column_num = len(test_data.columns)
  test_data = test_data.reset_index()

  # insert columns for predicted x, y and f in test file
  test_data.insert(column_num, 'pred_x', x_pred, True)
  test_data.insert(column_num+1, 'pred_y', y_pred, True)
  test_data.insert(column_num+2, 'pred_f', f_pred, True)

  building_name = building_id + '_ridgebaseline.csv'

  # save new test file (relative path: written to the current directory)
  test_data.to_csv(building_name)


In [14]:
# Collect the saved Ridge prediction files. glob returns them in arbitrary
# order, so sort to make the aggregation order reproducible.
pred_files = sorted(glob.glob('/content' + '/*_ridgebaseline.csv'))

In [15]:
# extract all true and predicted x, y and f
# Keep only the last 8 columns of each saved file; the metric cell reads
# x, y, f and pred_x, pred_y, pred_f from them.
# The frames are gathered first and concatenated once, which avoids
# re-copying the accumulated frame on every iteration.
pred_frames = [pd.read_csv(file_name).iloc[:, -8:] for file_name in pred_files]

# With no prediction files, fall back to an empty frame.
all_pred = pd.concat(pred_frames) if pred_frames else pd.DataFrame()

In [16]:
# Overall mean positioning error across all buildings
# (xy distance plus 15 * |floor error|, averaged over samples).
comp_metric(
    xhat=all_pred['pred_x'],
    yhat=all_pred['pred_y'],
    fhat=all_pred['pred_f'],
    x=all_pred['x'],
    y=all_pred['y'],
    f=all_pred['f'],
)

18.151951182164154

# Baseline 2: Lasso Regression

In [18]:

from sklearn import linear_model

# Training files are globbed as '<building id>_train.csv'.
TRAIN_SUFFIX = '_train.csv'

for file in train_files:
  # Building id = file name without its directory and without the train suffix.
  building_id = os.path.basename(file)[:-len(TRAIN_SUFFIX)]
  # Pair the test file by name, not by list position, so a missing file
  # raises here instead of silently evaluating on another building.
  test_file = os.path.join(os.path.dirname(file), building_id + '_test.csv')

  data = pd.read_csv(file, index_col=0)
  test_data = pd.read_csv(test_file, index_col=0)

  # Training features are every column except the last five;
  # the x, y and floor targets sit at positions -5, -4 and -3.
  wifi = data.iloc[:, :-5]
  x = data.iloc[:, -5]
  y = data.iloc[:, -4]
  floor = data.iloc[:, -3]

  # Test features use the same column layout.
  wifi_val = test_data.iloc[:, :-5]

  # initiate three lasso models, one per target
  clf_x = linear_model.Lasso(alpha=0.1, max_iter = 10000)
  clf_y = linear_model.Lasso(alpha=0.1, max_iter = 10000)
  clf_f = linear_model.Lasso(alpha=0.1, max_iter = 10000)

  clf_x.fit(wifi, x)
  clf_y.fit(wifi, y)
  # NOTE: floor is fitted as a regression target, so pred_f is continuous
  # and is stored without rounding.
  clf_f.fit(wifi, floor)

  x_pred = clf_x.predict(wifi_val)
  y_pred = clf_y.predict(wifi_val)
  f_pred = clf_f.predict(wifi_val)

  # column_num is taken BEFORE reset_index() adds the index as a column, so
  # the predictions are inserted just before the last original column.
  # The aggregation cell relies on this layout when it keeps the last
  # 8 columns of each saved file.
  column_num = len(test_data.columns)
  test_data = test_data.reset_index()

  # insert columns for predicted x, y and f in test file
  test_data.insert(column_num, 'pred_x', x_pred, True)
  test_data.insert(column_num+1, 'pred_y', y_pred, True)
  test_data.insert(column_num+2, 'pred_f', f_pred, True)

  building_name = building_id + '_clf.csv'

  # save new test file (relative path: written to the current directory)
  test_data.to_csv(building_name)



  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


In [19]:
# Collect the saved Lasso prediction files. glob returns them in arbitrary
# order, so sort to make the aggregation order reproducible.
pred_files = sorted(glob.glob('/content' + '/*_clf.csv'))

In [20]:
# extract all true and predicted x, y and f
# Keep only the last 8 columns of each saved file; the metric cell reads
# x, y, f and pred_x, pred_y, pred_f from them.
# The frames are gathered first and concatenated once, which avoids
# re-copying the accumulated frame on every iteration.
pred_frames = [pd.read_csv(file_name).iloc[:, -8:] for file_name in pred_files]

# With no prediction files, fall back to an empty frame.
all_pred = pd.concat(pred_frames) if pred_frames else pd.DataFrame()

In [21]:
# Overall mean positioning error across all buildings
# (xy distance plus 15 * |floor error|, averaged over samples).
comp_metric(
    xhat=all_pred['pred_x'],
    yhat=all_pred['pred_y'],
    fhat=all_pred['pred_f'],
    x=all_pred['x'],
    y=all_pred['y'],
    f=all_pred['f'],
)

18.088156923209702