In [2]:
# Mount Google Drive at /content/drive; a later cell reads the feature archive
# from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Show the attached GPU, if any (the saved output of this cell reports that no
# NVIDIA driver could be reached).
!nvidia-smi
# Print the CPU description of the host.
!cat /proc/cpuinfo
# Remove the `sample_data` directory from the working directory.
!rm -rf sample_data

# Disabled: clone LightGBM, build it with -DUSE_GPU=1 and install the Python package.
# !git clone --recursive https://github.com/Microsoft/LightGBM.git
# !cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 63
model name	: Intel(R) Xeon(R) CPU @ 2.30GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2299.998
cache size	: 46080 KB
physical id	: 0
siblings	: 4
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds 

In [4]:
!cp /content/drive/MyDrive/0403.zip 0403.zip
!unzip /content/drive/MyDrive/0403.zip

Archive:  /content/drive/MyDrive/0403.zip
replace 0403/5cd56b6be2acfd2d33b59d1f_test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from pathlib import Path
import glob

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import psutil
import random
import os
import time
import sys
import math
from contextlib import contextmanager


# Notebook-wide settings.
SEED = 42       # handed to set_seed() further down in this cell
N_SPLITS = 10   # not referenced by the cells visible in this notebook excerpt

@contextmanager
def timer(name: str):
    """Report elapsed time and memory change of the wrapped block on stderr.

    On exit (including when the block raises) one line is printed with the
    current value of ``memory_info()[0]`` of this process divided by 2**30,
    its signed change since entry, the elapsed wall-clock seconds and ``name``.

    Args:
        name: Label appended to the printed line.
    """
    t0 = time.time()
    process = psutil.Process(os.getpid())
    bytes_per_unit = 2. ** 30

    m0 = process.memory_info()[0] / bytes_per_unit
    try:
        yield
    finally:
        m1 = process.memory_info()[0] / bytes_per_unit
        change = m1 - m0
        # The sign is printed separately, so the magnitude is formatted unsigned.
        if change >= 0:
            sign = '+'
        else:
            sign = '-'
        delta = math.fabs(change)
        print(f"[{m1:.1f}GB({sign}{delta:.1f}GB): {time.time() - t0:.3f}sec] {name}", file=sys.stderr)


def set_seed(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed as a string in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
def comp_metric(xhat, yhat, fhat, x, y, f):
    """Mean position error with a floor-mismatch penalty.

    Per sample the error is the Euclidean distance between the predicted and
    true (x, y) plus 15 times the absolute floor difference; the return value
    is the average of that error over all samples.

    Inputs are compared position by position. They are converted to NumPy
    arrays first because arithmetic between pandas Series aligns on index
    labels, which silently yields NaN (and therefore a wrong score) when
    predictions and targets carry different indexes.

    Args:
        xhat, yhat, fhat: Predicted x, y and floor (array-like, equal length).
        x, y, f: True x, y and floor (array-like, same length as predictions).

    Returns:
        The mean per-sample error as a float.
    """
    xhat, yhat, fhat, x, y, f = (
        np.asarray(values, dtype=float) for values in (xhat, yhat, fhat, x, y, f)
    )
    planar_error = np.sqrt(np.power(xhat - x, 2) + np.power(yhat - y, 2))
    floor_penalty = 15 * np.abs(fhat - f)
    return (planar_error + floor_penalty).sum() / xhat.shape[0]

set_seed(SEED)

# Feature CSVs are looked up one directory level below the working directory;
# both lists are sorted so their order is stable between runs.
feature_dir = './*/'
train_files, test_files = (
    sorted(glob.glob(os.path.join(feature_dir, pattern)))
    for pattern in ('*_train.csv', '*_test.csv')
)

['./0403/5cd56b5ae2acfd2d33b58544_train.csv', './0403/5cd56b5ae2acfd2d33b58546_train.csv', './0403/5cd56b5ae2acfd2d33b58548_train.csv', './0403/5cd56b5ae2acfd2d33b58549_train.csv', './0403/5cd56b5ae2acfd2d33b5854a_train.csv', './0403/5cd56b6ae2acfd2d33b59c90_train.csv', './0403/5cd56b6ae2acfd2d33b59ccb_train.csv', './0403/5cd56b6ae2acfd2d33b59ccc_train.csv', './0403/5cd56b6be2acfd2d33b59d1f_train.csv', './0403/5cd56b6ee2acfd2d33b5a247_train.csv', './0403/5cd56b6fe2acfd2d33b5a386_train.csv', './0403/5cd56b7de2acfd2d33b5c14b_train.csv', './0403/5d27075f03f801723c2e360f_train.csv', './0403/5d27096c03f801723c31e5e0_train.csv', './0403/5d27097f03f801723c320d97_train.csv', './0403/5d27099f03f801723c32511d_train.csv', './0403/5d2709a003f801723c3251bf_train.csv', './0403/5d2709b303f801723c327472_train.csv', './0403/5d2709bb03f801723c32852c_train.csv', './0403/5d2709c303f801723c3299ee_train.csv', './0403/5d2709d403f801723c32bd39_train.csv', './0403/5da1382d4db8ce0c98bbe92e_train.csv', './0403/5

# Baseline 1: Ridge Regression

In [12]:
# Baseline 1: for every building, fit one Ridge regressor per target (x, y,
# floor) on the train file and store the test rows together with the predictions.
TRAIN_SUFFIX, TEST_SUFFIX = '_train.csv', '_test.csv'

# Pair train and test files by building id instead of by list position, so a
# missing or extra file cannot silently pair a building with the wrong test set.
test_by_building = {
  os.path.basename(path)[:-len(TEST_SUFFIX)]: path for path in test_files
}

for train_path in train_files:
  # Building id = file name without directory and suffix; this does not depend
  # on the length of the directory name the way a fixed slice would.
  building_id = os.path.basename(train_path)[:-len(TRAIN_SUFFIX)]
  if building_id not in test_by_building:
    raise FileNotFoundError(f"no test file found for building {building_id!r}")

  # read train and test set
  data = pd.read_csv(train_path, index_col=0)
  test_data = pd.read_csv(test_by_building[building_id], index_col=0)

  # Everything except the last five columns is used as features; the fifth-,
  # fourth- and third-from-last columns are the x, y and floor targets.
  wifi = data.iloc[:, :-5]
  wifi_val = test_data.iloc[:, :-5]
  targets = {
    'pred_x': data.iloc[:, -5],
    'pred_y': data.iloc[:, -4],
    'pred_f': data.iloc[:, -3],
  }

  # fit one model per target and predict on the test features
  predictions = {}
  for pred_col, target in targets.items():
    ridge_model = Ridge(alpha=100, fit_intercept=True)
    ridge_model.fit(wifi, target)
    predictions[pred_col] = ridge_model.predict(wifi_val)

  # Append the predictions after all original columns. (Inserting at a position
  # computed before reset_index() placed them one column too early.)
  test_data = test_data.reset_index()
  for pred_col, values in predictions.items():
    test_data[pred_col] = values

  # save new test file as <building id>.csv in the working directory
  test_data.to_csv(building_id + '.csv')


In [13]:
# Prediction files of the Ridge baseline. The Lasso baseline writes
# `*_clf.csv` files that the plain `*.csv` pattern also matches when they are
# present in /content, so they are filtered out here; sorting makes the order
# independent of the arbitrary order glob returns.
pred_files = sorted(
  path for path in glob.glob('/content' + '/*.csv')
  if not path.endswith('_clf.csv')
)

In [15]:
# Collect the true and predicted x, y and f of every prediction file.
# Each file contributes its last eight columns, which is where the saved
# files are expected to carry x, y, f and the pred_* columns.
# The frames are gathered first and concatenated once: calling pd.concat inside
# the loop copies all previously collected rows again on every iteration.
pred_frames = [pd.read_csv(file_name).iloc[:, -8:] for file_name in pred_files]

# An empty file list keeps the original outcome (an empty DataFrame).
all_pred = pd.concat(pred_frames) if pred_frames else pd.DataFrame()

In [16]:
# Overall comp_metric score for the rows in all_pred: mean (x, y) distance
# error plus 15 times the absolute floor error. This is not a mean squared error.
comp_metric(
  xhat=all_pred['pred_x'],
  yhat=all_pred['pred_y'],
  fhat=all_pred['pred_f'],
  x=all_pred['x'],
  y=all_pred['y'],
  f=all_pred['f'],
)

18.151951182164154

# Baseline 2: Lasso Regression

In [None]:

from sklearn import linear_model

# Baseline 2: for every building, fit one Lasso regressor per target (x, y,
# floor) on the train file and store the test rows together with the predictions.
TRAIN_SUFFIX, TEST_SUFFIX = '_train.csv', '_test.csv'

# Pair train and test files by building id instead of by list position, so a
# missing or extra file cannot silently pair a building with the wrong test set.
test_by_building = {
  os.path.basename(path)[:-len(TEST_SUFFIX)]: path for path in test_files
}

for train_path in train_files:
  # Building id = file name without directory and suffix; this does not depend
  # on the length of the directory name the way a fixed slice would.
  building_id = os.path.basename(train_path)[:-len(TRAIN_SUFFIX)]
  if building_id not in test_by_building:
    raise FileNotFoundError(f"no test file found for building {building_id!r}")

  data = pd.read_csv(train_path, index_col=0)
  test_data = pd.read_csv(test_by_building[building_id], index_col=0)

  # Everything except the last five columns is used as features; the fifth-,
  # fourth- and third-from-last columns are the x, y and floor targets.
  wifi = data.iloc[:, :-5]
  wifi_val = test_data.iloc[:, :-5]
  targets = {
    'pred_x': data.iloc[:, -5],
    'pred_y': data.iloc[:, -4],
    'pred_f': data.iloc[:, -3],
  }

  # fit one lasso model per target and predict on the test features
  predictions = {}
  for pred_col, target in targets.items():
    clf = linear_model.Lasso(alpha=0.1, max_iter=10000)
    clf.fit(wifi, target)
    predictions[pred_col] = clf.predict(wifi_val)

  # Append the predictions after all original columns. (Inserting at a position
  # computed before reset_index() placed them one column too early.)
  test_data = test_data.reset_index()
  for pred_col, values in predictions.items():
    test_data[pred_col] = values

  # save new test file as <building id>_clf.csv in the working directory
  test_data.to_csv(building_id + '_clf.csv')



  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


In [3]:
# Prediction files carrying the `_clf.csv` suffix used by the Lasso baseline.
pred_files = list(glob.iglob('/content' + '/*_clf.csv'))

In [5]:
# Collect the true and predicted x, y and f of every prediction file.
# Each file contributes its last eight columns, which is where the saved
# files are expected to carry x, y, f and the pred_* columns.
# The frames are gathered first and concatenated once: calling pd.concat inside
# the loop copies all previously collected rows again on every iteration.
pred_frames = [pd.read_csv(file_name).iloc[:, -8:] for file_name in pred_files]

# An empty file list keeps the original outcome (an empty DataFrame).
all_pred = pd.concat(pred_frames) if pred_frames else pd.DataFrame()

In [6]:
# Overall comp_metric score for the rows in all_pred: mean (x, y) distance
# error plus 15 times the absolute floor error. This is not a mean squared error.
comp_metric(
  xhat=all_pred['pred_x'],
  yhat=all_pred['pred_y'],
  fhat=all_pred['pred_f'],
  x=all_pred['x'],
  y=all_pred['y'],
  f=all_pred['f'],
)

18.080036187766076