In [1]:
import pandas as pd

In [2]:
DATA_DIR = './data/'
EXCHANGE_RATE = 6.784829586

In [3]:
df = pd.read_csv('{}Train_Data.csv'.format(DATA_DIR))
df.head()

Unnamed: 0,date,number of rooms,security level of the community,residence space,building space,noise level,waterfront,view,air quality level,aboveground space,...,building year,decoration year,district,city,zip code,region,exchange rate,unit price of residence space,unit price of building space,total cost
0,2014/5/2 0:00,5,2.5,2820,67518,2.0,0,0,3,2820,...,1979,2014,23525 SE 32nd Way,Issaquah,WA 98029,USA,6.78483,11.886409,0.977028,
1,2014/5/2 0:00,2,1.0,1210,9400,1.0,0,0,2,1210,...,1949,0,7542 21st Ave SW,Seattle,WA 98106,USA,6.243129,25.681414,0.698603,
2,2014/5/2 0:00,4,2.5,2200,9397,2.0,0,0,3,2200,...,1987,2000,5214 S 292nd St,Auburn,WA 98001,USA,6.010127,16.921174,0.238882,
3,2014/5/2 0:00,3,1.5,1200,9720,1.0,0,0,4,1200,...,1965,0,14034 SE 201st St,Kent,WA 98042,USA,7.560375,19.63223,0.876178,
4,2014/5/2 0:00,3,1.75,1370,5858,1.0,0,0,3,1370,...,1987,2000,1605 S 245th Pl,Des Moines,WA 98198,USA,6.543941,28.20593,0.132472,


# split dataset

In [4]:
data = df.to_numpy().tolist()
print('{}, {}'.format(len(data), len(data[0])))

4000, 21


In [5]:
size = len(data)

In [6]:
NUM_MAPPER = 5
NUM_REDUCER = 2
CHUNK_SIZE = size // NUM_MAPPER

In [7]:
chunks = [data[i: i+CHUNK_SIZE] for i in range(0, size, CHUNK_SIZE)]
print('{}, {}'.format(len(chunks), len(chunks[0])))

5, 800


# define map & reduce

In [8]:
def mapper(k, v):
    ret = list()
    for i,r in enumerate(v):
        key = k*CHUNK_SIZE+i
        ret.append((key, EXCHANGE_RATE * (r[3]*r[18])))    # total cost in residence
        ret.append((key, EXCHANGE_RATE * (r[4]*r[19])))    # total cost in building
    return ret


def reducer(k, v):
    return k, sum(v)    # total cost of residence and building



In [9]:
middle_result = [mapper(k,v) for k,v in enumerate(chunks)]
middle_result

[[(0, 227425.27644498547),
  (0, 447574.72365968407),
  (1, 210835.26078571047),
  (1, 44555.09880684265),
  (2, 252576.02305391122),
  (2, 15230.384527372828),
  (3, 159841.60266454972),
  (3, 57782.676268919684),
  (4, 262180.222583764),
  (4, 5265.165466839717),
  (5, 260736.02935725008),
  (5, 3141.6391173255233),
  (6, 174055.69237865944),
  (6, 65268.78057074442),
  (7, 239269.6192545785),
  (7, 14196.193404418973),
  (8, 236467.05059779919),
  (8, 12537.821654695199),
  (9, 264348.38057683216),
  (9, 641.237267112973),
  (10, 270587.6810855846),
  (10, 23893.33174567122),
  (11, 333588.16858434724),
  (11, 1325.3914290046519),
  (12, 263144.60395462636),
  (12, 29583.96379052549),
  (13, 309367.5778006185),
  (13, 9211.642643026384),
  (14, 325019.64018286444),
  (14, 12203.609633762153),
  (15, 272937.8102761653),
  (15, 40392.00317399334),
  (16, 371553.94364333345),
  (16, 26527.02416710393),
  (17, 323423.2078401017),
  (17, 20659.975163713752),
  (18, 349358.2917582335),
  

## merge identical keys

In [10]:
from itertools import chain
total_middle_result = list(chain(*middle_result))
kv_pairs = [(total_middle_result[i][0], [total_middle_result[i][1], total_middle_result[i+1][1]]) for i in range(0, len(total_middle_result), 2)]
kv_pairs

[(0, [227425.27644498547, 447574.72365968407]),
 (1, [210835.26078571047, 44555.09880684265]),
 (2, [252576.02305391122, 15230.384527372828]),
 (3, [159841.60266454972, 57782.676268919684]),
 (4, [262180.222583764, 5265.165466839717]),
 (5, [260736.02935725008, 3141.6391173255233]),
 (6, [174055.69237865944, 65268.78057074442]),
 (7, [239269.6192545785, 14196.193404418973]),
 (8, [236467.05059779919, 12537.821654695199]),
 (9, [264348.38057683216, 641.237267112973]),
 (10, [270587.6810855846, 23893.33174567122]),
 (11, [333588.16858434724, 1325.3914290046519]),
 (12, [263144.60395462636, 29583.96379052549]),
 (13, [309367.5778006185, 9211.642643026384]),
 (14, [325019.64018286444, 12203.609633762153]),
 (15, [272937.8102761653, 40392.00317399334]),
 (16, [371553.94364333345, 26527.02416710393]),
 (17, [323423.2078401017, 20659.975163713752]),
 (18, [349358.2917582335, 19925.200878120948]),
 (19, [388764.1015952197, 356.7126554158786]),
 (20, [313602.0675216256, 39221.64672360722]),
 (2

## shuffle middle results

In [11]:
import random

In [12]:
random.seed(42)
mixed_result = kv_pairs
random.shuffle(mixed_result)
mixed_result

[(1823, [488942.526714372, 2480.221188320391]),
 (228, [200909.61787111795, 99719.85305744899]),
 (3220, [339851.9867132611, 104256.46850797514]),
 (3202, [293295.62623724726, 91945.50928812097]),
 (2729, [421950.401104365, 13221.658165412338]),
 (2784, [1120517.9415385902, 49221.12571699785]),
 (2938, [235390.73673423985, 27858.322857394764]),
 (3103, [447139.4328538601, 10193.479588567183]),
 (3693, [378146.580468935, 15537.61409644996]),
 (2798, [446600.93947255856, 34307.69017789728]),
 (2588, [497629.0342764965, 60337.28451315309]),
 (2476, [468919.05801069475, 33774.221478171254]),
 (3742, [629953.4450579844, 255651.70072044525]),
 (2951, [253521.563362817, 37665.44053529977]),
 (1198, [140103.2516068349, 46528.64638280589]),
 (1136, [291644.84903853823, 21160.04676237471]),
 (1596, [729393.7208490899, 26840.109315025235]),
 (3854, [962567.3511190888, 33879.185765053626]),
 (2812, [207167.42509017437, 12529.43051727464]),
 (2142, [547431.5690830029, 5502.358001600844]),
 (668, [3

In [13]:
MIDDLE_CHUNK_SIZE = len(mixed_result) // NUM_REDUCER
partial_middle_result = [mixed_result[i: i + MIDDLE_CHUNK_SIZE] for i in range(0, len(mixed_result), MIDDLE_CHUNK_SIZE)]
print('{}, {}'.format(len(partial_middle_result), len(partial_middle_result[0])))


2, 2000


In [14]:
result = [reducer(*kv) for p in partial_middle_result for kv in p]
len(result)

4000

In [15]:
r = [e[1] for e in sorted(result, key=lambda x: x[0])]
r

[675000.0001046695,
 255390.3595925531,
 267806.40758128406,
 217624.2789334694,
 267445.38805060374,
 263877.6684745756,
 239324.47294940386,
 253465.81265899746,
 249004.8722524944,
 264989.61784394516,
 294481.01283125585,
 334913.5600133519,
 292728.5677451519,
 318579.22044364485,
 337223.2498166266,
 313329.81345015863,
 398080.9678104374,
 344083.1830038154,
 369283.49263635447,
 389120.81425063557,
 352823.7142452328,
 433479.4820612562,
 359712.13800176675,
 447396.70531508327,
 452934.0470405948,
 439979.18353162333,
 491466.59417677857,
 433725.9240142177,
 441491.57264302205,
 412609.60210476146,
 463972.6310524182,
 419867.9993277469,
 424711.0708237132,
 431252.06468569237,
 494874.0504461494,
 434315.0036152912,
 518518.8333397299,
 487695.16036661936,
 568574.8897585329,
 493464.3303985267,
 473099.3372065611,
 562770.9837784831,
 611596.2088938078,
 601018.520088003,
 660480.2454970602,
 618179.9832870185,
 666670.3231560703,
 642996.9587121352,
 704175.1322436046,
 57

# save result

In [16]:
s = pd.Series(r, dtype='float')

In [17]:
df['total cost'] = s

In [18]:
df.head()

Unnamed: 0,date,number of rooms,security level of the community,residence space,building space,noise level,waterfront,view,air quality level,aboveground space,...,building year,decoration year,district,city,zip code,region,exchange rate,unit price of residence space,unit price of building space,total cost
0,2014/5/2 0:00,5,2.5,2820,67518,2.0,0,0,3,2820,...,1979,2014,23525 SE 32nd Way,Issaquah,WA 98029,USA,6.78483,11.886409,0.977028,675000.000105
1,2014/5/2 0:00,2,1.0,1210,9400,1.0,0,0,2,1210,...,1949,0,7542 21st Ave SW,Seattle,WA 98106,USA,6.243129,25.681414,0.698603,255390.359593
2,2014/5/2 0:00,4,2.5,2200,9397,2.0,0,0,3,2200,...,1987,2000,5214 S 292nd St,Auburn,WA 98001,USA,6.010127,16.921174,0.238882,267806.407581
3,2014/5/2 0:00,3,1.5,1200,9720,1.0,0,0,4,1200,...,1965,0,14034 SE 201st St,Kent,WA 98042,USA,7.560375,19.63223,0.876178,217624.278933
4,2014/5/2 0:00,3,1.75,1370,5858,1.0,0,0,3,1370,...,1987,2000,1605 S 245th Pl,Des Moines,WA 98198,USA,6.543941,28.20593,0.132472,267445.388051


In [19]:
df.to_csv('{}Train_data_completed.csv'.format(DATA_DIR), index=False)