# From csv to npz

In [1]:
import csv
import numpy as np

## Constants

In [2]:
# Relative paths of the two input CSV files and of the output archive.
data = '../../../data/processed/temp_open_utc_complete.csv'
meta = '../../../data/raw/meta_open.csv'
labeled_data = '../../../data/processed/labeled_data.npz'

# Index of the timestamp column and of the header row in the electrical CSV.
TS_COL = 0
NAMES_ROW = 0

# Metadata column indices, kept for reference only (the cells below use the raw numbers):
# UID, DE, DS, IND, PSU, SUBIND, TZ = 0, 1, 2, 5, 9, 13, 14

## Electrical Data

In [3]:
# Read the whole electrical CSV into memory: one list of string cells per line.
with open(data, 'r') as csv_file:
    data_rows = list(csv.reader(csv_file))

In [4]:
# Building names are the header cells to the right of the timestamp column.
names = data_rows[NAMES_ROW][TS_COL + 1:]
# Map every building name to its column index in data_rows (counting starts at 1).
name2col = {building: index for index, building in enumerate(names, start=1)}
name2col

{'Office_Cristina': 1,
 'PrimClass_Jolie': 2,
 'PrimClass_Jaylin': 3,
 'Office_Jesus': 4,
 'PrimClass_Uma': 5,
 'UnivClass_Tamra': 6,
 'PrimClass_Jayla': 7,
 'PrimClass_Janiya': 8,
 'PrimClass_Umar': 9,
 'PrimClass_Janice': 10,
 'Office_Jett': 11,
 'UnivDorm_Una': 12,
 'UnivLab_Paul': 13,
 'Office_Jerry': 14,
 'PrimClass_Uriah': 15,
 'PrimClass_Ulysses': 16,
 'Office_Lesa': 17,
 'UnivDorm_Claudia': 18,
 'UnivClass_Tammy': 19,
 'PrimClass_Jaden': 20,
 'PrimClass_Jermaine': 21,
 'PrimClass_Josephine': 22,
 'Office_Jackie': 23,
 'UnivDorm_Carla': 24,
 'UnivClass_Camden': 25,
 'PrimClass_Javier': 26,
 'PrimClass_Jeanette': 27,
 'PrimClass_Julius': 28,
 'PrimClass_Jaylinn': 29,
 'PrimClass_Johanna': 30,
 'PrimClass_Jodie': 31,
 'PrimClass_Johnathan': 32,
 'PrimClass_Janis': 33,
 'PrimClass_Jamal': 34,
 'PrimClass_Jose': 35,
 'PrimClass_Julianna': 36,
 'PrimClass_Jasmine': 37,
 'PrimClass_Jazmine': 38,
 'PrimClass_Justin': 39,
 'Office_Marla': 40,
 'PrimClass_Jody': 41,
 'PrimClass_Julianne'

## Metadata

In [5]:
# Read the whole metadata CSV into memory: one list of string cells per line.
with open(meta, 'r') as csv_file:
    meta_rows = list(csv.reader(csv_file))

In [6]:
# The first metadata column, minus its header cell, holds the building names.
names = [record[0] for record in meta_rows][1:]
# Map every building name to its row index in meta_rows (row 0 is the header).
names2row = {building: index for index, building in enumerate(names, start=1)}
names2row

{'PrimClass_Everett': 1,
 'UnivClass_Clifford': 2,
 'Office_Elizabeth': 3,
 'Office_Ellie': 4,
 'PrimClass_Elisabeth': 5,
 'Office_Cristina': 6,
 'PrimClass_Jolie': 7,
 'PrimClass_Jaylin': 8,
 'Office_Jesus': 9,
 'PrimClass_Esmeralda': 10,
 'PrimClass_Eoghan': 11,
 'PrimClass_Edwin': 12,
 'PrimClass_Eli': 13,
 'PrimClass_Ethel': 14,
 'PrimClass_Ernesto': 15,
 'PrimClass_Emanuela': 16,
 'PrimClass_Emilio': 17,
 'PrimClass_Eleanor': 18,
 'PrimClass_Ezekiel': 19,
 'PrimClass_Elliott': 20,
 'PrimClass_Ellen': 21,
 'PrimClass_Evie': 22,
 'PrimClass_Elijah': 23,
 'PrimClass_Ezra': 24,
 'PrimClass_Edmund': 25,
 'PrimClass_Eva': 26,
 'Office_Erik': 27,
 'PrimClass_Ebony': 28,
 'PrimClass_Ethan': 29,
 'PrimClass_Elmer': 30,
 'PrimClass_Ervin': 31,
 'PrimClass_Uma': 32,
 'UnivClass_Tamra': 33,
 'PrimClass_Ernest': 34,
 'PrimClass_Emily': 35,
 'Office_Evelyn': 36,
 'PrimClass_Jayla': 37,
 'Office_Emer': 38,
 'PrimClass_Janiya': 39,
 'PrimClass_Umar': 40,
 'Office_Elena': 41,
 'PrimClass_Janice': 

## Start Dates

In [7]:
# Collect the distinct values of metadata column 2 (the start date), header excluded.
date_column = [record[2] for record in meta_rows]
start_dates = set(date_column[1:])
start_dates

{'01/01/10 00:00',
 '01/01/12 00:00',
 '01/01/13 00:00',
 '01/01/15 00:00',
 '01/05/14 00:00',
 '01/11/14 00:00',
 '01/12/14 00:00',
 '02/02/12 00:00'}

## Preprocessing Electrical Data

In [8]:
# Build one hourly series per building, in the iteration order of names2row.
electrical_values = [None] * len(names2row) # 507

# Number of hourly samples between a non-January start date (dd/mm/yy) and the
# following 1 January. Rotating a series by this amount makes it begin on 1 January.
offsets = {}
offsets['05'] = 24 * (31 + 30 + 31 + 31 + 30 + 31 + 30 + 31)  # 01/05/14
offsets['11'] = 24 * (30 + 31) # 01/11/14
offsets['12'] = 24 * 31 # 01/12/14
offsets['02'] = 24 * (27 + 31 + 30) + offsets['05'] # 02/02/12

for i, (name, row) in enumerate(names2row.items()):
    col = name2col[name]
    ds = meta_rows[row][2]  # start date string, formatted dd/mm/yy hh:mm

    # Take this building's column, drop empty cells, then drop the header cell.
    vals = [data_row[col] for data_row in data_rows]
    vals = [val for val in vals if val]
    vals = vals[1:]
    vals = [round(float(val), 3) for val in vals]

    if len(vals) == 8784: # Leap year: cut out the 24 samples of 29 February
        if ds[1] == '1': # 1/1/2012 -> 29 Feb is day index 59
            vals = vals[:59*24] + vals[60*24:]
        else: # 2/2/2012 -> 29 Feb is day index 27
            vals = vals[:27*24] + vals[28*24:]

    month = ds[3:5]
    if month != '01': # Data don't begin at the beginning of the year
        offset = offsets[month]
        vals = vals[offset:] + vals[:offset]

    # Store only after the rotation: the slicing above builds a new list, so
    # storing earlier would keep the unaligned series.
    electrical_values[i] = vals

electrical_values = np.array(electrical_values)
print(electrical_values)
print(np.min(electrical_values))
print(np.max(electrical_values))
print(np.mean(electrical_values))
print(np.std(electrical_values))

[[1.847000e+00 1.759000e+00 1.935000e+00 ... 2.033000e+00 1.897000e+00
  1.860000e+00]
 [1.370000e+00 1.355000e+00 1.327000e+00 ... 1.200000e+00 1.225000e+00
  1.200000e+00]
 [3.160000e+00 3.167000e+00 3.167000e+00 ... 4.725000e+00 4.678000e+00
  4.670000e+00]
 ...
 [1.381000e+02 1.373000e+02 1.424000e+02 ... 1.310000e+02 1.302000e+02
  1.306000e+02]
 [2.233500e+02 2.183250e+02 2.175000e+02 ... 2.586250e+02 2.472500e+02
  2.546750e+02]
 [2.295820e+03 2.268890e+03 2.296070e+03 ... 2.154122e+03 2.034650e+03
  2.030860e+03]]
0.001
3150.06
121.29173741860531
207.9465511640876


## Preprocessing Metadata

In [9]:
# Integer codes for every categorical metadata value. The five vocabularies share
# one contiguous id space (0-27), so a single code identifies both field and value.
industries = {'Education' : 0, 'Government' : 1, 'Commercial Property' : 2}
subindustries = {
    'College/University' : 3, 'Primary/Secondary School' : 4, 'City, County, State' : 5,
    'Other Government Buildings' : 6, 'Commercial Real Estate' : 7, 'Business Services' : 8,
    'Bank/Financial Services' : 9, 'Corporate Office' : 10, 'Social Services' : 11,
}
primary_use = {
    'Office' : 12, 'Primary/Secondary Classroom' : 13, 'College Laboratory' : 14,
    'College Classroom' : 15, 'Dormitory' : 16,
}
continents = {'America' : 17, 'Europe' : 18, 'Asia' : 19}
# Keyed by the second component of the timezone string.
countries = {
    'New_York' : 20, 'London' : 21, 'Phoenix' : 22, 'Chicago' : 23, 'Los_Angeles' : 24,
    'Zurich' : 25, 'Singapore' : 26, 'Denver' : 27,
}

labels = []
counts = {code : 0 for code in range(28)}  # occurrences of each code over all buildings

for name, meta_index in names2row.items():
    record = meta_rows[meta_index]
    ind, psu, sub, tz = record[5], record[9], record[13], record[14]
    # The timezone looks like 'Continent/City'; only the first two parts are used.
    tz_parts = tz.split('/')
    continent, country = tz_parts[0], tz_parts[1]
    label = np.array([industries[ind], subindustries[sub], primary_use[psu], continents[continent], countries[country]])
    labels.append(label)
    for code in label:
        counts[code] += 1
    print(tz, name, label)

labels = np.array(labels)
print(labels)
print(counts)

America/New_York PrimClass_Everett [ 0  4 13 17 20]
America/New_York UnivClass_Clifford [ 0  3 15 17 20]
America/Los_Angeles Office_Elizabeth [ 2  7 12 17 24]
America/Los_Angeles Office_Ellie [ 2  9 12 17 24]
America/New_York PrimClass_Elisabeth [ 0  4 13 17 20]
America/New_York Office_Cristina [ 0  3 12 17 20]
Europe/London PrimClass_Jolie [ 0  4 13 18 21]
Europe/London PrimClass_Jaylin [ 0  4 13 18 21]
Europe/London Office_Jesus [ 1  5 12 18 21]
America/New_York PrimClass_Esmeralda [ 0  4 13 17 20]
America/New_York PrimClass_Eoghan [ 0  4 13 17 20]
America/Denver PrimClass_Edwin [ 0  4 13 17 27]
America/New_York PrimClass_Eli [ 0  4 13 17 20]
America/Denver PrimClass_Ethel [ 0  4 13 17 27]
America/Denver PrimClass_Ernesto [ 0  4 13 17 27]
America/Chicago PrimClass_Emanuela [ 0  4 13 17 23]
America/New_York PrimClass_Emilio [ 0  4 13 17 20]
America/New_York PrimClass_Eleanor [ 0  4 13 17 20]
America/New_York PrimClass_Ezekiel [ 0  4 13 17 20]
America/New_York PrimClass_Elliott [ 0  4 

## Saving the results

In [10]:
# Write both arrays into one compressed .npz archive, stored under the keys
# 'electrical_values' and 'labels'.
np.savez_compressed(
    labeled_data,
    electrical_values=electrical_values,
    labels=labels,
)