In [7]:
from scipy.stats import kurtosis
from scipy.stats import skew
import numpy as np
import csv
import datetime
from joblib import load

def feature_extraction(dist_con_weekly, interval_ind):
    """Summarise one buffer of consumption readings as six features.

    The readings are scanned for "sustained high usage" runs. A run opens
    at index ``i`` when the reading rises by more than ``int_diff`` over
    the previous reading. It continues while each following reading stays
    within ``diff_perc`` (relative) of the run's first reading. When the
    run breaks it is kept only if at least ``len_al`` readings followed
    the opening one.

    Args:
        dist_con_weekly: Non-empty sequence of numeric readings.
        interval_ind: Sampling interval indicator. ``30`` selects a jump
            threshold of 0.8 and a minimum run length of 2; any other
            value selects 0.4 and 3.

    Returns:
        list: ``[skewness, kurtosis, peak, mean run length, number of
        runs, mean of the per-run average readings]``. The two mean
        entries are NaN when no run qualified; callers test for that
        with ``np.isnan``.

    Raises:
        ValueError: If ``dist_con_weekly`` is empty (raised by ``max``).
    """
    kt = kurtosis(dist_con_weekly)
    sk = skew(dist_con_weekly)
    peak = max(dist_con_weekly)

    # different unsupervised parameters for 15/30 intervals
    if interval_ind == 30:
        int_diff = 0.8
        diff_perc = 0.2
        len_al = 2
    else:
        int_diff = 0.4
        diff_perc = 0.2
        len_al = 3

    # Index of the reading that opened the current run. 0 doubles as the
    # "no run open" marker, which is safe because the scan starts at index 1.
    start = 0
    # Number of readings appended after the opening one, i.e. one less than
    # the number of readings stored in rec_temp.
    rec_len = 0
    rec_temp = []
    final_rec_len = []
    final_rec_avg = []

    for i in range(1, len(dist_con_weekly)):
        value = dist_con_weekly[i]
        if start == 0 and value - dist_con_weekly[i - 1] > int_diff:
            # Sharp rise while idle: open a new run at this reading.
            rec_temp = [value]
            start = i
        elif start != 0 and abs(value / dist_con_weekly[start] - 1) <= diff_perc:
            # Still close to the level at which the run opened.
            rec_temp.append(value)
            rec_len += 1
        else:
            # The run (if any) is broken: keep it when long enough, then
            # return to the idle state either way.
            if rec_len >= len_al:
                final_rec_len.append(rec_len)
                final_rec_avg.append(np.mean(rec_temp))
            start = 0
            rec_len = 0
            rec_temp = []

    # NOTE(review): a run that is still open when the series ends is never
    # recorded. Left unchanged so the features stay comparable with whatever
    # the persisted models were fitted on - confirm whether that is intended.

    # np.mean([]) would return NaN too, but with a "Mean of empty slice"
    # RuntimeWarning; produce the NaN explicitly so the no-run case is quiet.
    mean_rec_len = np.mean(final_rec_len) if final_rec_len else np.nan
    mean_rec_avg = np.mean(final_rec_avg) if final_rec_avg else np.nan
    return [sk, kt, peak, mean_rec_len, len(final_rec_len), mean_rec_avg]

def interval(item):
    """Detect the sampling interval of a CSV file from its first two data rows.

    The first line of the file is skipped as a header. Column 4 of each
    data row must hold a timestamp shaped like ``YYYY-MM-DDTHH:MM:SS``
    followed by five trailing characters, which are discarded.

    Only the minute-of-hour of the two timestamps is compared, using the
    distance around the clock face so that a step across the hour boundary
    (for example ``:45`` -> ``:00``) counts as 15 minutes rather than 45.

    Args:
        item: Path of the CSV file to inspect.

    Returns:
        int: ``15`` or ``30`` when the first two readings are that many
        minutes apart on the clock face, otherwise ``0``. ``0`` is also
        returned when the file has fewer than two data rows.
    """
    def _minute_of(row):
        # Turn "<date>T<time><5 trailing chars>" into "<date> <time>".
        dt = row[4].split('T')[0] + ' ' + row[4].split('T')[1][:-5]
        return datetime.datetime.strptime(dt, '%Y-%m-%d %H:%M:%S').minute

    # newline='' is the csv module's recommended way to open its input.
    with open(item, 'r', newline='') as fin:
        r = csv.reader(fin)
        next(r, None)  # header line
        first_row = next(r, None)
        second_row = next(r, None)

    if first_row is None or second_row is None:
        # Not enough readings to measure a gap: report "unknown".
        return 0

    gap = abs(_minute_of(second_row) - _minute_of(first_row))
    # Distance on a 60-minute dial: a raw difference of 45 is really 15.
    gap = min(gap, 60 - gap)
    if gap == 15 or gap == 30:
        return gap
    return 0

def compressing_file_unsupervised(item, interval_ind):
    """Compress one CSV file into a mean feature vector plus a score.

    Columns read from each data row (the first line is skipped as a
    header): ``row[0]`` an identifier, ``row[1]`` the reading,
    ``row[4]`` a timestamp shaped like ``YYYY-MM-DDTHH:MM:SS`` plus five
    discarded trailing characters, and ``row[7]``/``row[8]``/``row[9]``
    the year, month and day used to derive the weekday.

    Only rows whose timestamp month is in March-May or September-November
    are used. Readings are buffered until a delimiter-day row is seen,
    at which point the buffer is passed to ``feature_extraction``.

    Args:
        item: Path of the CSV file to process.
        interval_ind: Sampling interval indicator (15 or 30; anything
            else disables the score threshold).

    Returns:
        tuple: ``(file_id, output_feature, f)``. ``output_feature`` is
        the column-wise mean of all accepted feature lists (NaN when
        none was accepted). ``f`` is the score, or 0 when it falls
        below the interval's threshold or no NaN extraction occurred.
        When the file does not contain exactly one distinct identifier
        the result is ``('none', output_feature, 0)``.
    """
    nan_cnt = 0
    ev_sig_cnt = 0
    dist_con_weekly = []
    # A set keeps the duplicate-timestamp check O(1) per row.
    seen_timestamps = set()
    # Only the first row's history factor is ever used for the score.
    first_history_factor = None
    weekly_features = []
    file_id = []

    with open(item, 'r') as fin:
        r = csv.reader(fin)
        next(r)
        for row in r:
            # make sure it's the same house
            if row[0] not in file_id:
                file_id.append(row[0])
            # reorganize timestamp making it python readable
            dt = row[4].split('T')[0] + ' ' + row[4].split('T')[1][:-5]
            # Parsed for every row so that a malformed timestamp anywhere
            # in the file still raises.
            stamp = datetime.datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
            if first_history_factor is None:
                # Years between the first reading and 2019; a same-year
                # start counts as half a year so the divisor is never 0.
                first_history_factor = 2019 - stamp.year
                if first_history_factor == 0:
                    first_history_factor = 0.5

            if not (3 <= stamp.month <= 5 or 9 <= stamp.month <= 11):
                continue

            wd = datetime.date(int(row[7]), int(row[8]), int(row[9])).weekday()
            # NOTE(review): the original comment says "discard weekends",
            # but date.weekday() numbers Monday as 0 and Sunday as 6, so
            # this matches Sunday and Monday; Saturday readings are kept.
            # Left as is because the score thresholds and persisted models
            # may depend on it - confirm before changing.
            if wd == 6 or wd == 0:
                if dist_con_weekly:
                    line = feature_extraction(dist_con_weekly, interval_ind)
                    if np.isnan(line[2]) or np.isnan(line[3]):
                        nan_cnt += 1
                        # NOTE(review): the buffer is deliberately NOT
                        # cleared here (matches the original control flow),
                        # so every following delimiter-day row re-extracts
                        # it and nan_cnt counts rows, not weeks. Confirm
                        # against how ratio_threshold was tuned.
                        continue
                    ev_sig_cnt += 1
                    weekly_features.append(line)
                    dist_con_weekly = []
                continue

            con = float(row[1])
            # treating duplicate values...
            if dt not in seen_timestamps:
                seen_timestamps.add(dt)
                dist_con_weekly.append(con)

    # calculate final compressed features mean
    output_feature = np.mean(weekly_features, axis=0)

    # showing selection scores and null/score ratios, important to implement cutoff rate
    if interval_ind == 15:
        ratio_threshold = 0.004
    elif interval_ind == 30:
        ratio_threshold = 0.008
    else:
        ratio_threshold = 0

    # calculate unsupervised edge scores...
    f = 0
    if nan_cnt > 0:
        freq_ratio = float(ev_sig_cnt / nan_cnt)
        factor = float(freq_ratio / first_history_factor)
        if factor >= ratio_threshold:
            f = factor

    # final sanity check on multiple file ids in one file...
    if len(file_id) == 1:
        return file_id[0], output_feature, f
    return 'none', output_feature, 0

def learning_infer(features, interval_ind):
    """Run the persisted models for the given interval on one feature vector.

    Args:
        features: Feature values for a single sample; reshaped to one row.
        interval_ind: ``30`` or ``15`` selects the matching set of
            persisted scaler/SVM/ensemble files; any other value skips
            inference.

    Returns:
        The ensemble model's ``predict`` output for the scaled sample, or
        the integer ``0`` when ``interval_ind`` is neither 30 nor 15.
    """
    sample = np.array(features).reshape(1, -1)

    # Pick the artefact file names for this interval.
    if interval_ind == 30:
        scaler_file = 'scaler_30.joblib'
        svm_file = 'trained_ev_detect_svm_30.joblib'
        ensemble_file = 'trained_ev_detect_ensemble_30.joblib'
    elif interval_ind == 15:
        scaler_file = 'scaler_15.joblib'
        svm_file = 'trained_ev_detect_svm_15.joblib'
        ensemble_file = 'trained_ev_detect_ensemble_15.joblib'
    else:
        return 0

    # Standardise first, then load both classifiers.
    scaled_sample = load(scaler_file).transform(sample)
    svm_model = load(svm_file)
    ensemble_model = load(ensemble_file)

    ensemble_prediction = ensemble_model.predict(scaled_sample)
    # The SVM prediction is computed but not used in the result; only the
    # ensemble's answer is returned.
    svm_model.predict(scaled_sample)
    return ensemble_prediction

# Script entry point: run the whole pipeline on one hard-coded CSV file and
# print the file's identifier followed by the combined detection result.
if __name__ == '__main__':
    #file reading
    item = 'location_id=00000000-0000-000b-012c-135efde15403.csv'
    #item = 'location_id=00000000-0000-000b-0264-6d7e5f610404.csv'
    # Sampling interval detected from the file: 15, 30, or 0 when unknown.
    interval_ind = interval(item)
    #print(interval_ind)
    # Identifier, mean feature vector and unsupervised score for the file.
    fid, output_feature, f = compressing_file_unsupervised(item, interval_ind)
    print(fid)
    # Model prediction (learning_infer returns plain 0 for an unknown interval).
    bin_output = learning_infer(output_feature, interval_ind)
    # The printed value is the model prediction plus the unsupervised score.
    print(bin_output+f)
    

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


00000000-0000-000b-012c-135efde15401
[1.]
