In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Identifiers of the houses that read_data accepts.
houses = list('ABC')
# Short feature-type code -> name fragment used in the X_<name>_house<H>.npy files.
xtype_dict = dict(r='raw', c='change')

In [3]:
def read_data(house, xtype):
    """Load the tables and arrays stored for one house.

    Parameters
    ----------
    house : str
        Must be an entry of the module-level ``houses`` list.
    xtype : str
        Must be a key of the module-level ``xtype_dict``; the mapped value
        selects which ``X_<value>_house<house>.npy`` file is read.

    Returns
    -------
    tuple
        ``(act_df, sensor_df, X, Y)``: the two CSV files as DataFrames,
        followed by the two ``.npy`` arrays.

    Raises
    ------
    AssertionError
        If ``house`` or ``xtype`` is not one of the known values.
    """
    assert house in houses
    assert xtype in xtype_dict

    # All files are looked up relative to the current working directory.
    act_path = "house{}_act.csv".format(house)
    sensor_path = "house{}_sensor.csv".format(house)
    x_path = "X_{}_house{}.npy".format(xtype_dict[xtype], house)
    y_path = "Y_house{}.npy".format(house)

    return pd.read_csv(act_path), pd.read_csv(sensor_path), np.load(x_path), np.load(y_path)

In [4]:
# Select the house and the feature type ('c' -> "change" features), then load its data.
house, t = 'C', 'c'
act_df, sensor_df, X, Y = read_data(house=house, xtype=t)

In [5]:
# Build four lookup tables: activity label <-> index and sensor label <-> index.
# The extra label 0 is appended after the labels found in the activity table.
all_act = list(set(act_df.label))
all_act.append(0)
a2i = {label: idx for idx, label in enumerate(all_act)}
i2a = dict(enumerate(all_act))

# Same construction for the sensor labels.
all_sensor = list(set(sensor_df.label))
i2s = dict(enumerate(all_sensor))
s2i = {label: idx for idx, label in enumerate(all_sensor)}

In [6]:
print "# sensors: ", len(all_sensor)
print "# states/acts: ", len(all_act)
print "# timeframes: ", len(X)

# sensors:  21
# states/acts:  18
# timeframes:  26488


In [7]:
all_act

[1.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 22.0,
 28.0,
 0]

In [8]:
ls

C_Prior_A.npy                 Raw_Prior_A.npy               X_raw_houseB.npy              houseB_act.csv
C_Prior_B.npy                 Raw_Prior_B.npy               X_raw_houseC.npy              houseB_sensor.csv
C_Prior_C.npy                 Raw_Prior_C.npy               Y_houseA.npy                  houseC_act.csv
Feature_Representation.ipynb  Starter_Code.ipynb            Y_houseB.npy                  houseC_sensor.csv
Naive_Bayes_Part1.ipynb       X_change_houseA.npy           Y_houseC.npy                  muA.npy
Naive_Bayes_Part2.ipynb       X_change_houseB.npy           [34mdatasets[m[m/                     muB.npy
Preprocessing.ipynb           X_change_houseC.npy           houseA_act.csv                muC.npy
RAW_PRIOR_new.ipynb           X_raw_houseA.npy              houseA_sensor.csv


In [9]:
# Load the prior that belongs to the house selected in `house`, so the prior
# always matches the X/Y that were loaded (the file name used to be hard-coded
# to house C). The variable keeps the name `priorA` because later cells use it.
# NOTE(review): the "Raw" prior is loaded even though the features were read
# with t = 'c' ("change"); C_Prior_<house>.npy files also exist -- confirm
# which prior is intended.
priorA = np.load('Raw_Prior_{}.npy'.format(house))

In [10]:
X[Y == all_act[0]]

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [11]:
# Exploratory check: accumulate the sensor rows of the first activity label
# into row 0 of an (activities x sensors) table of zeros.
miu = np.zeros((len(all_act), len(all_sensor)))
first_act_mask = (Y == all_act[0])
for row in X[first_act_mask]:
    np.add(miu[0], row, out=miu[0])

In [12]:
sum(Y == all_act[0])

11961

In [13]:
# Divide the accumulated row by the number of timeframes carrying the first
# activity label. The count is computed rather than typed in, so the cell
# stays correct when a different house or feature file is loaded.
miu[0] / np.sum(Y == all_act[0])

array([  8.36050497e-05,   0.00000000e+00,   0.00000000e+00,
         7.02282418e-03,   0.00000000e+00,   0.00000000e+00,
         2.50815149e-04,   0.00000000e+00,   1.67210099e-03,
         1.67210099e-04,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   1.67210099e-04,
         0.00000000e+00,   0.00000000e+00,   7.52445448e-04,
         8.36050497e-05,   0.00000000e+00,   0.00000000e+00])

## Naive Bayes

In [14]:
def calc_miu(X, Y, all_sensor=all_sensor, all_act=all_act):
    """Estimate the mean sensor vector of every activity.

    Row ``i`` of the result belongs to ``all_act[i]``; column ``j`` is
    sensor feature ``j`` of ``X``.

    Parameters
    ----------
    X : array-like, one row of sensor features per timeframe.
    Y : array-like, one activity label per timeframe (same length as ``X``).
    all_sensor : sequence; only its length is used (number of columns).
    all_act : sequence of activity labels; fixes the row order of the result.

    Returns
    -------
    numpy.ndarray of shape ``(len(all_act), len(all_sensor))``.
        Rows of activities that never occur in ``Y`` are left at zero.

    Raises
    ------
    AssertionError
        If ``X`` and ``Y`` differ in length.
    """
    assert len(X) == len(Y)

    X = np.asarray(X)
    Y = np.asarray(Y)
    miu = np.zeros([len(all_act), len(all_sensor)])
    # Rows are addressed by position in the *passed* all_act rather than via
    # the global a2i table, so a custom all_act is honoured and a label that
    # appears twice in all_act is not normalised twice.
    for idx, act in enumerate(all_act):
        mask = (Y == act)
        total = np.sum(mask)
        if total != 0:
            # float() keeps this a true division even for integer-typed X.
            miu[idx] = X[mask].sum(axis=0) / float(total)
    return miu

In [15]:
def max_like(miu, X, prior):
    """Score every activity for every timeframe with a Bernoulli naive-Bayes model.

    For timeframe ``i`` and activity ``j`` the score is::

        sum_k log(miu[j, k])      for sensors k with X[i, k] == 1
      + sum_k log(1 - miu[j, k])  for all other sensors
      + log(prior[i, j])

    Parameters
    ----------
    miu : array-like of shape (num_act, num_sensor)
        Per-activity probability that each sensor equals 1.
    X : array-like of shape (num_time, num_sensor)
        Sensor readings; only values exactly equal to 1 count as "on".
    prior : numpy.ndarray indexable as ``prior[i, j]``
        Prior probability of activity ``j`` at timeframe ``i``.

    Returns
    -------
    numpy.ndarray of shape (num_time, num_act)
        Log scores. A zero probability yields ``-inf`` (numpy may emit a
        divide-by-zero RuntimeWarning), never NaN.
    """
    miu = np.asarray(miu, dtype=float)
    X = np.asarray(X)
    num_time = len(X)
    num_act = len(miu)

    # The logs depend only on miu, so compute them once instead of once per
    # (timeframe, activity, sensor) triple.
    log_on = np.log(miu)
    log_off = np.log(1 - miu)

    prob = np.zeros((num_time, num_act))
    for i in range(num_time):
        active = (X[i] == 1.)
        # np.where selects one term per sensor; a masked dot product would
        # turn 0 * -inf into NaN.
        loglik = np.where(active, log_on, log_off).sum(axis=1)
        prob[i] = loglik + np.log(prior[i, :num_act])
    return prob

In [16]:
def mle(matrix):
    """Return, for every row of ``matrix``, the index of its largest entry.

    The indices are returned as a 1-D float array with one element per row;
    ties resolve to the first maximal column, as ``np.argmax`` does.
    """
    return np.array([np.argmax(scores) for scores in matrix], dtype=float)

In [17]:
miu = calc_miu(X,Y)
print priorA.shape
likes = max_like(miu,X,priorA)
result = mle(likes) #note: result is index, not label yet

(26488, 18)


In [18]:
likes.shape

(26488, 18)

In [19]:
import seaborn as sns

In [20]:
#convert index to label
res_label = np.array([i2a[e] for e in result])
for i,act in enumerate(all_act):
    print "{},act label: {}, Y count:{}, estimate count:{}".format(i,act, sum(Y==act), sum(res_label==act))

0,act label: 1.0, Y count:11961, estimate count:14076
1,act label: 3.0, Y count:352, estimate count:44
2,act label: 4.0, Y count:78, estimate count:237
3,act label: 5.0, Y count:192, estimate count:406
4,act label: 6.0, Y count:92, estimate count:24
5,act label: 7.0, Y count:56, estimate count:133
6,act label: 9.0, Y count:62, estimate count:55
7,act label: 10.0, Y count:7760, estimate count:8975
8,act label: 11.0, Y count:88, estimate count:90
9,act label: 12.0, Y count:9, estimate count:149
10,act label: 13.0, Y count:83, estimate count:639
11,act label: 14.0, Y count:65, estimate count:30
12,act label: 15.0, Y count:306, estimate count:110
13,act label: 16.0, Y count:8, estimate count:67
14,act label: 17.0, Y count:22, estimate count:89
15,act label: 22.0, Y count:40, estimate count:38
16,act label: 28.0, Y count:2457, estimate count:286
17,act label: 0, Y count:2857, estimate count:1040


In [21]:
pwd

u'/Users/yuhantang/AM207Project/correct/am207finalproject/data'

# Evaluation

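Here $N$ is the number of distinct true labels, $TP_i$ is the number of timeframes correctly predicted as label $i$, $TI_i$ is the number of timeframes predicted as label $i$, $TT_i$ is the number of timeframes whose true label is $i$, and $Total$ is the number of timeframes.

$$\text{Precision} = \frac{1}{N}\sum_{i=1}^{N} \frac{TP_i}{TI_i} $$
$$\text{Recall} = \frac{1}{N}\sum_{i=1}^{N} \frac{TP_i}{TT_i} $$
$$\text{F-Measure} = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision}+\text{Recall}} $$
$$\text{Accuracy} = \frac{\sum_{i=1}^{N} TP_i}{Total}$$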

In [22]:
def precision(pred_label, Y):
    """Macro-averaged precision over the distinct labels in ``Y``.

    For each true label the ratio (correct predictions of that label) /
    (all predictions of that label) is taken; a label that is never
    predicted contributes 0. The ratios are averaged over the number of
    distinct labels in ``Y``.

    Both arguments are expected to be numpy arrays of equal length.
    Returns a Python float.
    """
    labels = set(Y)
    running = 0
    for label in labels:
        n_hits = np.sum(pred_label[Y == label] == label)
        n_predicted = np.sum(pred_label == label)
        if n_predicted == 0:
            # Nothing was predicted as this label: its term stays 0.
            continue
        running += float(n_hits) / n_predicted
    return float(running) / len(labels)

def recall(pred_label, Y):
    """Macro-averaged recall over the distinct labels in ``Y``.

    For each true label the ratio (correct predictions of that label) /
    (occurrences of that label in ``Y``) is taken, and the ratios are
    averaged over the number of distinct labels in ``Y``.

    Both arguments are expected to be numpy arrays of equal length.
    Returns a Python float.
    """
    labels = set(Y)
    running = 0
    for label in labels:
        is_label = (Y == label)
        n_hits = np.sum(pred_label[is_label] == label)
        n_true = np.sum(is_label)
        if n_true == 0:
            # Guard kept from the original: skip a label with no matches in Y.
            continue
        running += float(n_hits) / n_true
    return float(running) / len(labels)

def f_score(pred_label, Y):
    """F-measure: harmonic mean of macro precision and macro recall.

    Returns ``2 * p * r / (p + r)`` where ``p = precision(pred_label, Y)``
    and ``r = recall(pred_label, Y)``. When both are 0 (no correct
    prediction at all) the score is defined as 0.0 instead of raising
    ZeroDivisionError.
    """
    p = precision(pred_label, Y)
    r = recall(pred_label, Y)
    # Both terms are non-negative, so the sum is 0 only when both are 0.
    if p + r == 0:
        return 0.0
    return 2*p*r/(p+r)

def accuracy(pred_label, Y):
    """Fraction of timeframes whose predicted label equals the true label.

    Correct predictions are counted label by label over the distinct values
    of ``Y`` and the total is divided by ``len(Y)``.

    Both arguments are expected to be numpy arrays of equal length.
    Returns a Python float.
    """
    correct = 0
    for label in set(Y):
        correct += np.sum(pred_label[Y == label] == label)
    return float(correct) / len(Y)

In [23]:
precision(res_label, Y)

0.19665418149626068

In [24]:
recall(res_label, Y)

0.232955161867768

In [25]:
f_score(res_label, Y)

0.21327099789641424

In [26]:
accuracy(res_label, Y)

0.5889459377831471

In [27]:
# Persist the estimated per-activity sensor means for the selected house.
mu_path = "mu{}.npy".format(house)
np.save(mu_path, miu)

In [28]:
miu.shape

(18, 21)