In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from datetime import datetime, timedelta
import matplotlib.dates as mdates
from matplotlib.lines import Line2D
import matplotlib.cm as cm
import matplotlib

In [2]:
sql = create_engine('postgresql://jill:teddybear1234@ec2-54-173-59-124.compute-1.amazonaws.com/capstone')

In [3]:
model_number = 2

query = '''SELECT * from sensor_readings_model{}_1hr where psn is not null and timestamp is not null'''.format(model_number)
model_data = pd.read_sql(query,sql)

data_dictionary = pd.read_csv('data_dictionary_model{}.csv'.format(model_number))
data_dictionary['tag'] = data_dictionary['tag'].str.lower()
data_dictionary = data_dictionary.set_index('tag')

In [4]:
skipped_cols = ['sum_esn']
index_cols = ['id','timestamp','psn']
data_cols = [c for c in model_data.columns if (c not in index_cols) and (c not in skipped_cols)]

In [5]:
missing_values = model_data.isnull().sum().sort_values()
print(missing_values)
sparse_cols = [x for x in missing_values.index if missing_values[x] > 30000]
clean_data_cols = [x for x in data_cols if x not in sparse_cols]

id                 0
lo_t6              0
t5_a               0
t1_1               0
lo_p1              0
perf_pow           0
timestamp          0
ngp                0
sum_eng_st         0
sum_esn            0
sum_eng_h          0
pcd                0
psn                0
lo_c_dt5           1
g_t1               1
g_t2               1
lo_c_dp1           1
t5_5               1
lo_t5              1
lo_t9              1
t5_6               1
lo_c_brg1          1
c_dt5_6            1
c_dt5_5            1
v_d_1b             1
v_d_2b             1
v_d_3b             1
v_acc1             1
lo_dp1             1
t5_1               6
               ...  
g_cur4             6
c_c_t5_1           6
c_c_t5_2           6
c_c_t5_3           6
g_cur1             6
g_cur2             6
g_cur3             6
g_pow1             6
sum_enr            6
pe_c_pos_e1        6
f_c_pos_e1         6
c_dt5_4            6
pe_for1            6
f_c_dp5            6
nt5                6
f_c_dp2            6
g_pct1       

In [18]:
data = model_data[index_cols + clean_data_cols].dropna().reset_index()
clean_data = StandardScaler().fit_transform(data[clean_data_cols])

pca =  PCA().fit(clean_data)
reduced = pca.transform(clean_data)
print(data.head())

   index      id                  timestamp  psn   pe_cmd1    f_cmd2  \
0      0  156635 2016-05-25 11:00:00.000003   34  0.868463  0.594895   
1      1  156636 2016-05-25 12:00:00.000000   34  0.868627  0.591983   
2      2  156637 2016-05-25 12:59:59.999996   34  0.868627  0.588229   
3      3  156638 2016-05-25 14:00:00.000003   34  0.868590  0.588949   
4      4  156639 2016-05-25 15:00:00.000000   34  0.868627  0.591038   

     f_cmd1  lo_c_dp1  f_c_dp1   f_c_dp2   ...       g_t3    g_t4    g_t5  \
0  0.131234    0.2624    0.110  0.419925   ...     0.5456  0.5480  0.5308   
1  0.130570    0.2452    0.034  0.340475   ...     0.5540  0.5492  0.5348   
2  0.130135    0.2760    0.046  0.431250   ...     0.5600  0.5556  0.5408   
3  0.130259    0.2808    0.068  0.346150   ...     0.5748  0.5636  0.5536   
4  0.130476    0.2656    0.120  0.295075   ...     0.5864  0.5704  0.5544   

      lo_t5     lo_t6     lo_t9    v_acc1    v_d_1b    v_d_2b   v_d_3b  
0  0.837826  0.974667  0.905263

In [11]:
def round_to_hour(dt):
    dt_start_of_hour = dt.replace(minute=0, second=0, microsecond=0)
    dt_half_hour = dt.replace(minute=30, second=0, microsecond=0)

    if dt >= dt_half_hour:
        # round up
        dt = dt_start_of_hour + timedelta(hours=1)
    else:
        # round down
        dt = dt_start_of_hour

    return dt

In [16]:
plt.rcParams["figure.figsize"] = (15,35)

for psn in data['psn'].sort_values().unique():
    psn_data = pd.DataFrame(data[(data['psn'] == psn)])
    
    psn_data['iso'] = psn_data['timestamp'].apply(lambda x: x.isocalendar())
    psn_data['week'] = psn_data['timestamp'].apply(lambda x: (x.isocalendar()[0],x.isocalendar()[1]))
   
    complete_days = (psn_data.groupby(by=['iso']).count()['id'] > 21)
    psn_data['complete_day'] = psn_data['iso'].apply(lambda x: complete_days[x])
    psn_data = psn_data[psn_data['complete_day'] == True]
    
    complete_weeks = (psn_data.groupby(by=['week']).count()['id'] >= 160)
    psn_data['complete_week'] = psn_data['week'].apply(lambda x: complete_weeks[x])
    psn_data = psn_data[psn_data['complete_week'] == True]
   
    
    for w in psn_data['week'].unique():  
        weekly_data = psn_data[psn_data['week'] == w]
        weekly_data =  weekly_data.sort_values(by=['timestamp'])
         
        fig = plt.figure(0)
        
        grid_size = (11,4)
        ax_legend = plt.subplot2grid(grid_size,(0,0), colspan=4)
        ax1w = plt.subplot2grid(grid_size, (1, 0), colspan=2)
        ax2w = plt.subplot2grid(grid_size, (1, 2), colspan=2, sharex=ax1w)
        ax1d = plt.subplot2grid(grid_size, (2, 0), colspan=2)
        ax2d = plt.subplot2grid(grid_size, (2, 2), colspan=2, sharex=ax1d)
        
        ax_mon = plt.subplot2grid(grid_size, (3, 0), colspan=2, rowspan=2)
        ax_tue = plt.subplot2grid(grid_size, (3, 2), colspan=2, rowspan=2, sharex=ax_mon, sharey=ax_mon)
        ax_wed = plt.subplot2grid(grid_size, (5, 0), colspan=2, rowspan=2, sharex=ax_mon, sharey=ax_mon)
        ax_thu = plt.subplot2grid(grid_size, (5, 2), colspan=2, rowspan=2, sharex=ax_mon, sharey=ax_mon)
        ax_fri = plt.subplot2grid(grid_size, (7, 0), colspan=2, rowspan=2, sharex=ax_mon, sharey=ax_mon)
        ax_sat = plt.subplot2grid(grid_size, (7, 2), colspan=2, rowspan=2, sharex=ax_mon, sharey=ax_mon)
        ax_sun = plt.subplot2grid(grid_size, (9, 1), colspan=2, rowspan=2, sharex=ax_mon, sharey=ax_mon)
        
        ax1w.set_title("Eigenvector 1 Coefficients by Day & Hour")
        ax2w.set_title("Eigenvector 2 Coefficients by Day & Hour")
        
        ax1w.set_xlabel('Time')
        ax2w.set_xlabel('Time')
        ax1d.set_xlabel('Hour of Day')
        ax2d.set_xlabel('Hour of Day')
        
        ax1w.set_ylabel('Eig 1')
        ax2w.set_ylabel('Eig 2')
        ax1d.set_ylabel('Eig 1')
        ax2d.set_ylabel('Eig 2')
        
        ax_mon.set_xlabel('Eig 1')
        ax_tue.set_xlabel('Eig 1')
        ax_wed.set_xlabel('Eig 1')
        ax_thu.set_xlabel('Eig 1')
        ax_fri.set_xlabel('Eig 1')
        ax_sat.set_xlabel('Eig 1')
        ax_sun.set_xlabel('Eig 1')
        
        ax_mon.set_ylabel('Eig 2')
        ax_tue.set_ylabel('Eig 2')
        ax_wed.set_ylabel('Eig 2')
        ax_thu.set_ylabel('Eig 2')
        ax_fri.set_ylabel('Eig 2')
        ax_sat.set_ylabel('Eig 2')
        ax_sun.set_ylabel('Eig 2')
        
        for d in sorted(weekly_data['iso'].unique()):
            daily_data = weekly_data[weekly_data['iso'] == d]
            idx = daily_data.index
              
            ax1w.plot(daily_data['timestamp'],reduced[idx,0])
            ax2w.plot(daily_data['timestamp'],reduced[idx,1])
    
            ax1d.plot([round_to_hour(x).hour for x in daily_data['timestamp']],reduced[idx,0],label=d)
            ax2d.plot([round_to_hour(x).hour for x in daily_data['timestamp']], reduced[idx,1], label=d)
        
            if d[2] < 2:
                ax_mon.plot(reduced[idx,0], reduced[idx,1], label=d)
            if d[2] < 3:
                ax_tue.plot(reduced[idx,0], reduced[idx,1], label=d)
            if d[2] < 4:
                ax_wed.plot(reduced[idx,0], reduced[idx,1], label=d)
            if d[2] < 5:
                ax_thu.plot(reduced[idx,0], reduced[idx,1], label=d)
            if d[2] < 6:
                ax_fri.plot(reduced[idx,0], reduced[idx,1], label=d)
            if d[2] < 7:
                ax_sat.plot(reduced[idx,0], reduced[idx,1], label=d)
            if d[2] < 8:
                ax_sun.plot(reduced[idx,0], reduced[idx,1], label=d)
        
        ax1w.xaxis.set_major_locator(mdates.HourLocator([0,12]))
        ax1w.xaxis.set_major_formatter(mdates.DateFormatter('%p'))
        
        cmap = matplotlib.rcParams['axes.prop_cycle']
        custom_lines = [
                Line2D([0], [0], color=cmap.by_key()['color'][0], lw=4),
                Line2D([0], [0], color=cmap.by_key()['color'][1], lw=4),
                Line2D([0], [0], color=cmap.by_key()['color'][2], lw=4),
                Line2D([0], [0], color=cmap.by_key()['color'][3], lw=4),
                Line2D([0], [0], color=cmap.by_key()['color'][4], lw=4),
                Line2D([0], [0], color=cmap.by_key()['color'][5], lw=4),
                Line2D([0], [0], color=cmap.by_key()['color'][6], lw=4)
        
        ]

        ax_legend.axis('off')
        ax_legend.legend(custom_lines, ['Monday', 'Tuesday', 'Wednesday','Thursday','Friday','Saturday','Sunday'],loc=8,ncol=7)
      
        plt.tight_layout()
        fig.suptitle('PSN'+str(psn),verticalalignment='center',fontsize=22,y=.96)
        plt.figtext(.5,.94,'For the week of {} to {}'.format(
            weekly_data['timestamp'].iloc[0].strftime("%A, %B %d, %Y"), 
            weekly_data['timestamp'].iloc[-1].strftime("%A, %B %d, %Y")), fontsize=16, ha='center')

        fig.savefig("model{}_psn{}_weekly_movie_{}-{}.png".format(model_number, psn, w[0], w[1]))
        
        plt.close()
    

SystemError: <built-in method write of _io.BufferedWriter object at 0x000001ECA87B9258> returned a result with an error set

In [14]:
plt.rcParams["figure.figsize"] = (15,15)

for psn in data['psn'].sort_values().unique():
    psn_data = data[(data['psn'] == psn)]
    psn_data['iso'] = psn_data['timestamp'].apply(lambda x: x.isocalendar())
    psn_data['week'] = psn_data['timestamp'].apply(lambda x: (x.isocalendar()[0],x.isocalendar()[1]))
   
    complete_days = (psn_data.groupby(by=['iso']).count()['id'] > 22)
    psn_data['complete_day'] = psn_data['iso'].apply(lambda x: complete_days[x])
    psn_data = psn_data[psn_data['complete_day'] == True]
    
    #complete_weeks = (psn_data.groupby(by=['week']).count()['id'] >= 160)
    #psn_data['complete_week'] = psn_data['week'].apply(lambda x: complete_weeks[x])
    #psn_data = psn_data[psn_data['complete_week'] == True]
    fig = plt.figure(0)
        
    grid_size = (2,2)
    ax1 = plt.subplot2grid(grid_size, (0, 0))
    ax2 = plt.subplot2grid(grid_size, (0, 1))
    ax3 = plt.subplot2grid(grid_size, (1, 0))
    ax4 = plt.subplot2grid(grid_size, (1, 1))
   
    for d in psn_data['iso'].unique():
        daily_data = psn_data[psn_data['iso'] == d]
        idx = daily_data.index

        if (psn == 39) & (max(reduced[idx,1]) > 4):
            print(daily_data['timestamp'])
        
        ax1.plot([round_to_hour(x).hour for x in daily_data['timestamp']],reduced[idx,0],alpha=0.1, color='red')
        ax2.plot([round_to_hour(x).hour for x in daily_data['timestamp']],reduced[idx,1],alpha=0.1, color='red')
        ax3.plot([round_to_hour(x).hour for x in daily_data['timestamp']],reduced[idx,2],alpha=0.1, color='red')
        ax4.plot([round_to_hour(x).hour for x in daily_data['timestamp']],reduced[idx,3],alpha=0.1, color='red')
          
          
    ax1.set_title("Eigenvector 1")
    ax2.set_title("Eigenvector 2")
    ax3.set_title("Eigenvector 3")
    ax4.set_title("Eigenvector 4")
    
    ax1.set_xlabel('Hour of Day')
    ax2.set_xlabel('Hour of Day')
    ax3.set_xlabel('Hour of Day')
    ax4.set_xlabel('Hour of Day')

    
    fig.suptitle('PSN'+str(psn),verticalalignment='center',fontsize=16,y=.93)
    #fig.autofmt_xdate()
    fig.savefig("model{}_psn{}_eig_24hours.png".format(model_number, psn))

    plt.close()
 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


157110   2017-09-09 00:00:00.000000
157111   2017-09-09 00:59:59.999996
157112   2017-09-09 02:00:00.000003
157113   2017-09-09 03:00:00.000000
157114   2017-09-09 03:59:59.999996
157115   2017-09-09 05:00:00.000003
157116   2017-09-09 06:00:00.000000
157117   2017-09-09 06:59:59.999996
157118   2017-09-09 08:00:00.000003
157119   2017-09-09 09:00:00.000000
157120   2017-09-09 09:59:59.999996
157121   2017-09-09 11:00:00.000003
157122   2017-09-09 12:00:00.000000
157123   2017-09-09 12:59:59.999996
157124   2017-09-09 14:00:00.000003
157125   2017-09-09 15:00:00.000000
157126   2017-09-09 15:59:59.999996
157127   2017-09-09 17:00:00.000003
157128   2017-09-09 18:00:00.000000
157129   2017-09-09 18:59:59.999996
157130   2017-09-09 20:00:00.000003
157131   2017-09-09 21:00:00.000000
157132   2017-09-09 21:59:59.999996
157133   2017-09-09 23:00:00.000003
Name: timestamp, dtype: datetime64[ns]
