In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, SpectralClustering
from sklearn import metrics

from sqlalchemy import create_engine
from datetime import datetime, timedelta
import matplotlib.dates as mdates
from matplotlib.lines import Line2D
import matplotlib.cm as cm
import matplotlib
from TurbineTimeSeries.storage import MachineDataStore
#from TurbineTimeSeries.transformations import PCA, StandardScaler, DropCols, DropSparseCols, JoinDataFrames

In [None]:
model_number = 2

store = MachineDataStore('.config')


def _load_model_data(sample_freq):
    """Run the notebook's standard query against `store` for one sampling frequency.

    Builds a query for `model_number` at `sample_freq`, applies the
    `not_null` filter on 'timestamp' and 'psn', applies `exclude_psn` for
    PSNs 44, 52, 54 and 70, and returns whatever `execute()` produces.
    """
    query = store.query(model_number, sample_freq)
    query = query.not_null(['timestamp','psn'])
    query = query.exclude_psn([44,52,54,70])
    return query.execute()


# Both resolutions are loaded up front; the 10-minute frame is previewed below.
model_data_hr = _load_model_data('1hr')
model_data_min = _load_model_data('10min')

model_data_min.head()

In [None]:
# Sampling interval analysed by the rest of the notebook. Later cells branch
# on `freq` (e.g. to decide how many samples make a complete day), so the
# frame is looked up by `freq` instead of being chosen separately; the label
# and the data can therefore never disagree. An unsupported value raises
# KeyError here rather than silently mis-filtering downstream.
freq = '10min'
model_data = {'1hr': model_data_hr, '10min': model_data_min}[freq]
len(model_data)

In [None]:
# Columns that identify a row rather than describe it.
index_cols = ['id','timestamp','psn']
# Columns deliberately left out of the analysis.
skipped_cols = ['sum_esn','sum_eng_st', 'sum_eng_h']

# Everything else, in the frame's own column order, is treated as a data column.
non_data_cols = set(index_cols) | set(skipped_cols)
data_cols = [col for col in model_data.columns if col not in non_data_cols]

In [None]:
# Null count per column, sorted ascending.
missing_values = model_data.isnull().sum().sort_values()

# A column is "sparse" when more than 30000 of its values are missing.
too_sparse = missing_values > 30000
sparse_cols = missing_values[too_sparse].index.tolist()

# Data columns that survive the sparsity cut, in their original order.
sparse_lookup = set(sparse_cols)
clean_data_cols = [col for col in data_cols if col not in sparse_lookup]

In [None]:
# Keep only fully populated rows. reset_index() renumbers them 0..n-1, so a
# row's index label in `data` is also its row position in `clean_data` and
# `reduced`; later cells rely on this when they index `reduced` with `.index`.
selected_cols = index_cols + clean_data_cols
data = model_data[selected_cols].dropna().reset_index()

# Standardise the data columns, then fit PCA with all components retained.
scaler = StandardScaler()
clean_data = scaler.fit_transform(data[clean_data_cols])

pca = PCA()
pca.fit(clean_data)
reduced = pca.transform(clean_data)
print(data.head())

In [None]:
def round_to_hour(dt):
    """Round a datetime to the nearest whole hour.

    Times strictly before the half-hour mark go down to the start of the
    current hour; times at or after HH:30:00 go up to the start of the next
    hour.
    """
    hour_floor = dt.replace(minute=0, second=0, microsecond=0)
    half_past = dt.replace(minute=30, second=0, microsecond=0)

    if dt < half_past:
        return hour_floor

    return hour_floor + timedelta(hours=1)

In [None]:
# One figure per (PSN, ISO week): the first two PCA components plotted across
# the week, overlaid by time of day, and against each other. Only weeks made
# up entirely of "complete" days are drawn. Each figure is saved as a PNG.
plt.rcParams["figure.figsize"] = (15,35)

# Samples a calendar day must contain to be kept: 22 for hourly data, 144 for
# 10-minute data, and 0 (keep everything) for any other `freq`. Computed once
# because it does not depend on the PSN.
min_count = 22 if freq == '1hr' else 144 if freq == '10min' else 0

weekday_names = ['Monday', 'Tuesday', 'Wednesday','Thursday','Friday','Saturday','Sunday']

for psn in data['psn'].sort_values().unique():
    # Explicit copy so the helper columns below are never written to a slice of `data`.
    psn_data = data[data['psn'] == psn].copy()

    # 'iso' is the (year, week, weekday) triple; 'week' is its (year, week) prefix.
    psn_data['iso'] = psn_data['timestamp'].apply(lambda x: x.isocalendar())
    psn_data['week'] = psn_data['iso'].apply(lambda iso: (iso[0], iso[1]))

    # Drop days with fewer than `min_count` rows ...
    rows_per_day = psn_data.groupby('iso')['id'].transform('count')
    psn_data = psn_data[rows_per_day >= min_count].copy()

    # ... then drop weeks with fewer than seven days' worth of rows.
    if not psn_data.empty:
        rows_per_week = psn_data.groupby('week')['id'].transform('count')
        psn_data = psn_data[rows_per_week >= min_count*7].copy()

    print(len(psn_data))

    # sort=False keeps the weeks in the order they first appear in the data.
    for _, weekly_data in psn_data.groupby('week', sort=False):
        weekly_data = weekly_data.sort_values(by=['timestamp'])
        w = weekly_data['week'].iloc[0]

        fig = plt.figure(0)

        grid_size = (4,4)
        ax_legend = plt.subplot2grid(grid_size,(0,0), colspan=4)
        ax1w = plt.subplot2grid(grid_size, (1, 0), colspan=2)
        ax2w = plt.subplot2grid(grid_size, (1, 2), colspan=2, sharex=ax1w)
        ax1d = plt.subplot2grid(grid_size, (2, 0), colspan=2)
        ax2d = plt.subplot2grid(grid_size, (2, 2), colspan=2, sharex=ax1d)

        # NOTE(review): rowspan=2 starting at row 3 runs past the 4-row grid
        # declared above - confirm the intended layout.
        ax_mon = plt.subplot2grid(grid_size, (3, 0), colspan=4, rowspan=2)

        ax1w.set_title("Eigenvector 1 Coefficients by Day & Hour")
        ax2w.set_title("Eigenvector 2 Coefficients by Day & Hour")

        ax1w.set_xlabel('Time')
        ax2w.set_xlabel('Time')
        ax1d.set_xlabel('Hour of Day')
        ax2d.set_xlabel('Hour of Day')

        ax1w.set_ylabel('Eig 1')
        ax2w.set_ylabel('Eig 2')
        ax1d.set_ylabel('Eig 1')
        ax2d.set_ylabel('Eig 2')

        ax_mon.set_xlabel('Eig 1')
        ax_mon.set_ylabel('Eig 2')

        # Days are visited in sorted 'iso' order, so each axes draws its
        # lines in weekday order and the n-th line takes the n-th cycle colour.
        for d, daily_data in weekly_data.groupby('iso'):
            daily_data = daily_data.sort_values(by='timestamp')
            # Index labels of `data` are used as row positions in `reduced`.
            idx = daily_data.index
            time_of_day = [x.time() for x in daily_data['timestamp']]

            ax1w.plot(daily_data['timestamp'],reduced[idx,0])
            ax2w.plot(daily_data['timestamp'],reduced[idx,1])

            ax1d.plot(time_of_day, reduced[idx,0], label=d)
            ax2d.plot(time_of_day, reduced[idx,1], label=d)

            ax_mon.plot(reduced[idx,0], reduced[idx,1], label=d)

        ax1w.xaxis.set_major_locator(mdates.HourLocator([0,12]))
        ax1w.xaxis.set_major_formatter(mdates.DateFormatter('%p'))

        # Legend handles use the first seven colours of the active colour
        # cycle, matching the per-day lines drawn above.
        cycle_colors = matplotlib.rcParams['axes.prop_cycle'].by_key()['color']
        custom_lines = [Line2D([0], [0], color=c, lw=4) for c in cycle_colors[:7]]

        ax_legend.axis('off')
        ax_legend.legend(custom_lines, weekday_names, loc=8, ncol=7)

        plt.tight_layout()
        fig.suptitle('PSN'+str(psn),verticalalignment='center',fontsize=22,y=.96)
        plt.figtext(.5,.94,'For the week of {} to {}'.format(
            weekly_data['timestamp'].iloc[0].strftime("%A, %B %d, %Y"),
            weekly_data['timestamp'].iloc[-1].strftime("%A, %B %d, %Y")), fontsize=16, ha='center')

        fig.savefig("model{}_psn{}_{}_weekly_movie_{}-{}.png".format(model_number, psn, freq, w[0], w[1]))

        # Close this exact figure so the next week starts from a blank figure 0.
        plt.close(fig)
    

In [None]:
# Collect one vector per (PSN, complete day): the first PCA component ordered
# by time of day. The vectors are accumulated in `to_be_clustered`.
plt.rcParams["figure.figsize"] = (15,15)
to_be_clustered = []

for psn in data['psn'].sort_values().unique():
    # Explicit copy so the helper columns below are never written to a slice of `data`.
    psn_data = data[data['psn'] == psn].copy()
    psn_data['iso'] = psn_data['timestamp'].apply(lambda x: x.isocalendar())
    psn_data['time'] = psn_data['timestamp'].apply(lambda x: x.time())

    # Keep only days with exactly 144 rows, so every collected vector has
    # the same length.
    # NOTE(review): 144 is hard-coded here and not derived from `freq` -
    # confirm this cell is only meant for 10-minute data.
    rows_per_day = psn_data.groupby('iso')['id'].transform('count')
    psn_data = psn_data[rows_per_day == 144]

    # sort=False keeps the days in the order they first appear in the data.
    for _, daily_data in psn_data.groupby('iso', sort=False):
        daily_data = daily_data.sort_values(by='time')
        # Index labels of `data` are used as row positions in `reduced`.
        idx = daily_data.index

        # Print the timestamps of PSN 39 days whose second component exceeds 4.
        if psn == 39 and max(reduced[idx,1]) > 4:
            print(daily_data['timestamp'])

        to_be_clustered.append([x for x in reduced[idx,0]])
 

In [None]:
# One 2x2 figure per PSN showing the first four PCA components over time,
# drawn week by week as faint red lines, saved as a PNG.
plt.rcParams["figure.figsize"] = (15,15)

for psn in data['psn'].sort_values().unique():
    # Explicit copy so the helper columns below are never written to a slice of `data`.
    psn_data = data[data['psn'] == psn].copy()
    psn_data['iso'] = psn_data['timestamp'].apply(lambda x: x.isocalendar())
    psn_data['week'] = psn_data['iso'].apply(lambda iso: (iso[0], iso[1]))

    # Keep only days with more than 22 rows.
    # NOTE(review): this threshold is not derived from `freq`, unlike the
    # weekly-movie cell - confirm it is intended for the data currently loaded.
    rows_per_day = psn_data.groupby('iso')['id'].transform('count')
    psn_data = psn_data[rows_per_day > 22]

    fig = plt.figure(0)

    # Axes in row-major order: position n shows PCA component n (0-based).
    grid_size = (2,2)
    component_axes = [
        plt.subplot2grid(grid_size, (r, c))
        for r in range(grid_size[0])
        for c in range(grid_size[1])
    ]

    # sort=False keeps the weeks in the order they first appear in the data.
    for _, weekly_data in psn_data.groupby('week', sort=False):
        # Index labels of `data` are used as row positions in `reduced`.
        idx = weekly_data.index

        for component, ax in enumerate(component_axes):
            ax.plot(weekly_data['timestamp'], reduced[idx,component], alpha=0.1, color='red')

    for number, ax in enumerate(component_axes, start=1):
        ax.set_title("Eigenvector {}".format(number))
        # NOTE(review): the x values plotted above are full timestamps, not
        # hours of the day - confirm this label.
        ax.set_xlabel('Hour of Day')

    fig.suptitle('PSN'+str(psn),verticalalignment='center',fontsize=16,y=.93)
    fig.savefig("model{}_psn{}_eig_7days.png".format(model_number, psn))

    # Close this exact figure so the next PSN starts from a blank figure 0.
    plt.close(fig)

In [None]:
# Cluster the vectors in `to_be_clustered` with KMeans, show how many vectors
# fall in each cluster, and save a grid with one small panel per cluster
# showing every member vector.
n_clusters=225

samples = np.asarray(to_be_clustered)

# Fixed seed so that re-running the notebook yields the same clustering.
cluster = KMeans(n_clusters=n_clusters, random_state=0)
cluster.fit(samples)

plt.rcParams["figure.figsize"] = (15,15)
pd.DataFrame(cluster.labels_).hist(bins=n_clusters)
plt.show()

# Smallest square grid that can hold one panel per cluster (15 x 15 for 225).
grid_dim = int(np.ceil(np.sqrt(n_clusters)))

cluster_means = []
fig = plt.figure(0)
shared_ax = None

for i in range(n_clusters):
    # Rows of `samples` assigned to cluster i.
    members = samples[cluster.labels_ == i]
    cluster_means.append(pd.DataFrame(members).mean())

    row, col = divmod(i, grid_dim)
    ax = plt.subplot2grid((grid_dim, grid_dim), (row, col), sharey=shared_ax)
    # Transposed so that each member vector is drawn as its own line.
    ax.plot(members.T, alpha=0.2, color='red')

    # Every panel shares its y-axis with the first one.
    if shared_ax is None:
        shared_ax = ax

    ax.axis('off')

fig.savefig("model{}_kmeans_{}_cluster_grid.png".format(model_number, n_clusters))
plt.close(fig)

In [None]:
# Sweep the number of clusters and plot three quality metrics against it.
# The metric lists are built in this cell, so it runs on a fresh kernel and
# does not depend on names that no earlier cell defines.
k_values = range(10,200,10)

sweep_samples = np.asarray(to_be_clustered)

# scikit-learn renamed `calinski_harabaz_score` to `calinski_harabasz_score`;
# use whichever spelling the installed version provides.
calinski_score = (getattr(metrics, 'calinski_harabasz_score', None)
                  or getattr(metrics, 'calinski_harabaz_score'))

inertias = []
silhouette_scores = []
calinski_harabaz_scores = []

for k in k_values:
    # Fixed seed so that the curves are reproducible between runs.
    sweep_model = KMeans(n_clusters=k, random_state=0).fit(sweep_samples)
    inertias.append(sweep_model.inertia_)
    silhouette_scores.append(
        metrics.silhouette_score(sweep_samples, sweep_model.labels_, metric='euclidean'))
    calinski_harabaz_scores.append(calinski_score(sweep_samples, sweep_model.labels_))

# One figure per metric, in the same order as before.
for metric_name, metric_values in [('Inertia', inertias),
                                   ('Silhouette score', silhouette_scores),
                                   ('Calinski-Harabasz score', calinski_harabaz_scores)]:
    fig = plt.figure(0)
    ax = plt.subplot2grid((1,1), (0, 0))
    ax.plot(k_values, metric_values)
    ax.set_xlabel('Number of clusters')
    ax.set_ylabel(metric_name)
    plt.show()