In [1]:
#Import packages of interest
import pandas as pd
import numpy as np
import datetime
pd.options.display.max_columns=300
pd.options.display.max_rows=100

## Reading in the data

In [8]:
model2_10min = pd.read_csv('../../../data/raw_data_model2_10min.csv')

## convert timestamp column to timestamps
model2_10min['timestamp'] = model2_10min['timestamp'].apply(lambda x: pd.Timestamp(x))
print('Shape of model2 10min data: ', model2_10min.shape)

Shape of model1 10min data:  (1501927, 146)
Shape of model2 10min data:  (1602326, 76)


In [9]:
## sort by psn and timestamp
model2_10min = model2_10min.sort_values(by=['psn','timestamp'],ascending=[True,True])

In [17]:
class gap_analyzer(object):
    """
    Analyzes gaps in the raw data.
    Class functions assume dataframe has a 'psn' column and 'timestamp' column. 
    class functions don't look for gaps in 'psn' column since that col was defined by us and should be there
    wherever there is any other data.
    
    
    """
    
    def __init__(self,df,time_resolution):
        self.rawdf = df
        self.psn_col = 'psn'
        self.timestamp_col = 'timestamp'
        self.time_res = time_resolution
        self.psns = None
        self.total_timestamps = None
        self.gap_summary = None
        self.counts = None
        
        
    def _analysis_preprocess(self):
        try:
            self.rawdf = self.rawdf.sort_values(by=self.psn_col,ascending=True)
        except:
            raise Exception('Psn column not detected. Please designate a column using class.psn_col')
        try:
            self.rawdf = self.rawdf.sort_values(by=self.timestamp_col,ascending=True)
        except:
            raise Exception('Timestamp column not detected. Please designate a column using class.timestamp_col')

    def _calculate_total_timestamps(self):
        
        total_timestamps = pd.DataFrame(index = self.psns, columns = ['num_timestamps'])
        if self.time_res in ['10min','10 min','10m']:
            for j in self.psns:
                subset_df = self.rawdf[self.rawdf[self.psn_col]==j]
                timerange = subset_df.iloc[-1][self.timestamp_col] - subset_df.iloc[0][self.timestamp_col]
                total_timestamps.at[j,'num_timestamps'] = timerange.days*144 + int(timerange.seconds/600)+1
                
        if self.time_res in ['1hr','1 hr','1 hour']:
            for j in self.psns:
                subset_df = self.rawdf[self.rawdf[self.psn_col]==j]
                timerange = subset_df.iloc[-1][self.timestamp_col] - subset_df.iloc[0][self.timestamp_col]
                total_timestamps.at[j,'num_timestamps'] = timerange.days*24 + int(timerange.seconds/3600)+1
        
        return(total_timestamps)
        
    def analyze_gap(self):
        ### dataframe inputted needs valid timestamp col and psn column. returns psns as the index.
        
        self._analysis_preprocess()
#         def gap_analysis(df,timestampcol,data_res):
        ## feed this a whole dataframe. make sure psns are in the "psn" column. 
        self.psns = self.rawdf[self.psn_col].unique()
        self.gap_summary = pd.DataFrame(columns=self.rawdf.columns.values)
        self.total_timestamps = self._calculate_total_timestamps()
        self.counts = self.rawdf.groupby(self.psn_col).count()
        
        for j in self.psns:
#             subset_df = self.rawdf[self.rawdf[self.psn_col]==j]
            percent_data = self.counts.loc[j]/self.total_timestamps.loc[j]['num_timestamps']
#             percent_data.name = int(j)
            self.gap_summary = self.gap_summary.append(percent_data)
        return(self.gap_summary)

        
    def availability_by_package(self):
        if isinstance(self.gap_summary,pd.DataFrame) == False:
            self.analyze_gap()
        avail_bypsn = pd.DataFrame(columns = ['Average percent available','total possible timestamps'])
        for psn in self.psns:
            avail_bypsn.at[psn,'Average percent available'] = self.gap_summary.loc[psn].mean()
            avail_bypsn.at[psn,'Avg available datapoints'] = self.counts.loc[psn].mean()
            avail_bypsn.at[psn,'total possible timestamps'] = self.total_timestamps.loc[psn]['num_timestamps']
            
        return(avail_bypsn.sort_values(by='Average percent available',ascending=False))

    def availability_by_column(self):
        if isinstance(self.gap_summary,pd.DataFrame) == False:
            self.analyze_gap()
        avail_bycol = pd.DataFrame(columns = ['Average percent available','Median percent available','Min percent available','Max percent available'])
        for col in self.gap_summary.columns.values:
            avail_bycol.at[col,'Average percent available'] = self.gap_summary[col].mean()
            avail_bycol.at[col,'Median percent available'] = self.gap_summary[col].median()
            avail_bycol.at[col,'Min percent available'] = self.gap_summary[col].min()
            avail_bycol.at[col,'Max percent available'] = self.gap_summary[col].max()
        return (avail_bycol.sort_values(by='Average percent available',ascending=False))
        

## Analyze the availability of data

In [97]:
GA = gap_analyzer(model2_10min,time_resolution='10min')

### Percent of data available per package###

this is calculated by 

\begin{equation*}
\frac{data\ points\ available}{data\ points\ expected} 
\end{equation*}

where data points expected is calculated by 

\begin{equation*}
 time\ resolution * (last\ timestamp - first\ timestamp) 
\end{equation*}

In [98]:
GA.availability_by_package().sort_index()

Unnamed: 0,Average percent available,total possible timestamps,Avg available datapoints
34,0.980753,105263,103237.0
35,0.686635,103713,71213.0
36,0.677998,103692,70303.0
37,0.859407,76405,65663.0
38,0.955342,31439,30035.0
39,0.961127,23255,22351.0
40,0.848596,16532,14028.986667
41,0.93685,16532,15488.0
42,0.910359,104907,95503.0
45,0.647112,75344,48756.0


In [99]:
GA.availability_by_column().head()

Unnamed: 0,Average percent available,Median percent available,Min percent available,Max percent available
id,0.782368,0.823327,0.296818,0.996257
perf_pow,0.782368,0.823327,0.296818,0.996257
sum_esn,0.782368,0.823327,0.296818,0.996257
pcd,0.782368,0.823327,0.296818,0.996257
ngp,0.782368,0.823327,0.296818,0.996257


In [101]:
GA.gap_summary

Unnamed: 0,id,lo_c_dp1,f_c_dp1,f_c_dp2,f_c_dp5,pe_c_dt1,g_c_dt1,g_c_dt2,lo_c_dt5,c_dt5_1,c_dt5_2,c_dt5_3,c_dt5_4,c_dt5_5,c_dt5_6,sc_c_pct_e1,pe_c_pos_e1,f_c_pos_e2,f_c_pos_e1,c_c_t5_1,c_c_t5_2,c_c_t5_3,t5_s1,pe_p1,f_cmd1,pe_t1,perf_pow,sum_eng_h,lo_p1,b_p1,g_t1,t5_6,g_t5,g_t4,v_d_1b,g_cur1,g_cur4,g_cur3,lo_t9,f_cmd2,lo_t6,t5_5,t5_2,t5_3,t5_1,ngp,nt5,pcd,v_acc1,sc_pct2,g_pow1,pe_for1,sc_pct1,pe_cmd1,pe_pos1,g_cur2,t1_1,sum_esn,t5_4,f_p2,t5_a,sum_eng_st,lo_dp1,v_d_3b,f_p7,f_p1,v_d_2b,lo_t5,g_t2,f_t1,sum_enr,g_t3,g_pct1,f_pos1,timestamp,psn
34,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,0.980753,
48,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,0.883672,
49,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,0.850859,
42,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,0.910359,
35,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,0.686635,
36,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,0.677998,
68,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,0.87103,
55,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,0.820712,
56,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,0.855218,
72,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,0.709555,
