In [1]:
%%capture
# Execute the upstream transform notebook inside this kernel; %%capture
# silences whatever that notebook prints or displays.
# NOTE(review): the cells below use scf_df, datetime and np without defining
# them here -- presumably they are created by 01_transform.ipynb; confirm.
%run 01_transform.ipynb

In [2]:
# SET YOUR OWN FREQUENCY FOR TIME SERIES ANALYSIS

# 'frequency' is the pandas offset alias handed to every resample() call below.
# try 'W-SAT' for Sun-Sat, 'M' for month end, 'D' for calendar days, etc
# full options at http://pandas.pydata.org/pandas-docs/stable/timeseries.html#anchored-offsets
config = dict(frequency='W-SAT')

# rows x columns of scf_df before this notebook's date filter is applied
print(scf_df.shape)

(16438, 52)


In [3]:
# FILTER SCF_DF FOR ISSUES WITHIN CONFIGURABLE DATE RANGE
# optional, this just cleanly aligns our data with our frequency (eg start on a Sun, end on a Sat)

# convert object into datetime
def makeDate(obj, fmt='%m-%d-%Y'):
    """Parse a date string into a ``datetime``.

    Parameters
    ----------
    obj : str
        Date text to parse, e.g. ``'07-02-2017'``.
    fmt : str, optional
        ``strptime`` format describing ``obj``. Defaults to month-day-year
        (``'%m-%d-%Y'``), the only layout this helper accepted before the
        parameter existed, so existing one-argument calls are unaffected.

    Returns
    -------
    datetime
        The parsed value.

    Raises
    ------
    ValueError
        If ``obj`` does not match ``fmt``.
    TypeError
        If ``obj`` is not a string (for example a missing value).
    """
    return datetime.strptime(obj, fmt)

# store datetime values
# The range bounds are read from config so they can be changed without editing
# this cell: set config['start_date'] / config['end_date'] to 'MM-DD-YYYY'
# strings. When those keys are absent the original range is used
# (07-02-2017 is a Sunday and 08-12-2017 a Saturday, which lines up with 'W-SAT').
range_start = makeDate(config.get('start_date', '07-02-2017'))
range_end = makeDate(config.get('end_date', '08-12-2017'))
created_dates = scf_df['created_at_date'].apply(makeDate)

# do datetime comparisons; each result is a boolean Series aligned with scf_df's rows
# filter for: range_start <= created_at_date <= range_end (both ends inclusive)
on_or_after_start = range_start <= created_dates
on_or_before_end = created_dates <= range_end

# keep only the issues inside the range; this rebinds scf_df, so every
# later cell works on the filtered frame
scf_df = scf_df.loc[on_or_after_start & on_or_before_end]

# check that we successfully filtered rows
print(scf_df.shape)

(7514, 52)


In [4]:
# RESAMPLE AND COUNT

# Take the timestamp/id columns whose non-null entries we want to tally, label
# each with the name of the count it becomes, and index the rows by time_index
# (kept as a regular column as well) so the frame can be resampled.
filtered_df = (
    scf_df[['time_index', 'created_at', 'closed_at', 'reopened_at', 'canonical_issue_id']]
    .rename(columns={
        'created_at': 'total_issues_created',
        'closed_at': 'num_closed',
        'reopened_at': 'num_reopened',
        'canonical_issue_id': 'num_canonical',
    })
    .set_index('time_index', drop=False)
)

# one row per period of config['frequency']; each cell is the number of
# non-null values that fell into that period
summary_df = filtered_df.resample(config['frequency']).count()
summary_df

Unnamed: 0_level_0,time_index,total_issues_created,num_closed,num_reopened,num_canonical
time_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-04-01,1033,1033,794,8,47
2017-04-08,1256,1256,990,14,43
2017-04-15,1016,1016,839,19,41
2017-04-22,1126,1126,955,13,60
2017-04-29,994,994,833,21,39
2017-05-06,955,955,703,26,48
2017-05-13,1134,1134,444,35,12


In [5]:
# RESAMPLE, GROUP BY TYPE, AND AGGREGATE

# pluck out just the columns we care about
filtered_df = scf_df[['time_index', 'request_type_title', 'created_at', 'closed_at', 'days_create_to_close', 'within_sla_bool', 'beyond_jurisdiction']]
filtered_df.columns = ['time_index', 'issue_type', 'total_issues_created', 'num_closed', 'med_days_to_close', 'num_within_sla', 'num_out_jurisdiction']

# set col index
filtered_df.index = filtered_df['time_index']

# resample the dataset by time, group by request type and aggregate/do math on other cols
# Aggregations are named by string alias rather than passed as NumPy functions:
#   * pandas currently swaps np.sum / np.median for its own NaN-skipping
#     'sum' / 'median' and warns that it will stop doing so; the aliases keep
#     today's results (a raw np.median would turn any period containing a
#     still-open issue into NaN).
#   * 'count' tallies non-null created_at values, the same way the other
#     summary cells in this notebook compute total_issues_created.
metrics_df = filtered_df.groupby('issue_type').resample(config['frequency']).agg({
    'total_issues_created': 'count',
    'med_days_to_close': 'median',
    'num_within_sla': 'sum',
    'num_out_jurisdiction': 'sum',
})
metrics_df

Unnamed: 0_level_0,Unnamed: 1_level_0,total_issues_created,med_days_to_close,num_within_sla,num_out_jurisdiction
issue_type,time_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abandoned Vehicle,2017-04-01,67,3.904387,38.0,1.0
Abandoned Vehicle,2017-04-08,122,4.965035,65.0,1.0
Abandoned Vehicle,2017-04-15,72,2.961985,52.0,2.0
Abandoned Vehicle,2017-04-22,82,2.978056,59.0,3.0
Abandoned Vehicle,2017-04-29,89,2.962998,65.0,3.0
Abandoned Vehicle,2017-05-06,63,2.940000,41.0,0.0
Abandoned Vehicle,2017-05-13,89,1.828634,33.0,0.0
Blocked Catch Basin,2017-04-01,225,26.538461,0.0,0.0
Blocked Catch Basin,2017-04-08,203,10.090324,0.0,0.0
Blocked Catch Basin,2017-04-15,115,15.740683,0.0,0.0


In [6]:
# export dataframe
# The destination folder is read from config['export_dir'] so the notebook can
# run on other machines; when the key is absent the original location is used
# and the file lands exactly where it did before.
# NOTE(review): the file name says "monthly" although the data is resampled at
# config['frequency'] ('W-SAT' unless changed); name kept so existing
# consumers of the file still find it.
export_dir = config.get('export_dir', '/home/jessica/Desktop').rstrip('/')
metrics_df.transpose().to_csv(export_dir + '/metrics_monthly.csv')

In [7]:
# Build the per-issue-type counting frame: select the columns of interest,
# label each with the name of the tally it becomes, and index the rows by
# time_index (kept as a regular column as well) so they can be resampled.
another_df = (
    scf_df[['time_index', 'request_type_title', 'created_at', 'closed_at', 'reopened_at', 'canonical_issue_id']]
    .rename(columns={
        'request_type_title': 'issue_type',
        'created_at': 'total_issues_created',
        'closed_at': 'num_closed',
        'reopened_at': 'num_reopened',
        'canonical_issue_id': 'num_canonical',
    })
    .set_index('time_index', drop=False)
)

# group by request type, resample the dataset by time, and count the non-null
# values of every column in each (issue_type, period) bucket
per_type = another_df.groupby('issue_type')
more_metrics_df = per_type.resample(config['frequency']).count()
more_metrics_df

Unnamed: 0_level_0,Unnamed: 1_level_0,time_index,issue_type,total_issues_created,num_closed,num_reopened,num_canonical
issue_type,time_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Abandoned Vehicle,2017-04-01,67,67,67,66,0,1
Abandoned Vehicle,2017-04-08,122,122,122,121,3,1
Abandoned Vehicle,2017-04-15,72,72,72,72,1,0
Abandoned Vehicle,2017-04-22,82,82,82,79,1,3
Abandoned Vehicle,2017-04-29,89,89,89,86,3,2
Abandoned Vehicle,2017-05-06,63,63,63,59,0,0
Abandoned Vehicle,2017-05-13,89,89,89,33,1,0
Blocked Catch Basin,2017-04-01,225,225,225,34,0,29
Blocked Catch Basin,2017-04-08,203,203,203,32,0,15
Blocked Catch Basin,2017-04-15,115,115,115,16,1,5


In [8]:
# export results
# The destination folder is read from config['export_dir'] so the notebook can
# run on other machines; when the key is absent the original location is used
# and the file lands exactly where it did before.
# NOTE(review): the file name says "monthly" although the data is resampled at
# config['frequency'] ('W-SAT' unless changed); name kept so existing
# consumers of the file still find it.
export_dir = config.get('export_dir', '/home/jessica/Desktop').rstrip('/')
more_metrics_df.transpose().to_csv(export_dir + '/metrics_monthly_2.csv')