# Which factories are responsible for which chemical releases? Carefully describe how you determined this using all the data you have available. For the factories you identified, describe any observed patterns of operation revealed in the data.

In [50]:
# import libraries
import numpy as np
import pandas as pd
from pandasql import sqldf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pprint
from tabulate import tabulate
import operator
sns.set_style("darkgrid")

In [2]:
# Load the four prepared CSV files directly from the project's GitHub repository.
# NOTE(review): the sensor file is fetched from the `master` branch, whereas the
# other three URLs are pinned to specific commit hashes -- confirm whether the
# sensor URL should be pinned too so that re-runs read the same data.
sensor = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/master/data/prepared_data/clean_data/clean_sensor_data.csv'
)
weather = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/ef9ac7bceb5cde6fd932cb5c0018ca90329a9d3c/data/prepared_data/clean_data/clean_meteorological_data.csv'
)
locations = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/ef9ac7bceb5cde6fd932cb5c0018ca90329a9d3c/data/prepared_data/clean_data/locations.csv'
)
chem_counts = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/a9b08f3c0f6fdd6e72bdd1465f3c6177b687d332/data/prepared_data/clean_data/chem_counts.csv'
)


def psql(q):
    """Run the SQL string `q` through pandasql's sqldf, passing this notebook's globals() as the environment."""
    return sqldf(q, globals())

In [3]:
# Drop every reading reported by monitor 8, which is suspected of being faulty.
sensor = sensor[sensor['Monitor'].ne(8)]

In [4]:
# Determine, for each chemical, the reading level above which an observation
# counts as a significant spike: mean + SPIKE_SIGMAS standard deviations of
# that chemical's readings.
SPIKE_SIGMAS = 3  # how many standard deviations above the mean marks a spike

chems = list(sensor.Chemical.unique())

# One sub-frame of sensor rows per chemical, keyed by the chemical value itself
# so that lookups such as chem_dfs[chemical] use exactly the key that was stored.
chem_dfs = {chemical: sensor.loc[sensor.Chemical == chemical] for chemical in chems}

# np.std is called with its default ddof=0 (population standard deviation);
# each cutoff is rounded to 4 decimal places.
cutoff_values = {
    chemical: round(
        np.mean(readings.Reading) + (SPIKE_SIGMAS * np.std(readings.Reading)), 4
    )
    for chemical, readings in chem_dfs.items()
}

pprint.pprint(cutoff_values)

{'AGOC-3A': 10.6913,
 'Appluimonia': 2.699,
 'Chlorodinine': 3.3325,
 'Methylosmolene': 8.7986}


In [5]:
# For each chemical, keep only the rows whose Reading is strictly greater than
# that chemical's cutoff. Results are stored under the key '<chemical>_outliers'.
spikes = {
    '{}_outliers'.format(chemical): chem_dfs[chemical].loc[
        chem_dfs[chemical].Reading > cutoff_values[chemical]
    ]
    for chemical in chems
}

In [32]:
# Let's see, per chemical, how many significant readings each sensor recorded,
# printed as one table per chemical with the busiest sensor first.
for outlier_rows in spikes.values():
    counts_by_sensor = (
        outlier_rows[['Monitor', 'Chemical', 'Reading']]
        .groupby(by=['Chemical', 'Monitor'])
        .count()
        .sort_values(by='Reading', ascending=False)
    )
    table_text = tabulate(
        counts_by_sensor,
        headers=['Chemical, Sensor', 'Number of Significant Readings'],
        tablefmt='psql',
    )
    print(table_text)
    print()

+-----------------------+----------------------------------+
| Chemical, Sensor      |   Number of Significant Readings |
|-----------------------+----------------------------------|
| ('Methylosmolene', 6) |                               26 |
| ('Methylosmolene', 3) |                               16 |
| ('Methylosmolene', 4) |                               10 |
| ('Methylosmolene', 5) |                               10 |
| ('Methylosmolene', 7) |                               10 |
| ('Methylosmolene', 9) |                                5 |
| ('Methylosmolene', 2) |                                3 |
| ('Methylosmolene', 1) |                                1 |
+-----------------------+----------------------------------+

+---------------------+----------------------------------+
| Chemical, Sensor    |   Number of Significant Readings |
|---------------------+----------------------------------|
| ('Chlorodinine', 3) |                              111 |
| ('Chlorodinine', 6) |        

In [None]:
# let's find some of the instances that drive the above tables, identify specific perpetrators, and track their origin with wind data

In [33]:
# For every outlier frame, keep its 10 rows with the highest Reading,
# stored under the key 'top_<original key>'.
top_10s = {
    'top_{}'.format(outlier_key): outlier_rows.sort_values(by='Reading', ascending=False).head(10)
    for outlier_key, outlier_rows in spikes.items()
}

In [49]:
# Print each top-10 frame under an upper-cased heading built from its key.
for label, top_rows in top_10s.items():
    heading = '+---' + label.upper() + ':\n'
    print(heading)
    pprint.pprint(top_rows)
    print('\n')

+---TOP_METHYLOSMOLENE_OUTLIERS:

             Chemical  Monitor            Date_Time    Reading
59299  Methylosmolene        6  2016-12-08 22:00:00  100.77640
6876   Methylosmolene        6  2016-04-09 01:00:00   94.34631
972    Methylosmolene        6  2016-04-02 04:00:00   88.53254
53498  Methylosmolene        6  2016-12-02 04:00:00   84.95453
53462  Methylosmolene        6  2016-12-02 03:00:00   80.54764
53534  Methylosmolene        6  2016-12-02 05:00:00   79.04791
26676  Methylosmolene        3  2016-08-01 23:00:00   75.96199
61171  Methylosmolene        6  2016-12-11 02:00:00   68.88227
56090  Methylosmolene        7  2016-12-05 04:00:00   63.31599
26822  Methylosmolene        2  2016-08-02 04:00:00   58.46353


+---TOP_CHLORODININE_OUTLIERS:

           Chemical  Monitor            Date_Time   Reading
26894  Chlorodinine        2  2016-08-02 06:00:00  15.72311
71647  Chlorodinine        6  2016-12-23 05:00:00  15.04032
67436  Chlorodinine        4  2016-12-18 08:00:00  14.58982

In [61]:
# Next step: join the weather data onto these specific timestamps.
# Bind each chemical's top-10 outlier frame to its own module-level name so the
# frames can be referred to by name in SQL joins. Plain subscripting is used
# here; operator.itemgetter(key)(top_10s) is equivalent to top_10s[key].
top_app = top_10s['top_Appluimonia_outliers']
top_agoc = top_10s['top_AGOC-3A_outliers']
top_chloro = top_10s['top_Chlorodinine_outliers']
top_methyl = top_10s['top_Methylosmolene_outliers']


Unnamed: 0,Chemical,Monitor,Date_Time,Reading
36434,AGOC-3A,3,2016-08-13 09:00:00,85.31459
4500,AGOC-3A,6,2016-04-06 07:00:00,84.82578
35678,AGOC-3A,5,2016-08-12 12:00:00,84.63599
36578,AGOC-3A,3,2016-08-13 13:00:00,81.69983
56162,AGOC-3A,3,2016-12-05 06:00:00,80.4121
66103,AGOC-3A,6,2016-12-16 19:00:00,74.27616
34994,AGOC-3A,6,2016-08-11 17:00:00,71.68124
59623,AGOC-3A,6,2016-12-09 07:00:00,70.82454
4464,AGOC-3A,6,2016-04-06 06:00:00,69.16677
67471,AGOC-3A,4,2016-12-18 09:00:00,65.71898


In [58]:
# Display the keys currently stored in top_10s (shown as the cell's output).
top_10s.keys()

dict_keys(['top_Methylosmolene_outliers', 'top_Chlorodinine_outliers', 'top_AGOC-3A_outliers', 'top_Appluimonia_outliers'])