# Which factories are responsible for which chemical releases? Carefully describe how you determined this using all the data you have available. For the factories you identified, describe any observed patterns of operation revealed in the data.

In [25]:
# import libraries
import numpy as np
import pandas as pd
from pandasql import sqldf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pprint
sns.set_style("darkgrid")

In [5]:
# Load the prepared sensor, weather, location and chemical-count tables from GitHub.
# NOTE(review): the sensor URL follows `master`, whereas the other three URLs are pinned
# to commit hashes, so the sensor table can change between runs — consider pinning it too.
sensor = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/master/data/prepared_data/clean_data/clean_sensor_data.csv'
)
weather = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/ef9ac7bceb5cde6fd932cb5c0018ca90329a9d3c/data/prepared_data/clean_data/clean_meteorological_data.csv'
)
locations = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/ef9ac7bceb5cde6fd932cb5c0018ca90329a9d3c/data/prepared_data/clean_data/locations.csv'
)
chem_counts = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/a9b08f3c0f6fdd6e72bdd1465f3c6177b687d332/data/prepared_data/clean_data/chem_counts.csv'
)


def psql(q):
    """Run the SQL string `q` through pandasql's `sqldf`, passing this module's globals as the environment."""
    return sqldf(q, globals())

In [15]:
# Drop every row reported by monitor 8, which is suspected of being faulty.
sensor = sensor[sensor['Monitor'].ne(8)]

In [30]:
# determine cutoffs for each chemical type
# A reading is treated as a spike when it exceeds mean + N_SIGMA * std of all
# readings for the same chemical.
N_SIGMA = 3

chems = list(sensor.Chemical.unique())

# Key both dicts by the raw chemical label. The previous version stored entries
# under a string-formatted key but looked them up with the raw label, which only
# works while every label is already a str.
chem_dfs = {chemical: sensor.loc[sensor.Chemical == chemical] for chemical in chems}

cutoff_values = dict()
for chemical, reference in chem_dfs.items():
    # np.std is called with its default ddof=0, i.e. the population standard deviation.
    tmp_cutoff = np.mean(reference.Reading) + (N_SIGMA * np.std(reference.Reading))
    cutoff_values[chemical] = round(tmp_cutoff, 4)

pprint.pprint(cutoff_values)

{'AGOC-3A': 10.6913,
 'Appluimonia': 2.699,
 'Chlorodinine': 3.3325,
 'Methylosmolene': 8.7986}


In [31]:
# For each chemical, keep only the rows whose reading is strictly greater than
# that chemical's cutoff; results are stored under "<chemical>_outliers".
spikes = {}
for chemical in chems:
    readings = chem_dfs[chemical]
    above_cutoff = readings.Reading > cutoff_values[chemical]
    spikes['{name}_outliers'.format(name=chemical)] = readings.loc[above_cutoff]

In [43]:
# Trim every outlier table to its ten largest readings, highest first.
# Only existing keys are reassigned, so updating the dict while iterating is safe.
for key, outliers in spikes.items():
    ranked = outliers.sort_values('Reading', ascending=False)
    spikes[key] = ranked.head(10)

In [45]:
# Print each chemical's table of top readings, followed by two blank lines.
for outliers in spikes.values():
    print(outliers, end='\n\n\n')

             Chemical  Monitor            Date_Time    Reading
59299  Methylosmolene        6  2016-12-08 22:00:00  100.77640
6876   Methylosmolene        6  2016-04-09 01:00:00   94.34631
972    Methylosmolene        6  2016-04-02 04:00:00   88.53254
53498  Methylosmolene        6  2016-12-02 04:00:00   84.95453
53462  Methylosmolene        6  2016-12-02 03:00:00   80.54764
53534  Methylosmolene        6  2016-12-02 05:00:00   79.04791
26676  Methylosmolene        3  2016-08-01 23:00:00   75.96199
61171  Methylosmolene        6  2016-12-11 02:00:00   68.88227
56090  Methylosmolene        7  2016-12-05 04:00:00   63.31599
26822  Methylosmolene        2  2016-08-02 04:00:00   58.46353


           Chemical  Monitor            Date_Time   Reading
26894  Chlorodinine        2  2016-08-02 06:00:00  15.72311
71647  Chlorodinine        6  2016-12-23 05:00:00  15.04032
67436  Chlorodinine        4  2016-12-18 08:00:00  14.58982
56092  Chlorodinine        7  2016-12-05 04:00:00  13.40056
22392