In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandasql import sqldf
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
# Apply seaborn's "darkgrid" style so plots drawn later in the notebook share it.
sns.set_style("darkgrid")

In [3]:
# Load the four prepared CSV extracts straight from the capstone_final GitHub
# repository.
# NOTE(review): the sensor file is read from the moving `master` branch while
# the other three URLs are pinned to commit hashes — confirm this is intended,
# since the sensor data may change between runs.
sensor = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/master/data/prepared_data/clean_data/clean_sensor_data.csv'
)
weather = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/ef9ac7bceb5cde6fd932cb5c0018ca90329a9d3c/data/prepared_data/clean_data/clean_meteorological_data.csv'
)
locations = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/ef9ac7bceb5cde6fd932cb5c0018ca90329a9d3c/data/prepared_data/clean_data/locations.csv'
)
chem_counts = pd.read_csv(
    'https://raw.githubusercontent.com/j-tice/capstone_final/a9b08f3c0f6fdd6e72bdd1465f3c6177b687d332/data/prepared_data/clean_data/chem_counts.csv'
)


def psql(q):
    """Run the SQL query string `q` through pandasql's `sqldf`.

    The notebook's global namespace is handed to `sqldf`, so the query can
    refer to the DataFrames loaded above by their variable names.
    """
    return sqldf(q, globals())

In [4]:
# Preview the first rows of the sensor table to check the loaded columns.
sensor.head()

Unnamed: 0,Chemical,Monitor,Date_Time,Reading
0,Methylosmolene,3,2016-04-01 00:00:00,2.68382
1,Methylosmolene,7,2016-04-01 00:00:00,2.63064
2,Chlorodinine,3,2016-04-01 00:00:00,1.25917
3,Chlorodinine,7,2016-04-01 00:00:00,0.943983
4,AGOC-3A,3,2016-04-01 00:00:00,0.722303


In [None]:
# Flag anomalous sensor readings with an Isolation Forest, then project the
# scaled features onto two principal components to visualise the result.

# Isolation Forest and StandardScaler need numeric input, so the string-valued
# Chemical column is one-hot encoded; Monitor and Reading pass through as-is.
# NOTE(review): Monitor looks like a station identifier — consider one-hot
# encoding it as well if its numeric order carries no meaning.
features = sensor['Chemical,Monitor,Reading'.split(',')]
X = pd.get_dummies(features, columns=['Chemical'], dtype=float)

# contamination=0.1 asks the model to label roughly 10% of rows as outliers;
# random_state is fixed so re-running the cell gives the same labels.
iforest = IsolationForest(n_estimators=100, max_samples='auto',
                          contamination=0.1, max_features=1.0,
                          bootstrap=False, n_jobs=-1, random_state=1)

# fit_predict returns 1 for inliers and -1 for outliers
pred = iforest.fit_predict(X)

# Extract outliers. np.where returns a 1-tuple of positional row indices, and
# X is a DataFrame, so the rows have to be selected positionally with .iloc.
outlier_index = np.where(pred == -1)
outlier_values = X.iloc[outlier_index[0]]

# Feature scaling (fit on all rows, then reuse the same scaler for outliers)
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
outlier_values_scaled = sc.transform(outlier_values)

# Apply PCA to reduce the dimensionality to two plottable components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
outlier_values_pca = pca.transform(outlier_values_scaled)

# Plot every reading, then overlay the flagged outliers in red
fig, ax = plt.subplots()
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], ax=ax, label="All readings")
sns.scatterplot(x=outlier_values_pca[:, 0],
                y=outlier_values_pca[:, 1], color='r', ax=ax,
                label="Outliers")
ax.set_title("Isolation Forest Outlier Detection (Sensor Data)",
             fontsize=15, pad=15)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.savefig("Isolation Forest Detection.png", dpi=80)