## Data Importation


In [1]:
import pandas as pd
import tensorflow as tf
import keras as ks
import numpy as np
from sklearn.metrics import recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import datetime
import time
from math import floor
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, Flatten, Activation, MaxPooling2D
import matplotlib.pylab as plt
import seaborn as sns

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
%cd ..
import models.scripts.logistic_regression_functions as lrf
import models.scripts.helper_functions as hf

C:\git\senior_design\pannts-ml\notebooks


In [3]:
# Load the data
df = pd.read_csv('../data/cleaned/utah_2017_vineyard.csv')
target_col = 'BGA-Phycocyanin RFU'
df.dtypes

Date (mm.dd.yyyy)       object
Time 24hr               object
Temp C                 float64
Sp Cond (uS/cm)          int64
pH (mV)                float64
pH                     float64
Turbidity (NTU)        float64
Chlorophyll (ug/L)     float64
Chlorophyll RFU        float64
ODOSat%                float64
ODO (mg/L)             float64
BGA-Phycocyanin RFU    float64
dtype: object

In [4]:
# Create a single datetime column from the strings provided in our csv's
# (We're already doing this bit for our models)

timestamp = df['Date (mm.dd.yyyy)'] + ' '+ df['Time 24hr']
timestamp = pd.to_datetime(timestamp)
df['timestamp'] = timestamp

# I'll drop the other columns for simple demo purposes
df = df[['timestamp', 'pH', 'BGA-Phycocyanin RFU']]
df['temp'] = [x for x in range(0, df.shape[0])]

df.dtypes

timestamp              datetime64[ns]
pH                            float64
BGA-Phycocyanin RFU           float64
temp                            int64
dtype: object

In [5]:
# Now we set our datetime to be the index of the df for awesome indexing options
# Note, the column is removed, so it might be good to keep a copy by another
# name for feature engineering.
df['datetime'] = df['timestamp']
df = df.set_index('timestamp')
df.dtypes


pH                            float64
BGA-Phycocyanin RFU           float64
temp                            int64
datetime               datetime64[ns]
dtype: object

In [6]:
# Index with an offset
start = df['datetime'][0]
delta = pd.Timedelta(1, unit='h')
print("start:", start)
print(delta)
offset = pd.Timedelta(1, 'm')
# grab a slice over 1 hour
df[start:start + delta]

start: 2017-05-05 00:00:00
0 days 01:00:00


Unnamed: 0_level_0,pH,BGA-Phycocyanin RFU,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 00:00:00,8.36,0.4,0,2017-05-05 00:00:00
2017-05-05 00:15:00,8.36,0.4,1,2017-05-05 00:15:00
2017-05-05 00:30:00,8.36,0.4,2,2017-05-05 00:30:00
2017-05-05 00:45:00,8.36,0.4,3,2017-05-05 00:45:00
2017-05-05 01:00:00,8.36,0.4,4,2017-05-05 01:00:00


## Quantile testing
### Numeric tests

In [7]:
# Default Behavior
test = df[start:start + delta]['temp']
print("0%", test.quantile(0.0))
print("5%", test.quantile(.05))
print("25%", test.quantile(0.25))
print("50%", test.quantile(0.5))
print("75%", test.quantile(0.75))
print("95%", test.quantile(.95))
print("100%", test.quantile(1))
df[start:start + delta].head()

0% 0.0
5% 0.2
25% 1.0
50% 2.0
75% 3.0
95% 3.8
100% 4.0


Unnamed: 0_level_0,pH,BGA-Phycocyanin RFU,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 00:00:00,8.36,0.4,0,2017-05-05 00:00:00
2017-05-05 00:15:00,8.36,0.4,1,2017-05-05 00:15:00
2017-05-05 00:30:00,8.36,0.4,2,2017-05-05 00:30:00
2017-05-05 00:45:00,8.36,0.4,3,2017-05-05 00:45:00
2017-05-05 01:00:00,8.36,0.4,4,2017-05-05 01:00:00


In [8]:
# Interpolation behavior
test = df[start:start + delta]['temp']
print("0%", test.quantile(0.0, interpolation='nearest'))
print("5%", test.quantile(.05, interpolation='nearest'))
print("25%", test.quantile(0.25, interpolation='nearest'))
print("50%", test.quantile(0.5, interpolation='nearest'))
print("75%", test.quantile(0.75, interpolation='nearest'))
print("95%", test.quantile(.95, interpolation='nearest'))
print("100%", test.quantile(1, interpolation='nearest'))
df[start:start + delta].head()

0% 0
5% 0
25% 1
50% 2
75% 3
95% 4
100% 4


Unnamed: 0_level_0,pH,BGA-Phycocyanin RFU,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 00:00:00,8.36,0.4,0,2017-05-05 00:00:00
2017-05-05 00:15:00,8.36,0.4,1,2017-05-05 00:15:00
2017-05-05 00:30:00,8.36,0.4,2,2017-05-05 00:30:00
2017-05-05 00:45:00,8.36,0.4,3,2017-05-05 00:45:00
2017-05-05 01:00:00,8.36,0.4,4,2017-05-05 01:00:00


### Datetime tests

In [9]:
# Default date behavior
test = df[start:start + delta]['datetime']
print("0%", test.quantile(0.0))
print("5%", test.quantile(.05))
print("25%", test.quantile(0.25))
print("50%", test.quantile(0.5))
print("75%", test.quantile(0.75))
print("95%", test.quantile(.95))
print("100%", test.quantile(1))
df[start:start + delta].head()

0% 2017-05-05 00:00:00
5% 2017-05-05 00:03:00
25% 2017-05-05 00:15:00
50% 2017-05-05 00:30:00
75% 2017-05-05 00:45:00
95% 2017-05-05 00:57:00
100% 2017-05-05 01:00:00


Unnamed: 0_level_0,pH,BGA-Phycocyanin RFU,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 00:00:00,8.36,0.4,0,2017-05-05 00:00:00
2017-05-05 00:15:00,8.36,0.4,1,2017-05-05 00:15:00
2017-05-05 00:30:00,8.36,0.4,2,2017-05-05 00:30:00
2017-05-05 00:45:00,8.36,0.4,3,2017-05-05 00:45:00
2017-05-05 01:00:00,8.36,0.4,4,2017-05-05 01:00:00


In [10]:
# Interpolation date behavior
test = df[start:start + delta]['datetime']
print("0%", test.quantile(0.0, interpolation='nearest'))
print("5%", test.quantile(.05, interpolation='nearest'))
print("25%", test.quantile(0.25, interpolation='nearest'))
print("50%", test.quantile(0.5, interpolation='nearest'))
print("75%", test.quantile(0.75, interpolation='nearest'))
print("95%", test.quantile(.95, interpolation='nearest'))
print("100%", test.quantile(1, interpolation='nearest'))
df[start:start + delta].head()

0% 2017-05-05 00:00:00
5% 2017-05-05 00:00:00
25% 2017-05-05 00:15:00
50% 2017-05-05 00:30:00
75% 2017-05-05 00:45:00
95% 2017-05-05 01:00:00
100% 2017-05-05 01:00:00


Unnamed: 0_level_0,pH,BGA-Phycocyanin RFU,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 00:00:00,8.36,0.4,0,2017-05-05 00:00:00
2017-05-05 00:15:00,8.36,0.4,1,2017-05-05 00:15:00
2017-05-05 00:30:00,8.36,0.4,2,2017-05-05 00:30:00
2017-05-05 00:45:00,8.36,0.4,3,2017-05-05 00:45:00
2017-05-05 01:00:00,8.36,0.4,4,2017-05-05 01:00:00


### numeric Category tests

In [11]:
# Default category behavior
test = df[start:start + delta]['temp'].astype('category')
print("0%", test.quantile(0.0))
print("5%", test.quantile(.05))
print("25%", test.quantile(0.25))
print("50%", test.quantile(0.5))
print("75%", test.quantile(0.75))
print("95%", test.quantile(.95))
print("100%", test.quantile(1))
df[start:start + delta].head()

0% 0.0
5% 0.2
25% 1.0
50% 2.0
75% 3.0
95% 3.8
100% 4.0


Unnamed: 0_level_0,pH,BGA-Phycocyanin RFU,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 00:00:00,8.36,0.4,0,2017-05-05 00:00:00
2017-05-05 00:15:00,8.36,0.4,1,2017-05-05 00:15:00
2017-05-05 00:30:00,8.36,0.4,2,2017-05-05 00:30:00
2017-05-05 00:45:00,8.36,0.4,3,2017-05-05 00:45:00
2017-05-05 01:00:00,8.36,0.4,4,2017-05-05 01:00:00


In [12]:
# Interpolated category behavior
test = df[start:start + delta]['temp'].astype('category')
print("0%", test.quantile(0.0, interpolation='nearest'))
print("5%", test.quantile(.05, interpolation='nearest'))
print("25%", test.quantile(0.25, interpolation='nearest'))
print("50%", test.quantile(0.5, interpolation='nearest'))
print("75%", test.quantile(0.75, interpolation='nearest'))
print("95%", test.quantile(.95, interpolation='nearest'))
print("100%", test.quantile(1, interpolation='nearest'))
df[start:start + delta].head()

0% 0
5% 0
25% 1
50% 2
75% 3
95% 4
100% 4


Unnamed: 0_level_0,pH,BGA-Phycocyanin RFU,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 00:00:00,8.36,0.4,0,2017-05-05 00:00:00
2017-05-05 00:15:00,8.36,0.4,1,2017-05-05 00:15:00
2017-05-05 00:30:00,8.36,0.4,2,2017-05-05 00:30:00
2017-05-05 00:45:00,8.36,0.4,3,2017-05-05 00:45:00
2017-05-05 01:00:00,8.36,0.4,4,2017-05-05 01:00:00


### String or non-numeric category tests

In [23]:
# Default string category behavior
test = df[start:start + delta]
test['temp'] = test['temp'].apply(lambda x: str(x)).astype('category')
print("0%", test[['temp']].quantile(0.0, numeric_only=False))
print("5%", test[['temp']].quantile(.05, numeric_only=False))
print("25%", test[['temp']].quantile(0.25, numeric_only=False))
print("50%", test[['temp']].quantile(0.5, numeric_only=False))
print("75%", test[['temp']].quantile(0.75, numeric_only=False))
print("95%", test[['temp']].quantile(.95, numeric_only=False))
print("100%", test[['temp']].quantile(1, numeric_only=False))
df[start:start + delta].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


TypeError: can't multiply sequence by non-int of type 'float'

In [24]:
# Interprolated string category behavior
test = df[start:start + delta]
test['temp'] = test['temp'].apply(lambda x: str(x)).astype('category')
print("0%", test[['temp']].quantile(0.0, numeric_only=False, interpolation='nearest'))
print("5%", test[['temp']].quantile(.05, numeric_only=False, interpolation='nearest'))
print("25%", test[['temp']].quantile(0.25, numeric_only=False, interpolation='nearest'))
print("50%", test[['temp']].quantile(0.5, numeric_only=False, interpolation='nearest'))
print("75%", test[['temp']].quantile(0.75, numeric_only=False, interpolation='nearest'))
print("95%", test[['temp']].quantile(.95, numeric_only=False, interpolation='nearest'))
print("100%", test[['temp']].quantile(1, numeric_only=False, interpolation='nearest'))
df[start:start + delta].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


ValueError: Wrong number of dimensions. values.ndim != ndim [2 != 1]

### It appears that we can interpolate categorical data as long as the category is still stored as a float/int


### Test average category

In [57]:
df = pd.DataFrame({'test':[9, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3]})
df['test'] = df['test'].astype('category')
print(df.dtypes)
df.head(10)

test    category
dtype: object


Unnamed: 0,test
0,9
1,1
2,1
3,1
4,1
5,1
6,2
7,2
8,3
9,3


In [58]:
df.quantile(.5, interpolation='nearest', numeric_only=False)

ValueError: Wrong number of dimensions. values.ndim != ndim [2 != 1]

### Test min category

In [59]:
df.quantile(0.0, interpolation='nearest', numeric_only=False)

ValueError: Wrong number of dimensions. values.ndim != ndim [2 != 1]

### Test max category


In [60]:
df.quantile(1.0, interpolation='nearest', numeric_only=False)

ValueError: Wrong number of dimensions. values.ndim != ndim [2 != 1]

#### It appears quantile does not select an average category. And all categorical values should be extracted by another means.