# **DATA ORGANIZATION**

This script organizes the data, performs feature engineering, and writes other .csv/.xlsx files as needed.

NOTE: The original data file will not be saved in this repository because it contains confidential location information. Each location is assigned a number, and we keep track of this list internally; however, this number will not be used as a feature in the algorithms.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#imports and get raw data file
import pandas as pd
import numpy as np
#project-local helper module; its onehot() function is called further down
import funcs
from sklearn.preprocessing import StandardScaler
#shared scaler instance; it is fit later on the raw numeric feature columns
scaler = StandardScaler()

#note that 'private_name' is the associated secret number for the different locations
#the raw file is read from a path relative to the current working directory
df = pd.read_csv('../data/raw_data.csv')

In [3]:
'''
IMPORTANT NOTES/ASSUMPTIONS: 
- many of the tests are for other soil/water parameters (pH, electrical conductivity, etc) so we want to extract just pesticide tests...

- to be thorough, the DEC tested for numerous pesticides on each sample, many of which were not applied, resulting in lots of important 
  but unusable data where there is no detectable amount

- many farmers/pesticide appliers provided us information on which pesticides they used...the df includes a 'wasused' column that will be
  utilized to extract the usable feature...however many pesticides were detectable in cases where we did not think it was applied, so it
  is ASSUMED that the pesticide was applied somewhere in close proximity, perhaps upstream or maybe there was errors in communication with the
  farmers/pesticide appliers

- FEATURE ENGINEERING: all nan results are considered zero...the pesticide was not detected

'''
#missing results mean "not detected", so they become 0; pd.to_numeric returns a
#new Series, so it has to be assigned back for the conversion to take effect
df['result'] = pd.to_numeric(df['result'].fillna(0))

#this contains all feature rows to be put into algorithms
#a row is kept when the pesticide was not reported as unused ('wasused' != 'no'),
#or when it was detected (result > 0) and a partition coefficient (koc or kfoc)
#is available
#each comparison is parenthesised because & and | bind tighter than > and !=,
#and the masks are combined with | because np.logical_or only takes two inputs
#(a third positional argument is interpreted as the output array)
was_detected = df['result'] > 0
has_partition_coef = df['koc'].notnull() | df['kfoc'].notnull()
not_reported_unused = df['wasused'] != 'no'
df_feature_rows = df[not_reported_unused | (has_partition_coef & was_detected)]

#drop rows missing a drainage class or a soil half-life; both masks are built
#from df_feature_rows itself so the boolean index lines up with the rows filtered
has_required_fields = (
    df_feature_rows['drainage_class'].notnull()
    & df_feature_rows['soil_halflife'].notnull()
)
df_feature_rows = df_feature_rows[has_required_fields]

#Sulfur rows are excluded from the feature set
#.copy() makes this an independent frame, so adding columns to it later does not
#act on a view of df
df_feature_rows = df_feature_rows[df_feature_rows['parameter'] != 'Sulfur'].copy()


In [4]:
'''
- theoretically, the organic carbon-water partition coefficient ('koc' column) and the organic carbon-water normalized Freundlich distribution 
  coefficient will be treated as the same

- this loop combines the columns, choosing koc first if it is available
'''
#vectorized form of the row-by-row merge: keep koc where it is a positive number,
#otherwise fall back to kfoc (a missing koc compares as not > 0, so it also
#falls back to kfoc)
koc_values = df_feature_rows['koc']
pcoef = koc_values.where(koc_values > 0, df_feature_rows['kfoc']).astype(float).tolist()

#pcoef is a plain list of floats in row order, one entry per feature row
df_feature_rows['pcoef'] = pcoef

In [5]:
#columns currently carried forward toward the algorithms...NOT FINAL
#grouped by role: identifiers/site info, sample info, half-life, sorption, result
col_list = [
    'private_name',
    'loctype',
    'aquifer_vulnerability',
    'drainage_class',
    'sampdate',
    'parameter',
    'soil_halflife',
    'simphalflife',
    'morehalflives',
    'pcoef',
    'simpsorp',
    'simpsorp2',
    'result',
    'simpresult',
]



In [6]:
#get all columns of interest as an independent copy, so the edits below never
#touch df_feature_rows
df_cols = df_feature_rows.loc[:, col_list].copy()

#replace all instances of 'well drained' to 'Well drained'
#(exact-match cell replacement, applied across every selected column)
df_cols = df_cols.replace(to_replace='well drained', value='Well drained')

#add a 'detected' column if result > 0
#1 if detected, -1 if not
#computed for the whole column at once instead of row by row; the values are
#floats (1.0 / -1.0), and the column is created even when there are no rows
df_cols['detected'] = np.where(df_cols['result'] > 0, 1.0, -1.0)



In [7]:
#reset index
#renumber the rows 0..n-1 and discard the old index labels; drop=True does this
#directly instead of inserting the old index as a column and slicing it off
df_reset = df_cols.reset_index(drop=True)

In [8]:
#assemble the final dataframe
#for now, working with all raw numbers and not pre-decided categories
onehot_cols = ['loctype', 'aquifer_vulnerability', 'drainage_class']
raw_cols = ['soil_halflife', 'pcoef']

#standardize the raw numeric columns with the shared scaler, then round to
#3 decimal places
norm = scaler.fit_transform(df_reset[raw_cols])
norm = pd.DataFrame(norm, columns=raw_cols).round(3)

#one-hot encode the categorical columns with the project helper
#NOTE(review): funcs.onehot is defined outside this file; presumably it returns
#only the encoded columns, which is why 'detected' is re-added below - confirm
df_onehot = funcs.onehot(df=df_reset, columns=onehot_cols)

#place the encoded and standardized columns side by side
df_final = pd.concat([df_onehot, norm], axis=1)

#constant offset (bias) column of ones, one per row, then re-add the label column
df_final['offset'] = np.ones(df_cols.shape[0])
df_final['detected'] = df_reset['detected']


In [9]:
#save the finished feature table as a comma-separated file for future use
#(path is relative to the current working directory)
df_final.to_csv('../data/df_final.csv', sep=',')