In [23]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

# Seed handed to the train_test_split calls in this notebook so the splits are reproducible.
random_state = 123

In [24]:
# Read the combined dataset from CSV and show its (rows, columns) dimensions.
source_csv = "combined_regular_clean_with_ssurgo_variables.csv"
df_full = pd.read_csv(source_csv)
df_full.shape

(14619, 29)

# Filter out bad longitude

In [25]:
# Keep only rows whose longitude is below -50; everything else is a bad datapoint.
has_valid_longitude = df_full["longitude"] < -50
df_full = df_full.loc[has_valid_longitude]
df_full.shape # 6 records removed

(14613, 29)

# Filter out bad records

- Are there any records where `cwa_determination` contradicts the individual flags? That is, records where none of `cwa1`–`cwa9` is 1 but `cwa_determination` is 1 (or, conversely, where at least one flag is 1 but `cwa_determination` is 0).


In [26]:
# Consistency check between the individual CWA flags and the overall determination.
# A record is "good" when cwa_determination equals 1 if the sum of cwa1..cwa9 is
# positive, and equals 0 otherwise; every other record is flagged as bad.
#
# The check is done column-wise instead of with a row-by-row `apply`, which is far
# slower on ~15k rows and gains nothing from wrapping an already-scalar sum in `np.sum`.
cwa_flag_columns = ["cwa{}".format(i) for i in range(1, 10)]

# skipna=False mirrors plain `+` between the columns: a missing flag makes the row total
# NaN, and NaN > 0 is False, so such a row counts as "no flag set".
any_flag_set = df_full[cwa_flag_columns].sum(axis=1, skipna=False) > 0
good_records = any_flag_set.astype(int) == df_full["cwa_determination"]

# Percentage of consistent records, rounded to a whole number.
pct_good = round(np.mean(good_records) * 100)
print("%good records = {}%".format(pct_good))
print("There are {}% bad records".format(100 - pct_good))
print(df_full.shape)
# peek at not good records to verify code
df_full[~good_records].head(3).iloc[:,10:30]


%good records = 98%
There are 2% bad records
(14613, 29)


Unnamed: 0,cwa_determination,rha1,rha2,cwa1,cwa2,cwa3,cwa4,cwa5,cwa6,cwa7,cwa8,cwa9,potential_wetland,index,Index,mukey,hydclprs,aws025wta,drclassdcd
36,1,0,0,0,0,0,0,0,0,0,0,0,1,37,37,292681.0,0.0,4.48,Moderately well drained
50,1,0,0,0,0,0,0,0,0,0,0,0,1,51,51,292995.0,96.0,4.65,Very poorly drained
56,1,0,0,0,0,0,0,0,0,0,0,0,1,57,57,292980.0,8.0,4.67,Somewhat poorly drained


In [27]:
# Keep just the rows that the boolean mask `good_records` marks as True.
df_full = df_full.loc[good_records]
df_full.shape # 291 records removed

(14322, 29)

# Drop St. Louis entirely (it has only 6 records, is not being split 70/15/15, and is causing other issues)

In [28]:
# Exclude every record whose district is St. Louis.
in_st_louis = df_full["district"] == "St. Louis"
df_full = df_full.loc[~in_st_louis]

In [29]:
# Display the column names of df_full.
df_full.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'jurisdiction_type', 'da_number',
       'district', 'project_name', 'longitude', 'latitude',
       'date_issued_or_denied', 'rha_determination', 'cwa_determination',
       'rha1', 'rha2', 'cwa1', 'cwa2', 'cwa3', 'cwa4', 'cwa5', 'cwa6', 'cwa7',
       'cwa8', 'cwa9', 'potential_wetland', 'index', 'Index', 'mukey',
       'hydclprs', 'aws025wta', 'drclassdcd'],
      dtype='object')

# Split data into Train, Test1 and Test2

In [30]:
# Two-stage stratified split of df_full into train / test1 / test2 (70% / 15% / 15%).
# Both stages stratify on cwa_determination and reuse random_state for reproducibility.
#
# There is deliberately no try/except here: if a split fails, the notebook should stop
# with the real error rather than carry on with empty frames.

# Stage 1: hold out 15% of all records as test2.
df_train_temp_, df_test2 = train_test_split(
    df_full,
    test_size=0.15,
    random_state=random_state,
    stratify=df_full["cwa_determination"],
)

# Stage 2: split the remaining 85% into train and test1.
# 0.17647 ~= 0.15 / 0.85, so test1 is 15% of the full dataset and train is 70%.
df_train, df_test1 = train_test_split(
    df_train_temp_,
    test_size=0.17647,
    random_state=random_state,
    stratify=df_train_temp_["cwa_determination"],
)

# The three subsets together must account for every record in df_full.
assert len(df_train) + len(df_test1) + len(df_test2) == len(df_full)


In [31]:
# Number of records with cwa_determination == 1 in each subset.
train_count1, test1_count1, test2_count1 = (
    subset.cwa_determination.sum() for subset in (df_train, df_test1, df_test2)
)

# Mean of cwa_determination (i.e. the fraction of 1's) in the full data and each subset.
full_avg_1, train_avg_1, test1_avg_1, test2_avg_1 = (
    subset.cwa_determination.mean()
    for subset in (df_full, df_train, df_test1, df_test2)
)

# The subset lines are padded so their values line up under the "complete dataset" line.
print("fraction of 1's in {1}: {0}".format(round(full_avg_1, 2), "complete dataset"))
subset_line = "fraction of 1's in {1}:            {0}"
for label, avg in (("train", train_avg_1), ("test1", test1_avg_1), ("test2", test2_avg_1)):
    print(subset_line.format(round(avg, 2), label))

fraction of 1's in complete dataset: 0.36
fraction of 1's in train:            0.36
fraction of 1's in test1:            0.36
fraction of 1's in test2:            0.36


In [32]:
# Report each subset's shape and its share of the rows in df_full.
total_rows = df_full.shape[0]
for template, subset in (
    ("Training: {} (fraction = {})", df_train),
    ("Test1   : {} (fraction = {})", df_test1),
    ("Test2   : {} (fraction = {})", df_test2),
):
    print(template.format(subset.shape, round(subset.shape[0] / total_rows, 2)))

Training: (10020, 29) (fraction = 0.7)
Test1   : (2148, 29) (fraction = 0.15)
Test2   : (2148, 29) (fraction = 0.15)


In [33]:
# Persist the three subsets as pickle files (protocol 3).
# Each file is opened in a `with` block so its handle is flushed and closed as soon as the
# dump finishes, rather than being left open until the garbage collector reclaims it.
for subset, pickle_path in (
    (df_train, "2021.04.02_TTT_15k_level_train_dataset"),
    (df_test1, "2021.04.02_TTT_15k_level_test1_dataset"),
    (df_test2, "2021.04.02_TTT_15k_level_test2_dataset"),
):
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(subset, pickle_file, protocol=3)