In [53]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# Seed passed to both StratifiedShuffleSplit calls below so the train/dev/test split is reproducible.
random_state = 123

In [54]:
# Read the source table into df_full; the remaining cells filter and split this frame.
source_csv = "combined_regular_clean_with_ssurgo_variables.csv"
df_full = pd.read_csv(source_csv)
df_full.shape

(14619, 29)

# Filter out bad longitude

In [55]:
# Keep only rows with longitude strictly below -50; rows at or above -50
# (and rows where the comparison is False because longitude is missing)
# are treated as bad datapoints and dropped.
longitude_ok = df_full["longitude"] < -50
df_full = df_full.loc[longitude_ok]
df_full.shape  # 6 records removed

(14613, 29)

# Filter out bad records

- Are there records where `cwa_determination` contradicts the individual flags? It should be 1 exactly when at least one of `cwa1`–`cwa9` is 1; for example, a record is bad when none of `cwa1`–`cwa9` is 1 but `cwa_determination` is 1.


In [56]:
# Flag records whose cwa_determination agrees with the individual cwa1..cwa9 flags:
# a record is "good" when cwa_determination equals 1 if any flag sums to > 0, and 0 otherwise.
# Mismatches in either direction (e.g. no flag set but cwa_determination == 1) are "bad".
cwa_flag_columns = ["cwa{}".format(i) for i in range(1, 10)]

# Vectorized column-wise sum instead of a Python-level row-by-row apply.
# skipna=False keeps the row-wise semantics of adding the flags directly:
# a missing flag makes the row total NaN, NaN > 0 is False, so the expected value is 0.
cwa_flag_total = df_full[cwa_flag_columns].sum(axis=1, skipna=False)
expected_determination = (cwa_flag_total > 0).astype(int)
good_records = expected_determination == df_full["cwa_determination"]

# Compute the rounded percentage once and reuse it for both messages.
pct_good = round(np.mean(good_records) * 100)
print("%good records = {}%".format(pct_good))
print("There are {}% bad records".format(100 - pct_good))
print(df_full.shape)
# peek at not good records to verify code (columns at positions 10..29)
df_full[~good_records].head(3).iloc[:,10:30]


%good records = 98%
There are 2% bad records
(14613, 29)


Unnamed: 0,cwa_determination,rha1,rha2,cwa1,cwa2,cwa3,cwa4,cwa5,cwa6,cwa7,cwa8,cwa9,potential_wetland,index,Index,mukey,hydclprs,aws025wta,drclassdcd
36,1,0,0,0,0,0,0,0,0,0,0,0,1,37,37,292681.0,0.0,4.48,Moderately well drained
50,1,0,0,0,0,0,0,0,0,0,0,0,1,51,51,292995.0,96.0,4.65,Very poorly drained
56,1,0,0,0,0,0,0,0,0,0,0,0,1,57,57,292980.0,8.0,4.67,Somewhat poorly drained


In [57]:
# Keep just the rows flagged True in good_records (boolean mask aligned on df_full's index).
df_full = df_full.loc[good_records]
df_full.shape  # 291 records removed

(14322, 29)

# Drop St. Louis entirely (it has only 6 records, is not being split 70/15/15, and is causing other issues)

In [58]:
# Exclude every row whose district is exactly "St. Louis".
not_st_louis = df_full["district"].ne("St. Louis")
df_full = df_full.loc[not_st_louis]

In [59]:
# List the columns remaining in df_full after the filtering steps above.
df_full.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'jurisdiction_type', 'da_number',
       'district', 'project_name', 'longitude', 'latitude',
       'date_issued_or_denied', 'rha_determination', 'cwa_determination',
       'rha1', 'rha2', 'cwa1', 'cwa2', 'cwa3', 'cwa4', 'cwa5', 'cwa6', 'cwa7',
       'cwa8', 'cwa9', 'potential_wetland', 'index', 'Index', 'mukey',
       'hydclprs', 'aws025wta', 'drclassdcd'],
      dtype='object')

# Split data into Train, Test1 and Test2

In [60]:
# X carries only the record identifier and the label column; y is the label
# on its own and is used as the stratification target for the splits.
split_columns = ["da_number", "cwa_determination"]
X = df_full.loc[:, split_columns]
y = df_full["cwa_determination"]

In [61]:
# First stratified split: hold out 30% of the rows (X_temp / y_temp); the rest is the training set.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=random_state)

# n_splits=1, so the splitter yields exactly one (train, temp) pair of positional indices.
train_index, temp_index = next(sss.split(X, y))

X_train, X_temp = X.iloc[train_index], X.iloc[temp_index]
print(X_train.shape, X_temp.shape)
y_train, y_temp = y.iloc[train_index], y.iloc[temp_index]



(10021, 2) (4295, 2)


In [62]:
# Second stratified split: cut the held-out rows in half to get the dev and test sets.
sss_test = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=random_state)

# n_splits=1, so there is a single (dev, test) pair of positional indices into X_temp / y_temp.
dev_index, test_index = next(sss_test.split(X_temp, y_temp))

X_dev, X_test = X_temp.iloc[dev_index], X_temp.iloc[test_index]
print(X_dev.shape, X_test.shape)
y_dev, y_test = y_temp.iloc[dev_index], y_temp.iloc[test_index]

# Aliases for the three frames used by the cells that follow.
df, df_dev, df_test = X_train, X_dev, X_test

# Label vectors as plain NumPy arrays.
Y = np.array(y_train)
dev_Y = np.array(y_dev)
test_Y = np.array(y_test)

(2147, 2) (2148, 2)


In [63]:
# Number of positive (cwa_determination == 1) labels in each split.
train_count1 = df["cwa_determination"].sum()
test1_count1 = df_dev["cwa_determination"].sum()
test2_count1 = df_test["cwa_determination"].sum()

# Share of positive labels in the full dataset and in each split.
full_avg_1 = df_full["cwa_determination"].mean()
train_avg_1 = df["cwa_determination"].mean()
test1_avg_1 = df_dev["cwa_determination"].mean()
test2_avg_1 = df_test["cwa_determination"].mean()

# Print one line per dataset; the padded template lines the values up under the first row.
for frac_template, frac_value, frac_label in (
    ("fraction of 1's in {1}: {0}", full_avg_1, "complete dataset"),
    ("fraction of 1's in {1}:            {0}", train_avg_1, "train"),
    ("fraction of 1's in {1}:            {0}", test1_avg_1, "test1"),
    ("fraction of 1's in {1}:            {0}", test2_avg_1, "test2"),
):
    print(frac_template.format(round(frac_value, 2), frac_label))

fraction of 1's in complete dataset: 0.36
fraction of 1's in train:            0.36
fraction of 1's in test1:            0.36
fraction of 1's in test2:            0.36


In [64]:
# Report each split's shape and its share of the rows in df_full.
for size_template, split_frame in (
    ("Training: {} (fraction = {})", df),
    ("Test1   : {} (fraction = {})", df_dev),
    ("Test2   : {} (fraction = {})", df_test),
):
    split_fraction = round(split_frame.shape[0] / df_full.shape[0], 2)
    print(size_template.format(split_frame.shape, split_fraction))

Training: (10021, 2) (fraction = 0.7)
Test1   : (2147, 2) (fraction = 0.15)
Test2   : (2148, 2) (fraction = 0.15)


In [65]:
# Writing the three splits to pickle files is disabled (all calls are commented out);
# uncomment to persist df, df_dev and df_test under the file names below.
# pickle.dump(df, open("2021.04.06_TTT_15k_level_train_dataset","wb"), protocol=3)
# pickle.dump(df_dev, open("2021.04.06_TTT_15k_level_dev_dataset","wb"), protocol=3)
# pickle.dump(df_test, open("2021.04.06_TTT_15k_level_test_dataset","wb"), protocol=3)

In [66]:
# Confirm which columns the training frame carries.
df.columns

Index(['da_number', 'cwa_determination'], dtype='object')

In [67]:
# Display the training frame (the notebook renders a truncated view of its rows).
df

Unnamed: 0,da_number,cwa_determination
9926,SAC-2019-01215,0
12599,SPL-2003-01623-KAT,0
2524,MVK-2017-00789-CR,0
6208,NAO-2016-02055-tca,1
10144,SAJ-2008-01477,1
...,...,...
4766,MVP-2017-00800-EMN,0
13974,SWL-2016-00405,0
14304,MVN-2017-00381-SK,1
13041,SWG-2011-01237,1
