In [125]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

random_state = 123

In [126]:
df_full = pd.read_csv("combined_regular_clean_with_ssurgo_variables.csv")
df_full.shape

(14619, 29)

# Filter out bad longitude

In [127]:
# remove longitude > -50 (bad datapoints)

df_full = df_full[df_full.longitude < -50]
df_full.shape # 6 records removed

(14613, 29)

# Filter out bad records

- any records where the cwa_determination is contrary to expectations? That is, where none of cwa1 etc are 1 but the cwa_determination value is 1


In [128]:
# any records where the cwa_determination is contrary to expectations? That is, where none of cwa1 etc
# are 1 but the cwa_determination value is 1
good_records = (df_full.apply(lambda x: 
               (np.sum(x.cwa1 + x.cwa2 + x.cwa3 + x.cwa4 + x.cwa5 + 
                       x.cwa6 + x.cwa7 + x.cwa8 + x.cwa9) > 0) * 1 
               == x.cwa_determination, 
               axis=1))

print("%good records = {}%".format(round(np.mean(good_records) * 100)))
print("There are {}% bad records".format(100-round(np.mean(good_records) * 100)))

# peek at not good records to verify code
df_full[~good_records].head(3).iloc[:,10:30]

%good records = 98%
There are 2% bad records


Unnamed: 0,cwa_determination,rha1,rha2,cwa1,cwa2,cwa3,cwa4,cwa5,cwa6,cwa7,cwa8,cwa9,potential_wetland,index,Index,mukey,hydclprs,aws025wta,drclassdcd
36,1,0,0,0,0,0,0,0,0,0,0,0,1,37,37,292681.0,0.0,4.48,Moderately well drained
50,1,0,0,0,0,0,0,0,0,0,0,0,1,51,51,292995.0,96.0,4.65,Very poorly drained
56,1,0,0,0,0,0,0,0,0,0,0,0,1,57,57,292980.0,8.0,4.67,Somewhat poorly drained


In [129]:
# Retain only the good records
df_full = df_full[good_records]
df_full.shape # 291 records removed

(14322, 29)

In [130]:
df_full.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'jurisdiction_type', 'da_number',
       'district', 'project_name', 'longitude', 'latitude',
       'date_issued_or_denied', 'rha_determination', 'cwa_determination',
       'rha1', 'rha2', 'cwa1', 'cwa2', 'cwa3', 'cwa4', 'cwa5', 'cwa6', 'cwa7',
       'cwa8', 'cwa9', 'potential_wetland', 'index', 'Index', 'mukey',
       'hydclprs', 'aws025wta', 'drclassdcd'],
      dtype='object')

# Split data into Train, Test1 and Test2

In [131]:
# verify if things are indeed working as expected

mygroup = df_full.groupby("district")

for group in mygroup:
    print(group[0])
    print("Original 0's and 1's")
    data_count_1 = group[1]["cwa_determination"][group[1]["cwa_determination"]==1].shape[0]
    data_count_0 = group[1]["cwa_determination"][group[1]["cwa_determination"]==0].shape[0]    
    print(data_count_0, data_count_1)
    print()
    
    train, test = (train_test_split(group[1],
                                    test_size=0.1,
                                    random_state = random_state, 
                                    stratify=group[1]["cwa_determination"]))
    
    print("train total records", train.shape[0])
    train_count_1 = train["cwa_determination"][train["cwa_determination"]==1].shape[0]
    train_count_0 = train["cwa_determination"][train["cwa_determination"]==0].shape[0]    
    print("train 0's and 1's:")
    print(train_count_0, train_count_1)
    
    print()
    
    print("test total records", test.shape[0])
    print("test 0's and 1's:")
    test_count_1 = test["cwa_determination"][test["cwa_determination"]==1].shape[0]
    test_count_0 = test["cwa_determination"][test["cwa_determination"]==0].shape[0]    
    print(test_count_0, test_count_1)    
    
    print("======================")
    

Alaska
Original 0's and 1's
346 101

train total records 402
train 0's and 1's:
311 91

test total records 45
test 0's and 1's:
35 10
Albuquerque
Original 0's and 1's
121 7

train total records 115
train 0's and 1's:
109 6

test total records 13
test 0's and 1's:
12 1
Baltimore
Original 0's and 1's
50 87

train total records 123
train 0's and 1's:
45 78

test total records 14
test 0's and 1's:
5 9
Buffalo
Original 0's and 1's
119 158

train total records 249
train 0's and 1's:
107 142

test total records 28
test 0's and 1's:
12 16
Charleston
Original 0's and 1's
1307 750

train total records 1851
train 0's and 1's:
1176 675

test total records 206
test 0's and 1's:
131 75
Chicago
Original 0's and 1's
236 332

train total records 511
train 0's and 1's:
212 299

test total records 57
test 0's and 1's:
24 33
Detroit
Original 0's and 1's
93 81

train total records 156
train 0's and 1's:
83 73

test total records 18
test 0's and 1's:
10 8
Fort Worth
Original 0's and 1's
6 12

train total re

ValueError: The test_size = 1 should be greater or equal to the number of classes = 2

In [None]:
# Implement the modified stratification

df_train = pd.DataFrame()
df_test1 = pd.DataFrame()
df_test2 = pd.DataFrame()

df_train_ = pd.DataFrame()
df_test1_ = pd.DataFrame()
df_test2_ = pd.DataFrame()

for group in df_full.groupby("district"):
    try:
        df_train_temp_, df_test2_ = (train_test_split(group[1], 
                                                 test_size=0.15, 
                                                 random_state = random_state, 
                                                 stratify=group[1]["cwa_determination"])) # 15% test

        df_train_, df_test1_ = (train_test_split(df_train_temp_, 
                                                 test_size=0.17647, 
                                                 random_state = random_state, 
                                                 stratify=df_train_temp_["cwa_determination"])) # 70% train, 15% dev
    except Exception as e:
        print(group[0], e)
    df_train = pd.concat([df_train, df_train_])
    df_test1 = pd.concat([df_test1, df_test1_])
    df_test2 = pd.concat([df_test2, df_test2_])


In [None]:
print("Training: {} (fraction = {})".format(df_train.shape, round(df_train.shape[0] / df_full.shape[0], 2)))
print("Test1   : {} (fraction = {})".format(df_test1.shape, round(df_test1.shape[0] / df_full.shape[0], 2)))
print("Test2   : {} (fraction = {})".format(df_test2.shape, round(df_test2.shape[0] / df_full.shape[0], 2)))

In [132]:
pickle.dump(df_train, open("2021.04.01_train_dataset","wb"), protocol=3)
pickle.dump(df_test1, open("2021.04.01_test1_dataset","wb"), protocol=3)
pickle.dump(df_test2, open("2021.04.01_test2_dataset","wb"), protocol=3)

# appendix

In [133]:
# https://stackoverflow.com/questions/55742246/test-train-split-with-stratify

In [134]:
df = pd.DataFrame(data={'a': np.random.rand(100000), 'b': np.random.rand(100000), 'c': 0})
df.loc[np.random.randint(0, 100000, 1000), 'c'] = 1
tr, ts = train_test_split(df, test_size=.2, stratify=df['c'])
print(tr.shape, ts.shape)

(80000, 3) (20000, 3)


In [135]:
df.c.unique()

array([0, 1])