# Covertype Data Set Preprocessing

In [40]:
import csv
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Set the paths

In [41]:
# Root directory under which every preprocessed split gets its own sub-directory.
PREFIX_PATH = './dataset/covertype/preprocessed/'


def _dataset_path(split_name):
    """Return the CSV path of a split and make sure its directory exists.

    Args:
        split_name: Name of the sub-directory under PREFIX_PATH (e.g. 'training').

    Returns:
        The path string ``<PREFIX_PATH>/<split_name>/dataset.csv``, built with
        ``os.path.join`` (``os`` comes from the notebook's imports cell).
    """
    csv_path = os.path.join(PREFIX_PATH, split_name, 'dataset.csv')
    # exist_ok=True keeps the cell idempotent when it is re-run.
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
    return csv_path


FULL_DATASET = _dataset_path('full')
TRAINING_DATASET = _dataset_path('training')
TRAINING_DATASET_WITH_MISSING = _dataset_path('training_missing')
EVALUATION_DATASET = _dataset_path('evaluation')
EVALUATION_DATASET_WITH_ANOMALIES = _dataset_path('evaluation_anomalies')
SERVING_DATASET = _dataset_path('serving')

# Location of the raw, headerless Covertype data file read by the load cell.
ORIGINAL_DATASET_PATH = './dataset/covertype/covtype.data'

## Preprocess the original dataset

### Load the dataset

In [42]:
# The raw file has no header row, so pandas labels the columns with integers.
df = pd.read_csv(filepath_or_buffer=ORIGINAL_DATASET_PATH, header=None)
raw_shape = df.shape
print(raw_shape)
# Preview the first five rows.
df.head(5)

(581012, 55)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


### Configure soil type and wilderness area domains

In [43]:
# Soil type lookup, stored as one flat list of consecutive triples:
#   a 1-based sequence number (as a string), a soil code, a text description.
# The triples are listed in the same order as the one-hot soil columns of the
# raw data; the decoding cell picks out just the codes with soil_type[1::3].
soil_type = [
"1", "C2702", "Cathedral family - Rock outcrop complex, extremely stony.",
"2", "C2703", "Vanet - Ratake families complex, very stony.",
"3", "C2704", "Haploborolis - Rock outcrop complex, rubbly.",
"4", "C2705", "Ratake family - Rock outcrop complex, rubbly.",
"5", "C2706", "Vanet family - Rock outcrop complex complex, rubbly.",
"6", "C2717", "Vanet - Wetmore families - Rock outcrop complex, stony.",
"7", "C3501", "Gothic family.",
"8", "C3502", "Supervisor - Limber families complex.",
"9", "C4201", "Troutville family, very stony.",
"10", "C4703", "Bullwark - Catamount families - Rock outcrop complex, rubbly.",
"11", "C4704", "Bullwark - Catamount families - Rock land complex, rubbly.",
"12", "C4744", "Legault family - Rock land complex, stony.",
"13", "C4758", "Catamount family - Rock land - Bullwark family complex, rubbly.",
"14", "C5101", "Pachic Argiborolis - Aquolis complex.",
"15", "C5151", "unspecified in the USFS Soil and ELU Survey.",
"16", "C6101", "Cryaquolis - Cryoborolis complex.",
"17", "C6102", "Gateview family - Cryaquolis complex.",
"18", "C6731", "Rogert family, very stony.",
"19", "C7101", "Typic Cryaquolis - Borohemists complex.",
"20", "C7102", "Typic Cryaquepts - Typic Cryaquolls complex.",
"21", "C7103", "Typic Cryaquolls - Leighcan family, till substratum complex.",
"22", "C7201", "Leighcan family, till substratum, extremely bouldery.",
"23", "C7202", "Leighcan family, till substratum - Typic Cryaquolls complex.",
"24", "C7700", "Leighcan family, extremely stony.",
"25", "C7701", "Leighcan family, warm, extremely stony.",
"26", "C7702", "Granile - Catamount families complex, very stony.",
"27", "C7709", "Leighcan family, warm - Rock outcrop complex, extremely stony.",
"28", "C7710", "Leighcan family - Rock outcrop complex, extremely stony.",
"29", "C7745", "Como - Legault families complex, extremely stony.",
"30", "C7746", "Como family - Rock land - Legault family complex, extremely stony.",
"31", "C7755", "Leighcan - Catamount families complex, extremely stony.",
"32", "C7756", "Catamount family - Rock outcrop - Leighcan family complex, extremely stony.",
"33", "C7757", "Leighcan - Catamount families - Rock outcrop complex, extremely stony.",
"34", "C7790", "Cryorthents - Rock land complex, extremely stony.",
"35", "C8703", "Cryumbrepts - Rock outcrop - Cryaquepts complex.",
"36", "C8707", "Bross family - Rock land - Cryumbrepts complex, extremely stony.",
"37", "C8708", "Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony.",
"38", "C8771", "Leighcan - Moran families - Cryaquolls complex, extremely stony.",
"39", "C8772", "Moran family - Cryorthents - Leighcan family complex, extremely stony.",
"40", "C8776", "Moran family - Cryorthents - Rock land complex, extremely stony.",
]

# (short name, full name) for each wilderness area, in one-hot column order.
_WILDERNESS_AREA_PAIRS = (
    ("Rawah", "Rawah Wilderness Area"),
    ("Neota", "Neota Wilderness Area"),
    ("Commanche", "Comanche Peak Wilderness Area"),
    ("Cache", "Cache la Poudre Wilderness Area"),
)

# Flattened to [short, full, short, full, ...]; the decoding cell reads the
# short names with wilderness_area[0::2].
wilderness_area = [field for pair in _WILDERNESS_AREA_PAIRS for field in pair]

### Map one-hot encoded values to categorical domains

In [44]:
# Decode the one-hot soil columns (raw columns 14..53) into soil codes in a
# single vectorized pass instead of calling a Python lambda for every row.
soil_codes = np.array(soil_type[1::3], dtype=object)  # the codes sit at every 3rd entry, from index 1
soil_flags = df.loc[:, 14:53].to_numpy() != 0
# argmax would silently map a row with no active column to the first code, so
# fail loudly instead, with the same exception type the per-row lookup raised.
if not soil_flags.any(axis=1).all():
    raise IndexError('found a row with no soil type column set')
# argmax over booleans returns the position of the first active column per row.
soil = pd.Series(soil_codes[soil_flags.argmax(axis=1)], index=df.index)
soil

0         C7745
1         C7745
2         C4744
3         C7746
4         C7745
          ...  
581007    C2703
581008    C2703
581009    C2703
581010    C2703
581011    C2703
Length: 581012, dtype: object

In [45]:
# Decode the one-hot wilderness columns (raw columns 10..13) into short area
# names in a single vectorized pass instead of a Python lambda per row.
wilderness_names = np.array(wilderness_area[0::2], dtype=object)  # short names sit at the even positions
wilderness_flags = df.loc[:, 10:13].to_numpy() != 0
# argmax would silently map a row with no active column to the first name, so
# fail loudly instead, with the same exception type the per-row lookup raised.
if not wilderness_flags.any(axis=1).all():
    raise IndexError('found a row with no wilderness area column set')
# argmax over booleans returns the position of the first active column per row.
wilderness = pd.Series(wilderness_names[wilderness_flags.argmax(axis=1)], index=df.index)
wilderness

0             Rawah
1             Rawah
2             Rawah
3             Rawah
4             Rawah
            ...    
581007    Commanche
581008    Commanche
581009    Commanche
581010    Commanche
581011    Commanche
Length: 581012, dtype: object

### Create a dataset with column names and categorical values replacing one-hot encoded soil type and wilderness areas

In [46]:
# Output schema: the ten numeric features (raw columns 0..9), the two decoded
# categorical features, and the label taken from raw column 54.
COLUMN_NAMES = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area',
    'Soil_Type',
    'Cover_Type',
]

# Pieces are listed in the same order as COLUMN_NAMES.
full_dataset_parts = [
    df.loc[:, 0:9],
    wilderness,
    soil,
    df.loc[:, 54],
]
# ignore_index=True discards the source column labels; the names are set right after.
df_full = pd.concat(full_dataset_parts, axis=1, ignore_index=True)
df_full.columns = COLUMN_NAMES
df_full

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,Rawah,C7745,5
1,2590,56,2,212,-6,390,220,235,151,6225,Rawah,C7745,5
2,2804,139,9,268,65,3180,234,238,135,6121,Rawah,C4744,2
3,2785,155,18,242,118,3090,238,238,122,6211,Rawah,C7746,2
4,2595,45,2,153,-1,391,220,234,150,6172,Rawah,C7745,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,Commanche,C2703,3
581008,2391,152,19,67,12,95,240,237,119,845,Commanche,C2703,3
581009,2386,159,17,60,7,90,236,241,130,854,Commanche,C2703,3
581010,2384,170,15,60,5,90,230,245,143,864,Commanche,C2703,3


### Save the dataset to CSV file

In [47]:
# Write the named-column dataset with a header row and without the row index.
df_full.to_csv(path_or_buf=FULL_DATASET, index=False, header=True)

In [48]:
!head {FULL_DATASET}

Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
2596,51,3,258,0,510,221,232,148,6279,Rawah,C7745,5
2590,56,2,212,-6,390,220,235,151,6225,Rawah,C7745,5
2804,139,9,268,65,3180,234,238,135,6121,Rawah,C4744,2
2785,155,18,242,118,3090,238,238,122,6211,Rawah,C7746,2
2595,45,2,153,-1,391,220,234,150,6172,Rawah,C7745,5
2579,132,6,300,-15,67,230,237,140,6031,Rawah,C7745,2
2606,45,7,270,5,633,222,225,138,6256,Rawah,C7745,5
2605,49,4,234,7,573,222,230,144,6228,Rawah,C7745,5
2617,45,9,240,56,666,223,221,133,6244,Rawah,C7745,5


## Create training, evaluation and serving splits.

In [49]:
# Reload the saved dataset, forcing Soil_Type to be read as an object column.
# Only df_full is bound here: also rebinding `df` would overwrite the raw
# one-hot frame and make the earlier decoding cells impossible to re-run.
df_full = pd.read_csv(FULL_DATASET, dtype={'Soil_Type': object})
df_full

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,Rawah,C7745,5
1,2590,56,2,212,-6,390,220,235,151,6225,Rawah,C7745,5
2,2804,139,9,268,65,3180,234,238,135,6121,Rawah,C4744,2
3,2785,155,18,242,118,3090,238,238,122,6211,Rawah,C7746,2
4,2595,45,2,153,-1,391,220,234,150,6172,Rawah,C7745,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,Commanche,C2703,3
581008,2391,152,19,67,12,95,240,237,119,845,Commanche,C2703,3
581009,2386,159,17,60,7,90,236,241,130,854,Commanche,C2703,3
581010,2384,170,15,60,5,90,230,245,143,864,Commanche,C2703,3


In [50]:
# Number of rows per soil code, most frequent first.
df_full['Soil_Type'].value_counts()

C7745    115247
C7202     57752
C7756     52519
C7757     45154
C7201     33373
C4703     32634
C7746     30170
C4744     29971
C7755     25666
C7700     21278
C4758     17431
C8771     15573
C8772     13806
C4704     12410
C2705     12396
C7102      9259
C8776      8750
C2703      7525
C2717      6575
C2704      4823
C7101      4021
C6102      3422
C2702      3031
C6101      2845
C7702      2589
C6731      1899
C8703      1891
C7790      1611
C2706      1597
C4201      1147
C7709      1086
C7710       946
C7103       838
C5101       599
C7701       474
C8708       298
C3502       179
C8707       119
C3501       105
C5151         3
Name: Soil_Type, dtype: int64

In [51]:
# Separate the rows with soil code C5151 from everything else, using one mask.
has_soil_5151 = df_full['Soil_Type'] == 'C5151'
df_5151 = df_full[has_soil_5151]
df_no_5151 = df_full[~has_soil_5151]

In [52]:
# Inspect the rows whose Soil_Type is C5151.
df_5151

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
241543,2078,34,10,0,0,212,219,218,134,484,Cache,C5151,6
241544,2080,13,19,30,0,192,198,197,132,499,Cache,C5151,6
241545,2076,27,24,30,5,175,201,180,105,516,Cache,C5151,6


In [53]:
# Inspect the remaining rows (every row whose Soil_Type is not C5151).
df_no_5151

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,Rawah,C7745,5
1,2590,56,2,212,-6,390,220,235,151,6225,Rawah,C7745,5
2,2804,139,9,268,65,3180,234,238,135,6121,Rawah,C4744,2
3,2785,155,18,242,118,3090,238,238,122,6211,Rawah,C7746,2
4,2595,45,2,153,-1,391,220,234,150,6172,Rawah,C7745,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,Commanche,C2703,3
581008,2391,152,19,67,12,95,240,237,119,845,Commanche,C2703,3
581009,2386,159,17,60,7,90,236,241,130,854,Commanche,C2703,3
581010,2384,170,15,60,5,90,230,245,143,864,Commanche,C2703,3


In [54]:
# Fixed seed so both stratified splits come out the same on every run;
# without random_state each run shuffles the rows differently.
SPLIT_RANDOM_STATE = 42
TRAINING_SIZE = 431009    # rows kept for training
EVALUATION_SIZE = 75000   # rows kept for evaluation; the rest become the serving split

# First cut: training rows vs. everything else, stratified on the label.
df_train, df_other = train_test_split(
    df_no_5151,
    train_size=TRAINING_SIZE,
    stratify=df_no_5151.Cover_Type,
    random_state=SPLIT_RANDOM_STATE,
)
# Second cut: divide the remainder into evaluation and serving rows.
df_evaluate, df_serving = train_test_split(
    df_other,
    train_size=EVALUATION_SIZE,
    stratify=df_other.Cover_Type,
    random_state=SPLIT_RANDOM_STATE,
)
# The serving split is written without the label column.
df_serving = df_serving.drop(columns=['Cover_Type'])
print(df_train.shape)
print(df_evaluate.shape)
print(df_serving.shape)

(431009, 13)
(75000, 13)
(75000, 12)


Add some missing values to the training split.

In [55]:
# Renumber the training rows from 0 so the label slice below selects the first rows.
df_train_missing = df_train.reset_index(drop=True)
# .loc slices are label-based and inclusive at both ends: labels 0..8999.
rows_to_blank = slice(0, 8999)
df_train_missing.loc[rows_to_blank, 'Horizontal_Distance_To_Hydrology'] = None
df_train_missing

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2916,229,14,,270,1207,198,252,189,1846,Commanche,C7202,2
1,3212,181,14,,9,2807,224,248,154,2520,Rawah,C7745,7
2,3132,87,13,,-3,2871,239,218,108,5641,Rawah,C7745,1
3,3028,89,15,,89,350,242,214,98,1892,Commanche,C7757,2
4,3268,6,9,,34,3218,210,223,151,2242,Rawah,C7745,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
431004,3228,149,30,124.0,44,1707,242,222,85,417,Commanche,C8772,7
431005,3111,264,24,426.0,64,5871,152,243,225,1822,Rawah,C7746,2
431006,3088,251,10,190.0,21,5666,198,247,187,2218,Rawah,C7202,2
431007,3393,274,8,376.0,39,1689,200,242,182,700,Commanche,C8771,7


Create an evaluation split with anomalies: the first 5 examples have `Slope` set to 110 (more than 90 degrees), and 3 examples have soil type code `C5151`, which is not present in the training split.

In [56]:
# Renumber the evaluation rows from 0 so the label slice below hits the first rows.
evaluate_renumbered = df_evaluate.reset_index(drop=True)
# Labels 0..4 inclusive get a Slope of 110.
evaluate_renumbered.loc[0:4, 'Slope'] = 110
# Append the C5151 rows that were held out of every split; they keep their
# original index labels.
df_evaluate_anomalies = pd.concat([evaluate_renumbered, df_5151])
df_evaluate_anomalies

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,3178,58,110,485,50,4513,228,218,123,2240,Rawah,C7745,2
1,3010,45,110,30,0,2940,219,235,152,2224,Commanche,C7202,1
2,3044,106,110,510,61,2644,245,220,99,2711,Commanche,C7702,2
3,3031,151,110,641,214,2100,222,239,152,2800,Commanche,C7756,2
4,2761,144,110,190,31,2569,232,239,138,2139,Commanche,C4704,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74998,3331,107,17,150,31,2466,247,217,92,1911,Commanche,C7756,1
74999,3223,292,4,421,39,459,207,239,170,1505,Commanche,C7756,2
241543,2078,34,10,0,0,212,219,218,134,484,Cache,C5151,6
241544,2080,13,19,30,0,192,198,197,132,499,Cache,C5151,6


In [57]:
# Rows per soil code in the anomalous evaluation split; C5151 should now appear.
df_evaluate_anomalies['Soil_Type'].value_counts()

C7745    14977
C7202     7497
C7756     6789
C7757     5903
C7201     4310
C4703     4212
C4744     3913
C7746     3883
C7755     3251
C7700     2710
C4758     2196
C8771     1982
C8772     1759
C4704     1582
C2705     1563
C7102     1190
C8776     1080
C2703     1001
C2717      786
C2704      682
C7101      483
C6102      458
C2702      387
C6101      375
C7702      364
C8703      252
C6731      239
C7790      225
C2706      189
C4201      169
C7709      144
C7710      115
C7103      110
C5101       75
C7701       61
C8708       34
C3502       25
C8707       15
C3501       14
C5151        3
Name: Soil_Type, dtype: int64

### Save the splits to local files.

In [58]:
# Each split is written to its own CSV, with a header row and no index column.
splits_to_save = (
    (df_train, TRAINING_DATASET),
    (df_train_missing, TRAINING_DATASET_WITH_MISSING),
    (df_evaluate, EVALUATION_DATASET),
    (df_evaluate_anomalies, EVALUATION_DATASET_WITH_ANOMALIES),
    (df_serving, SERVING_DATASET),
)
for split_frame, split_path in splits_to_save:
    split_frame.to_csv(split_path, header=True, index=False)

In [61]:
# Quote the pattern so the shell hands it to find unexpanded; unquoted, it
# would be expanded against any matching file in the working directory.
!find {PREFIX_PATH} -name '*.csv*'

./dataset/covertype/preprocessed/training_missing/dataset.csv
./dataset/covertype/preprocessed/evaluation_anomalies/dataset.csv
./dataset/covertype/preprocessed/full/dataset.csv
./dataset/covertype/preprocessed/serving/dataset.csv
./dataset/covertype/preprocessed/training/dataset.csv
./dataset/covertype/preprocessed/evaluation/dataset.csv
