# Cover Type Prediction of Forests

In [1]:
import pandas as pd
import sklearn
import numpy as np

In [2]:
# Training data (includes Cover_Type) and the data to predict for the submission.
TRAIN_CSV = "train-data.csv"
SUBMISSION_CSV = "test-data.csv"

completeData = pd.read_csv(TRAIN_CSV)
submissionData = pd.read_csv(SUBMISSION_CSV)

In [3]:
# Preview the first rows of the training data.
completeData.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


Some data visualization in Trifacta gave me a few insights: the *Cover_Type* distribution is balanced in the complete data and there are no apparent missing values. Some *Hillshade_3pm* values look suspicious, but their impact is insignificant.

In [4]:
# Separate the row identifiers from the features. `drop` returns a new frame,
# so the frames loaded from CSV are left untouched and this cell can be re-run
# without a KeyError on 'Id'.
idComplete = completeData['Id']
df = completeData.drop('Id', axis=1)
idSubmission = submissionData['Id']
df2 = submissionData.drop('Id', axis=1)

## Feature engineering

In order to avoid overfitting, it is often useful to manually reduce the number of features, based on an examination of the data. The *Soil_Type* columns are very sparse, and we know that we can merge them by climatic zone and by geologic zone (cf. Study Code USFS ELU Code Description).

In [5]:
def climatic_zones(df):
    """Add one column per climatic zone, each the sum of its soil-type columns.

    The new columns are written onto ``df`` itself, which is also returned.
    Reads the columns ``Soil_Type1`` through ``Soil_Type40``.
    """
    # (zone column, first soil-type number, last soil-type number), inclusive.
    zone_ranges = (
        ("Lower_montane_dry", 1, 6),
        ("Lower_montane", 7, 8),
        ("Montane_dry", 9, 9),
        ("Montane", 10, 13),
        ("Montane_dry_and_montane", 14, 15),
        ("Montane_and_subalpine", 16, 18),
        ("Subalpine", 19, 34),
        ("Alpine", 35, 40),
    )
    for zone, first, last in zone_ranges:
        total = df["Soil_Type%d" % first]
        for number in range(first + 1, last + 1):
            total = total + df["Soil_Type%d" % number]
        df[zone] = total
    return df

In [6]:
def geologic_zones(df):
    """Add one column per geologic zone, each the sum of its soil-type columns.

    The new columns are written onto ``df`` itself, which is also returned.
    Reads the columns ``Soil_Type1`` through ``Soil_Type40``.
    """
    # Ordered (zone column, soil-type numbers) pairs; a tuple keeps the
    # resulting column order fixed.
    zone_members = (
        ("Igneous_and_metamorphic",
         [1, 2, 3, 4, 5, 6, 10, 11, 12, 13, 18] + list(range(24, 41))),
        ("Mixed_sedimentary", [7, 8]),
        ("Glacial", [9, 22, 23]),
        ("Alluvium", [14, 15, 16, 17, 19, 20, 21]),
    )
    for zone, numbers in zone_members:
        members = [df["Soil_Type%d" % number] for number in numbers]
        # Built-in sum adds left to right, starting from the first member.
        df[zone] = sum(members[1:], members[0])
    return df

As we extracted all information from *Soil_Type(s)* values, we can remove all these columns now.

In [7]:
def remove_soil_cols(df):
    """Drop ``Soil_Type1`` ... ``Soil_Type40`` from ``df`` in place and return it."""
    soil_columns = ['Soil_Type%d' % number for number in range(1, 41)]
    df.drop(soil_columns, axis=1, inplace=True)
    return df

Then we merge the horizontal and vertical distances to hydrology into a single column, using the Pythagorean theorem.

In [8]:
def distance_to_hydrology(df):
    """Replace the two hydrology distance columns with their Euclidean norm.

    Adds ``Distance_To_Hydrology`` to ``df`` in place, removes the horizontal
    and vertical source columns, and returns ``df``.
    """
    horizontal = df['Horizontal_Distance_To_Hydrology']
    vertical = df['Vertical_Distance_To_Hydrology']
    squared_distance = horizontal * horizontal + vertical * vertical
    df["Distance_To_Hydrology"] = np.sqrt(squared_distance)
    df.drop(['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'], axis=1, inplace=True)
    return df

As the *Aspect* feature is an angle, we need to transform it so that the hidden similarity between distant values like 10 and 350 is reflected. To do that, we replace the *Aspect* values by their sine and cosine.

In [9]:
def new_aspect(df):
    """Replace ``Aspect`` (degrees) with its sine and cosine.

    Adds ``Sin`` and ``Cos`` to ``df`` in place, removes ``Aspect``, and
    returns ``df``.
    """
    # Degrees -> radians, computed once and shared by both projections.
    aspect_radians = df["Aspect"] * np.pi / 180
    df["Sin"] = np.sin(aspect_radians)
    df["Cos"] = np.cos(aspect_radians)
    del df['Aspect']
    return df

In [10]:
# Run the feature-engineering steps on the training frame, in this order:
# the zone columns must be built before the soil-type columns are removed.
df = (
    df.pipe(climatic_zones)
      .pipe(geologic_zones)
      .pipe(remove_soil_cols)
      .pipe(distance_to_hydrology)
      .pipe(new_aspect)
)

In [11]:
# List the columns that remain after feature engineering.
df.columns

Index(['Elevation', 'Slope', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Cover_Type', 'Lower_montane_dry', 'Lower_montane', 'Montane_dry',
       'Montane', 'Montane_dry_and_montane', 'Montane_and_subalpine',
       'Subalpine', 'Alpine', 'Igneous_and_metamorphic', 'Mixed_sedimentary',
       'Glacial', 'Alluvium', 'Distance_To_Hydrology', 'Sin', 'Cos'],
      dtype='object')

In [12]:
# Target vector and feature matrix. The target is selected rather than popped,
# so `df` is not modified and this cell gives the same result when re-run.
y = df['Cover_Type'].values
X = df.drop('Cover_Type', axis=1).values

## Data pre-processing

First, we standardize the data (zero mean and unit variance for each feature).

In [13]:
from sklearn.preprocessing import StandardScaler
# NOTE(review): the scaler is fit on every row of X, before the train/test
# split and before cross-validation, so held-out rows contribute to the
# mean/std used for scaling. Consider moving StandardScaler into the Pipeline.
scaler = StandardScaler()
# X is overwritten with its scaled version; re-running this cell on its own
# would refit the scaler on data that has already been scaled.
X = scaler.fit_transform(X)

Then we split the complete data in order to get a train set and a test set.

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE(review): the grid search further down is fit on the full X, y rather
# than on this split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

I started to test classifiers using the simplest one: Logistic Regression. This gave me a first reference score.

Then, I tested a few others and compared their scores; best results on the test set were the ones using Random Forest and SVC classifiers.

As a next step, I tested both classifiers in a pipeline including some feature selection. SVC gave the best score, so I kept this classifier and finally performed an iterative exhaustive search on C and gamma parameters.

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Baselines compared before settling on the pipeline below:
# LogisticRegression(), RandomForestClassifier(n_estimators=100), SVC().

pipe = Pipeline([
    # Tree-based importance filter. It is seeded so that the selected feature
    # subset, and therefore the grid-search winner, is the same on every run.
    ('fs', SelectFromModel(ExtraTreesClassifier(n_estimators=100, random_state=42))),
    ('clf', SVC())
])

# Narrow grid around the best C / gamma values.
params = {
    'clf__gamma': [0.09, 0.1, 0.11],
    'clf__C': [125, 150, 175],
}

# With the default refit, the best parameter combination is refit on all of
# X, y and exposed as gs.best_estimator_.
gs = GridSearchCV(pipe, params)
gs.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('fs', SelectFromModel(estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0....,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'clf__gamma': [0.09, 0.1, 0.11], 'clf__C': [125, 150, 175]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [16]:
# Parameter combination selected by the grid search.
gs.best_params_

{'clf__C': 125, 'clf__gamma': 0.1}

Once the model was ready, I applied it to submission data.

In [17]:
# Apply the same feature-engineering steps, in the same order, to the
# submission frame so its columns line up with the training features.
df2 = (
    df2.pipe(climatic_zones)
       .pipe(geologic_zones)
       .pipe(remove_soil_cols)
       .pipe(distance_to_hydrology)
       .pipe(new_aspect)
)

In [18]:
# Scale the submission features with the scaler fitted earlier in the notebook,
# then predict with the estimator refit by the grid search.
X_kaggle = scaler.transform(df2.values)
final_model = gs.best_estimator_
y_kaggle = final_model.predict(X_kaggle)

In [19]:
# One predicted Cover_Type per submission row, indexed by the original Id.
df_kaggle = pd.DataFrame(
    data=y_kaggle,
    index=idSubmission,
    columns=['Cover_Type'],
)
df_kaggle.head()

Unnamed: 0_level_0,Cover_Type
Id,Unnamed: 1_level_1
15121,7
15122,7
15123,7
15124,7
15125,7


In [20]:
# Write the predictions to disk; the index is written as the first column.
SUBMISSION_PATH = "submission.csv"
df_kaggle.to_csv(SUBMISSION_PATH, index=True)

Et voilà! :)

**Nota bene: Through repeated submissions on Kaggle, I quickly noticed that the *Cover_Type* distribution in the submission data is not balanced at all, although it is perfectly balanced in the complete data that we used to train our models. However, I decided not to use this information in my process (for example, I could have used the class_weight parameter of the SVC to reflect the actual distribution of the submission data), as I consider that to be data leakage.**