In [108]:
import pandas as pd 
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import warnings
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

In [122]:
data = pd.read_csv("train.csv")
print (data.describe())
print (data.columns)
scaler = MinMaxScaler()
data['Elevation'] = scaler.fit_transform(data['Elevation'].values.reshape(-1,1))
data['Aspect'] = scaler.fit_transform(data['Aspect'].values.reshape(-1,1))
data['Slope'] = scaler.fit_transform(data['Slope'].values.reshape(-1,1))
data['Horizontal_Distance_To_Hydrology'] = scaler.fit_transform(data['Horizontal_Distance_To_Hydrology'].values.reshape(-1,1))
data['Vertical_Distance_To_Hydrology'] = scaler.fit_transform(data['Vertical_Distance_To_Hydrology'].values.reshape(-1,1))
data['Horizontal_Distance_To_Roadways'] = scaler.fit_transform(data['Horizontal_Distance_To_Roadways'].values.reshape(-1,1))
data['Hillshade_9am'] = scaler.fit_transform(data['Hillshade_9am'].values.reshape(-1,1))
data['Hillshade_Noon'] = scaler.fit_transform(data['Hillshade_Noon'].values.reshape(-1,1))
data['Hillshade_3pm'] = scaler.fit_transform(data['Hillshade_3pm'].values.reshape(-1,1))
data['Horizontal_Distance_To_Fire_Points'] = scaler.fit_transform(data['Horizontal_Distance_To_Fire_Points'].values.reshape(-1,1))
X = np.array(data.as_matrix(columns=data.columns[1:55]))
Y = np.array(data["Cover_Type"].tolist())

shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y = X[shuffle], Y[shuffle]
print('data shape: ', X.shape)
print('label shape:', Y.shape)
test_data, test_labels = X[11000:], Y[11000:]
dev_data, dev_labels = X[10000:11000], Y[10000:11000]
train_data, train_labels = X[:10000], Y[:10000]

                Id     Elevation        Aspect         Slope  \
count  15120.00000  15120.000000  15120.000000  15120.000000   
mean    7560.50000   2749.322553    156.676653     16.501587   
std     4364.91237    417.678187    110.085801      8.453927   
min        1.00000   1863.000000      0.000000      0.000000   
25%     3780.75000   2376.000000     65.000000     10.000000   
50%     7560.50000   2752.000000    126.000000     15.000000   
75%    11340.25000   3104.000000    261.000000     22.000000   
max    15120.00000   3849.000000    360.000000     52.000000   

       Horizontal_Distance_To_Hydrology  Vertical_Distance_To_Hydrology  \
count                      15120.000000                    15120.000000   
mean                         227.195701                       51.076521   
std                          210.075296                       61.239406   
min                            0.000000                     -146.000000   
25%                           67.000000         

In [130]:
def KNN(k_values):

### STUDENT START ###
    for k_value in k_values:
        model = KNeighborsClassifier(n_neighbors=k_value)
        model.fit(train_data,train_labels)
        predicted= model.predict(dev_data)
        print ( 'Accuray on dev set for k = ' , k_value , " : " , model.score(dev_data , dev_labels))
        print(classification_report(dev_labels, predicted))
    
### STUDENT END ###

k_values = [1, 3, 5, 7, 9]
KNN(k_values)

Accuray on dev set for k =  1  :  0.813
              precision    recall  f1-score   support

           1       0.68      0.66      0.67       137
           2       0.72      0.61      0.66       148
           3       0.83      0.80      0.82       150
           4       0.88      0.93      0.90       139
           5       0.82      0.90      0.86       126
           6       0.80      0.85      0.83       148
           7       0.92      0.93      0.93       152

    accuracy                           0.81      1000
   macro avg       0.81      0.81      0.81      1000
weighted avg       0.81      0.81      0.81      1000

Accuray on dev set for k =  3  :  0.79
              precision    recall  f1-score   support

           1       0.65      0.66      0.65       137
           2       0.70      0.57      0.63       148
           3       0.77      0.75      0.76       150
           4       0.86      0.92      0.89       139
           5       0.83      0.91      0.87       126

In [138]:

    
def logistic_regression(train_vector , train_labels , dev_vector , dev_labels ):
    print ( "\033[1m" ,  "Logistic Regression " , "\033[0;0m" )
    C_values= { 'C' : [ 0.01, 0.05 , 0.1,  1.0,10.0]}
    grd_model = GridSearchCV( LogisticRegression(penalty='elasticnet', solver='saga' , l1_ratio =0),  param_grid = C_values ,return_train_score = 1)
    grd_model.fit(train_vector,train_labels)
    print ("Best fit parameters :")
    print (grd_model.best_params_)
    print ("Best fit model F1 score :")
    predicted= grd_model.predict(dev_vector)
    print(metrics.f1_score(dev_labels, predicted , average='micro'))
warnings.filterwarnings(action='ignore')    
logistic_regression(train_data , train_labels , dev_data , dev_labels )   

[1m Logistic Regression  [0;0m
Best fit parameters :
{'C': 10.0}
Best fit model F1 score :
0.66


In [101]:
pca = PCA(n_components=20)
pca.fit(train_data)
train_pca = pca.transform(train_data)
dev_pca = pca.transform(dev_data)
logistic_regression(train_pca , train_labels , dev_pca , dev_labels ) 

[1m Logistic Regression  [0;0m
Best fit parameters :
{'C': 10.0}
Best fit model F1 score :
0.615


In [126]:
def classify_multinomial(train_data , train_labels , dev_data , dev_labels ):
    model_m = MultinomialNB(alpha = 1)
    model_m.fit(train_data, train_labels)
    print ("Accuracy for MultinomialNB :")
    print(model_m.score(dev_data , dev_labels)) 
classify_multinomial(train_data , train_labels , dev_data , dev_labels )     

Accuracy for MultinomialNB :
0.564


In [132]:
def classify_random_forest (train_data , train_labels , dev_data , dev_labels  ):
    C_values= { 'max_depth' : [20,25 , 35,40,50] , 'n_estimators' : [5,15,20,25] }
    grd_model = GridSearchCV( RandomForestClassifier(random_state=0),  param_grid = C_values ,return_train_score = 1)
    grd_model.fit(train_data,train_labels)
    print ("Best fit parameters :")
    print (grd_model.best_params_)
    print ("Best fit model F1 score :")
    predicted= grd_model.predict(dev_data)
    print(metrics.f1_score(dev_labels, predicted , average='micro')) 
    return grd_model
grd_model = classify_random_forest(train_data , train_labels , dev_data , dev_labels )  


Best fit parameters :
{'max_depth': 35, 'n_estimators': 25}
Best fit model F1 score :
0.843


In [152]:
imp_feature = np.argsort(grd_model.best_estimator_.feature_importances_)[::-1] + 1
print (data.columns[imp_feature])
predicted= grd_model.predict(test_data)
print(metrics.f1_score(test_labels, predicted , average='micro')) 

Index(['Elevation', 'Horizontal_Distance_To_Roadways',
       'Horizontal_Distance_To_Fire_Points',
       'Horizontal_Distance_To_Hydrology', 'Wilderness_Area4', 'Hillshade_9am',
       'Vertical_Distance_To_Hydrology', 'Aspect', 'Hillshade_3pm',
       'Hillshade_Noon', 'Slope', 'Soil_Type10', 'Soil_Type3', 'Soil_Type39',
       'Wilderness_Area1', 'Wilderness_Area3', 'Soil_Type38', 'Soil_Type4',
       'Soil_Type40', 'Soil_Type2', 'Soil_Type13', 'Soil_Type30',
       'Soil_Type17', 'Soil_Type22', 'Soil_Type29', 'Soil_Type23',
       'Soil_Type32', 'Soil_Type12', 'Wilderness_Area2', 'Soil_Type33',
       'Soil_Type11', 'Soil_Type31', 'Soil_Type24', 'Soil_Type35',
       'Soil_Type6', 'Soil_Type1', 'Soil_Type20', 'Soil_Type5', 'Soil_Type16',
       'Soil_Type26', 'Soil_Type37', 'Soil_Type18', 'Soil_Type14',
       'Soil_Type19', 'Soil_Type34', 'Soil_Type21', 'Soil_Type36',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type9', 'Soil_Type25',
       'Soil_Type15', 'Soil_Type8', 'Soil_Type7