# Kaggle Challenge Day 1: Permutation importance
Machine Learning for Insights challenge

- Feature engineering - involves repeatedly creating new features using transformations of your raw data or features you've previously created

- Which features have the biggest impact on predictions? This concept is called feature importance

**Linear Algebra Topics such as:
- Principal Component Analysis (PCA)
- Singular Value Decomposition (SVD)
- Eigendecomposition of a matrix
- LU Decomposition
- QR Decomposition/Factorization
- Symmetric Matrices
- Orthogonalization & Orthonormalization
- Matrix Operations, Projections, Eigenvalues & Eigenvectors, Vector Spaces and Norms are needed for understanding the optimization methods used for machine learning**



In [14]:
#Code example

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
data = pd.read_csv('FIFA 2018 Statistics.csv')
y = (data['Man of the Match'] == "Yes")  # Convert from string "Yes"/"No" to binary
feature_names = [i for i in data.columns if data[i].dtype in [np.int64]]
X = data[feature_names]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
my_model = RandomForestClassifier(random_state=0).fit(train_X, train_y)
my_model


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

# Permutation Importance
Taxi Fare Prediction competition

In [31]:
# Loading data, dividing, modeling and EDA below
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

data = pd.read_csv('TaxiFare.csv', nrows=50000)

# Remove data with extreme outlier coordinates or negative fares
data = data.query('pickup_latitude > 40.7 and pickup_latitude < 40.8 and ' +
                  'dropoff_latitude > 40.7 and dropoff_latitude < 40.8 and ' +
                  'pickup_longitude > -74 and pickup_longitude < -73.9 and ' +
                  'dropoff_longitude > -74 and dropoff_longitude < -73.9 and ' +
                  'fare_amount > 0'
                  )

y = data.fare_amount
y

base_features = ['pickup_longitude',
                 'pickup_latitude',
                 'dropoff_longitude',
                 'dropoff_latitude']

X = data[base_features]


print(X.shape)
print(y.shape)



# Features


train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# number of trees in forest
# random_state =1 seed
# .fit(fits the data)

first_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(train_X, train_y)



# understanding training data
train_X.describe()
train_y.describe()










(31289, 4)
(31289,)


count    23466.000000
mean         8.472539
std          4.609747
min          0.010000
25%          5.500000
50%          7.500000
75%         10.100000
max        165.000000
Name: fare_amount, dtype: float64

In [38]:
import eli5
from eli5.sklearn import PermutationImportance


# Make a small change to the code below to use in this problem. 
perm = PermutationImportance(first_model, random_state=1).fit(val_X, val_y)

eli5.show_weights(perm, feature_names = val_X.columns.tolist())

Weight,Feature
0.8585  ± 0.0251,dropoff_latitude
0.8408  ± 0.0358,pickup_latitude
0.6113  ± 0.0607,pickup_longitude
0.5435  ± 0.0175,dropoff_longitude


In [37]:
# create new features
data['abs_lon_change'] = abs(data.dropoff_longitude - data.pickup_longitude)
data['abs_lat_change'] = abs(data.dropoff_latitude - data.pickup_latitude)

features_2  = ['pickup_longitude',
               'pickup_latitude',
               'dropoff_longitude',
               'dropoff_latitude',
               'abs_lat_change',
               'abs_lon_change']

X = data[features_2]
new_train_X, new_val_X, new_train_y, new_val_y = train_test_split(X, y, random_state=1)
second_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(new_train_X, new_train_y)

# Create a PermutationImportance object on second_model and fit it to new_val_X and new_val_y
# Use a random_state of 1 for reproducible results that match the expected solution.
perm2 = PermutationImportance(second_model, random_state=1).fit(new_val_X, new_val_y)

# show the weights for the permutation importance you just calculated
eli5.show_weights(perm2, feature_names = new_val_X.columns.tolist())


Weight,Feature
0.5783  ± 0.0295,abs_lat_change
0.4467  ± 0.0509,abs_lon_change
0.0858  ± 0.0333,pickup_latitude
0.0735  ± 0.0101,dropoff_longitude
0.0733  ± 0.0113,dropoff_latitude
0.0613  ± 0.0063,pickup_longitude


In [39]:
# Create a PermutationImportance object on second_model and fit it to new_val_X and new_val_y
perm2 = PermutationImportance(second_model).fit(new_val_X, new_val_y)

# show the weights for the permutation importance you just calculated
eli5.show_weights(perm2, feature_names = features_2)

Weight,Feature
0.5787  ± 0.0475,abs_lat_change
0.4415  ± 0.0566,abs_lon_change
0.0860  ± 0.0189,pickup_latitude
0.0792  ± 0.0102,dropoff_latitude
0.0778  ± 0.0098,dropoff_longitude
0.0641  ± 0.0156,pickup_longitude
