# <center> Random Forest Regressor </center>
**Summary of Actions**
* Custom RMSE evaluation metric on logged values
* A simple random forest regressor model
 

### Import Preliminaries

In [None]:
%matplotlib inline

# Import modules
from math import sqrt
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold

# Import data
# Both CSVs are read with their 'Id' column as the index. read_csv already
# returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
train_df = pd.read_csv(
    'https://www.dropbox.com/s/8ftdd4tejbmesy4/featured_train_df.csv?dl=1', index_col='Id')
test_df = pd.read_csv(
    'https://www.dropbox.com/s/h672z8790ehkxuj/featured_test_df.csv?dl=1', index_col='Id')

# Set pandas options
# Fully-qualified names are used because short patterns such as 'precision'
# can match more than one registered option and raise OptionError.
# Precision is set once (the earlier value of 1 was immediately overridden by 3).
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.precision', 3)

# List of dataframe
dfs = [train_df, test_df]

### DataFrame to Values

In [None]:
# Turn the frames into plain NumPy arrays:
#   X      -> every training column except the 'SalePrice' target
#   y      -> the 'SalePrice' target column
#   X_test -> all columns of the test frame
feature_mask = train_df.columns != 'SalePrice'
X = train_df.loc[:, feature_mask].copy().to_numpy()
y = train_df['SalePrice'].to_numpy()
X_test = test_df.to_numpy()

### Fitting the Model 

In [None]:
# Creating and fitting the model
# random_state is fixed so that a fresh "Restart & Run All" rebuilds the same
# forest; without it every run gives different predictions and scores.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)

### Model Predictions

In [None]:
# Assemble the submission frame: the test index as a regular column,
# followed by the predicted 'SalePrice' for each test row.
saleprice = pd.Series(model.predict(X_test), name='SalePrice').to_frame()
predictions = test_df.index.to_frame(index=False).join(saleprice)

# View test predictions

In [None]:
# Put the actual 'SalePrice' next to the model's prediction for the same
# training row (both share the training index), then peek at three random rows.
train_df_prediction = train_df[['SalePrice']].assign(
    **{'Pred SalePrice': model.predict(X)})
train_df_prediction.sample(3)

### Residual Plot

In [None]:
# Plotting training data residuals
# x and y are passed by keyword: recent seaborn releases accept them as
# keyword-only arguments and treat the first positional argument as `data`.
# NOTE(review): sns.residplot regresses y on x and plots the residuals of that
# fit, not (actual - predicted) directly -- confirm this is the intended view.
residplot = sns.residplot(
    x=train_df.SalePrice, y=train_df_prediction['Pred SalePrice'], color='darkred')
plt.title('Model Residual Plot')
# Clamp the y-axis so a few extreme residuals do not flatten the rest of the plot
residplot.set(ylim=(-200000, 200000))
residplot

### Model Cross-Validation

In [None]:
# Creating list to store KFold scores
fold_scores = []

# Retrieve KFold scores on the held-out part of each split, 10 folds
kf = KFold(n_splits=10)
for train_index, val_index in kf.split(X):

    # Fold-local names (X_val / y_val) are used on purpose: reusing X_test here
    # would overwrite the notebook-level test matrix built from test_df.
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Fit a fresh estimator with the same hyper-parameters so the notebook-level
    # `model` is not silently left fitted on the last fold only.
    fold_model = RandomForestRegressor(**model.get_params())
    fold_model.fit(X_train, y_train)

    y_pred = fold_model.predict(X_val)

    # RMSE on log-transformed values; np.abs keeps np.log defined should a
    # prediction ever be negative.
    rmse = sqrt(mean_squared_error(np.log(y_val), np.log(np.abs(y_pred))))

    fold_scores.append(rmse)

# Mean Logged RMSE evaluation metric for our model
scores = pd.Series(fold_scores, name='Scores')
print('Logged RMSE:', round(scores.mean(),4))

### Recursive Feature Elimination

In [None]:
# Recursive feature elimination wrapped around `model`, evaluated with
# 10-fold cross-validation and scored by negative mean squared error.
selector = RFECV(model, scoring='neg_mean_squared_error', cv=10)
selector.fit(X, y)
print(f'Optimal Number of Features {selector.n_features_:d}')

In [None]:
# Print the Optimal Features
# selector.support_ is a boolean mask over the feature columns (every training
# column except 'SalePrice'), so it is applied to that same column list.
feature_columns = train_df.drop(['SalePrice'], axis=1).columns
rfeatures = feature_columns[selector.support_]

# Cap the sample size: Series.sample raises ValueError when asked for more
# items than exist, which happens if fewer than 10 features were kept.
pd.Series(rfeatures).sample(min(10, len(rfeatures)))

In [None]:
# Reduced DataFrame to Values
# Dedicated names are used for the reduced matrices and model so the
# full-feature X / X_test / model from the earlier cells are not overwritten;
# those cells can then still be re-run after this one.
# Columns are selected by name, so train and test share the same feature order.
X_selected = train_df[list(rfeatures)].values
y = train_df.SalePrice.values  # target is unchanged by the feature selection
X_test_selected = test_df[list(rfeatures)].values

# Fit the Model
# Fixed random_state keeps the exported predictions reproducible across runs.
selected_model = RandomForestRegressor(random_state=42)
selected_model.fit(X_selected, y)

# create predictions dataframe (test index as a column + predicted SalePrice);
# `predictions` is the frame written out by the export cell.
predictions = pd.DataFrame(test_df.index)
saleprice = pd.DataFrame(selected_model.predict(X_test_selected),
                          columns=['SalePrice'])
predictions = pd.concat([predictions, saleprice],axis=1)

### Export Results

In [None]:
# Exporting data without index
# Create the output folder first: to_csv does not create missing parent
# directories and would fail on a fresh checkout without 'Submissions/'.
submission_path = Path('Submissions/rfg_recursive_predicition.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
predictions.to_csv(submission_path, index=False)

### Accuracy Scores

<br>Feature Engineering 1 - RMSE: 0.1529
<br>Feature Engineering 2 - RMSE: 0.1489
<br>Feature Engineering 3 - RMSE: 0.1503

### Kaggle Scores

<br>Feature Engineering 1 - RMSE: 0.15486
<br>Feature Engineering 2 - RMSE: 0.15934
<br>Feature Engineering 3 - RMSE: 0.15565