# Step 1: Evaluate several models

The next code cell defines five different random forest models.  Run this code cell without changes.  (_To review **random forests**, look [here](https://www.kaggle.com/dansbecker/random-forests)._)

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
X_full = pd.read_csv('input/train.csv', index_col='Id')
X_test_full = pd.read_csv('input/test.csv', index_col='Id')

# Obtain target and predictors
y = X_full.SalePrice
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = X_full[features].copy()
X_test = X_test_full[features].copy()

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Define the models
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

In [8]:
from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))

Model 1 MAE: 24015
Model 2 MAE: 23740
Model 3 MAE: 23528
Model 4 MAE: 23996
Model 5 MAE: 23706


# Step 2: Generate test predictions

Great. You know how to evaluate what makes an accurate model. Now it's time to go through the modeling process and make predictions. In the line below, create a Random Forest model with the variable name `my_model`.

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
X_full = pd.read_csv('input/train.csv', index_col='Id')
X_test_full = pd.read_csv('input/test.csv', index_col='Id')

In [16]:
list(set(X_full.columns).intersection(set(X_test_full.columns)))

['BsmtExposure',
 'PoolQC',
 'Neighborhood',
 'LandSlope',
 'Utilities',
 'Electrical',
 'KitchenAbvGr',
 'YearBuilt',
 'Foundation',
 'KitchenQual',
 'GarageCars',
 'MasVnrType',
 'LotArea',
 'EnclosedPorch',
 'Alley',
 'BsmtUnfSF',
 '3SsnPorch',
 'TotRmsAbvGrd',
 'BsmtQual',
 'YearRemodAdd',
 'OverallCond',
 'MoSold',
 'SaleCondition',
 'OverallQual',
 'TotalBsmtSF',
 'GarageYrBlt',
 'LotFrontage',
 'LandContour',
 'Functional',
 'GarageFinish',
 '1stFlrSF',
 'HalfBath',
 'BsmtHalfBath',
 'Exterior2nd',
 'BsmtFullBath',
 'PavedDrive',
 'GrLivArea',
 'MiscVal',
 'RoofStyle',
 '2ndFlrSF',
 'HouseStyle',
 'LotShape',
 'GarageCond',
 'YrSold',
 'LotConfig',
 'Condition2',
 'ExterCond',
 'Street',
 'BsmtFinSF1',
 'Fence',
 'GarageType',
 'GarageArea',
 'FullBath',
 'MSSubClass',
 'Heating',
 'GarageQual',
 'BsmtFinType1',
 'HeatingQC',
 'SaleType',
 'OpenPorchSF',
 'LowQualFinSF',
 'BsmtFinType2',
 'Fireplaces',
 'BldgType',
 'RoofMatl',
 'MiscFeature',
 'Condition1',
 'BsmtFinSF2',
 'Ext

In [39]:
# Obtain target and predictors
X_full = X_full.dropna(axis=1)
X_test_full = X_test_full.dropna(axis=1)

y = X_full.SalePrice
features = list(set(X_full.columns).intersection(set(X_test_full.columns)))
X = X_full[features].copy()

X_test = X_test_full[features].copy()

# Use the whole dataset
# Break off validation set from training data
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
#                                                       random_state=0)


`For categorical data we have to use one-hot encoding`

`li Use label Encoder() to convert consecutive int, but the average of 2 items might be another item? So not frequently used`


In [40]:
# Use the score_model function
from sklearn.metrics import mean_absolute_error

def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)


test_set may have a different number of dummy variables compared to the training_set. 

https://blog.csdn.net/maotianyi941005/article/details/88582464

1. get_dummies to both dataset
2. add surplus features from the training set to the test set
3. delete surplus features from test set

In [52]:
list(set(one_hot_X.columns)-set(one_hot_X_test.columns))

['HouseStyle_2.5Fin',
 'RoofMatl_ClyTile',
 'Heating_Floor',
 'RoofMatl_Membran',
 'RoofMatl_Roll',
 'RoofMatl_Metal',
 'Heating_OthW',
 'Condition2_RRNn',
 'Condition2_RRAn',
 'Condition2_RRAe']

In [59]:
# One way to reset columns

one_hot_X = pd.get_dummies(X)
one_hot_X_test = pd.get_dummies(X_test)
# added_columns = list(set(one_hot_X.columns)-set(one_hot_X_test.columns))




one_hot_X = one_hot_X.reindex(columns = one_hot_X_test.columns)



In [61]:
# This is a regression problem
from sklearn.ensemble import RandomForestRegressor

my_model = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
my_model.fit(one_hot_X, y)
preds = my_model.predict(one_hot_X_test)
print(preds)

[125946.   155585.   165842.52 ... 146542.5  132606.9  239510.32]
