In [2]:
import pandas as pd  

# Load the training data for our ML model from the CSV file into a DataFrame with read_csv().
home_data = pd.read_csv("train.csv")

In [3]:
# List the column names so we can find the output column (y) that the model should learn to predict

print (home_data.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
# SalePrice is the outcome we want to predict, so the data of the
# 'SalePrice' column becomes the training label (y).
y = home_data["SalePrice"]  # attribute access (home_data.SalePrice) works as well

# A single column selected from a DataFrame is returned as a pandas Series
print(type(y))

<class 'pandas.core.series.Series'>


In [5]:
# Names of the columns used as training features (the input, denoted as x)
features_column = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

# Keep only those columns of the full dataset as the training input
x = home_data[features_column]

In [6]:
from sklearn.tree import DecisionTreeRegressor # we use DecisionTreeRegressor as our model
model = DecisionTreeRegressor(random_state = 1) # a fixed random_state gives the same tree on every run
model.fit(x,y) # Train the model on the features (x) and the labels (y)

DecisionTreeRegressor(random_state=1)

In [7]:
# Predict on the same rows that were used to train the model
pred = model.predict(x)

# Show only the first 5 predictions
print(pred[:5])

[208500. 181500. 223500. 140000. 250000.]


In [8]:
# Let's compare the predictions with the y values we used in training

y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [9]:
# Every comparison is True because we predict on the very same rows that the model was trained on
pred[:5] == y.head()

0    True
1    True
2    True
3    True
4    True
Name: SalePrice, dtype: bool

In [10]:
# Let's try with another dataset: the test file
test = "test.csv"

test_data = pd.read_csv(test)

print (test_data.columns) # check which columns the test DataFrame has


Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [11]:
# Select the same input features that we used when training the model
test_data_x = test_data[features_column] # reuses the features_column list declared above

In [12]:
test_pred = model.predict(test_data_x) # Predict on the test dataset to see how our model works on the test data

print (test_pred) # Show the predicted values

[ 94750. 137500. 173000. ... 157900.  93500. 225000.]


# Model Validation

In [31]:
# How we calculate the MAE (mean absolute error)
# error = |actual - predicted value|
# MAE = mean(error)

from sklearn.metrics import mean_absolute_error

In [32]:
import pandas as pd  

file_path = 'train.csv'
train_input_data = pd.read_csv(file_path)
# Data cleaning step (currently disabled):
# cleaned_train_input_data = train_input_data.dropna(axis=0) # dropping (removing) the rows that contain missing values
# dropna(axis=0, inplace=True) would instead change the original dataframe

In [33]:
# Check the column names in our dataset so we can select which ones to use as features and which one as the label
train_input_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [34]:
# Build the dataset for model training: features (x) and labels (y)
features_col = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

training_features = train_input_data[features_col]
training_labels = train_input_data.SalePrice

# Define the model.
# random_state is fixed so that re-running the notebook always builds the same tree.
from sklearn.tree import DecisionTreeRegressor
the_model = DecisionTreeRegressor(random_state=1)

# Training step
the_model.fit(training_features, training_labels)

# Prediction on the first five training rows
pred = the_model.predict(training_features.head())
print (pred)

actual = training_labels.head()
print (actual)

# NOTE: the rows we predict on here were also used for training (no split),
# so the predictions reproduce the actual prices. Such an in-sample check
# cannot reveal overfitting and says nothing about accuracy on unseen data.

[208500. 181500. 223500. 140000. 250000.]
0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64


# Validation Data versus Test Data

In [35]:
# A validation or test set refers to a sample of the dataset held back (reserved) from training the model
# Train/test split approach:
#   - validation set: used for tuning the hyperparameters
#   - test set: used for the final evaluation

# Validation set : A set of examples used to tune the parameters of a classifier, for example to choose the number of hidden units in a neural network.

# Test set : A set of examples used only to assess the performance of a fully-specified classifier.

# Train Test Split

In [36]:
from sklearn.model_selection import train_test_split

# Hold back part of the training data as a validation set.
# random_state makes sure we get the same split every time we run this cell.
train_x, val_x, train_y, val_y = train_test_split(training_features, training_labels, random_state=0)

# Seed the tree as well: without random_state the fitted tree, and therefore
# the MAE printed below, can change from one run to the next.
model = DecisionTreeRegressor(random_state=1)
model.fit(train_x, train_y)

# Get the predictions (predicted prices) for the validation dataset
val_pred = model.predict(val_x)

# Validate the model with the MAE from sklearn: the average absolute
# difference between the actual and the predicted price per sample.
print (mean_absolute_error(val_y, val_pred))

32478.07123287671


# Model Optimization

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Load the training data for the optimization experiments
the_dataset = pd.read_csv('train.csv')

# Input columns (x) and the target column (y)
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
x = the_dataset[features]
y = the_dataset['SalePrice']

# Split into a training part and a validation part; random_state fixes the split
(train_x, val_x, train_y, val_y) = train_test_split(x, y, random_state=1)

# Helper that scores one tree size on the validation data
def get_mae(branch,train_x,val_x,train_y,val_y):
    """Return the validation MAE of a decision tree limited to `branch` leaf nodes.

    Parameters:
        branch: value passed to DecisionTreeRegressor as max_leaf_nodes.
        train_x, train_y: features and labels the tree is fitted on.
        val_x, val_y: features and labels the fitted tree is scored on.

    Returns:
        The mean absolute error between val_y and the predictions for val_x.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=branch, random_state=0)
    tree.fit(train_x, train_y)
    predictions = tree.predict(val_x)
    return mean_absolute_error(val_y, predictions)

# Try several tree sizes (max_leaf_nodes) and report the validation MAE of each one.
# The lower the MAE, the more accurate the predictions.
tree_branch = [5, 10, 25, 50, 100, 150, 200, 250]
for i in tree_branch:
    the_result = get_mae(i, train_x, val_x, train_y, val_y)
    print("The MAE is %d \t\t at tree brach of %d" % (the_result, i))


The MAE is 35044 		 at tree brach of 5
The MAE is 31585 		 at tree brach of 10
The MAE is 29016 		 at tree brach of 25
The MAE is 27405 		 at tree brach of 50
The MAE is 27282 		 at tree brach of 100
The MAE is 27417 		 at tree brach of 150
The MAE is 28135 		 at tree brach of 200
The MAE is 27893 		 at tree brach of 250


In [60]:
# Dictionary comprehension: for every candidate tree size we store a key : value pair
#   key   -> the candidate tree size
#   value -> the validation MAE returned by get_mae for that size
scores = {
    leaf_count: get_mae(leaf_count, train_x, val_x, train_y, val_y)
    for leaf_count in tree_branch
}

# Calling min() on the dictionary itself compares the KEYS,
# so this only gives the smallest tree size, not the best one
the_best_tree_size = min(scores.keys())
print(the_best_tree_size)

# To get the tree size whose score (value) is the minimum among all values,
# min() has to rank the keys by their value
the_best_tree_size = min(scores, key=scores.get)
print(the_best_tree_size)

5
100
