In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

In [62]:
# Import the training data
path = Path("Data/train.csv")
test_path = Path("Data/test.csv")

In [3]:
# create dataframe from this csv
train_df = pd.read_csv(path)
train_df.head(
)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [63]:
test_df = pd.read_csv(test_path)
test_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [71]:
test_df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [4]:
# check for null values within the data set
train_df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [5]:
# fill the null values with 0
#train_df = train_df.fillna(0)

In [6]:
# Use this helper function to assit in dropping null values and use the mean ti fill the gap
def fill_in(data):
    for i in data.columns:
        # if data type is equal to object we leave it alone
        if data[i].dtype == "object":
            data[i] = data[i].fillna("")
        # if data type is a intger or float we want to take the mean of that and fill it in for the null values
        elif (data[i].dtype == "int64" or data[i].dtype == "float64"):
            data[i] = data[i].fillna(data[i].mean())
        else:
            pass
    return data

In [7]:
train_df = fill_in(train_df)

In [72]:
test_df = fill_in(test_df)

In [8]:
# Convert all the data into numerical data
train_df = pd.get_dummies(train_df)

In [73]:
test_df = pd.get_dummies(test_df)

In [9]:
train_df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [10]:
train_df["SalePrice"]

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [11]:
# store the target variable in its own varialbe reshape the data
# reshape the model so the shape has a 1 in it
y = train_df["SalePrice"].values.reshape(-1, 1)

In [12]:
# store the training data into the X variable
X = train_df.drop(columns = ["SalePrice"])
X

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,0,0,0,1,0,0,0,0,1,0
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,0,0,0,1,0,0,0,0,1,0
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,0,0,0,1,0,0,0,0,1,0
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,0,0,0,1,0,0,0,0,1,0


In [13]:
# make sure the y shape has a 1 in it
# reshape the model so it is (1460, 1)
y.shape

(1460, 1)

In [14]:
X.shape

(1460, 305)

In [15]:
# split the training data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 78, train_size = .60)
                                                   

# creat a scaler for the model
scaler = StandardScaler()

# scale the training data

X_scaler = scaler.fit(X_train)

# use the X_scaler to scale the test data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# import the relevent metrics to determine how our model did mse, r2, score
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [17]:
## Try using a different algorithim to obtain better results
# use gradient boosting regessor
from sklearn.ensemble import GradientBoostingRegressor


In [18]:
gbr = GradientBoostingRegressor()

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
grid_search = {"n_estimators":[50, 75, 100],
              "learning_rate":[0.01, 0.5, 1],
              "max_depth":[2, 3, 4],
              "min_samples_split": [2, 3, 4],
              "min_samples_leaf": [2, 3, 4]
              }

In [21]:
# Pass the grid_search varaible into GridSearchCV
param_search = GridSearchCV(estimator = gbr, param_grid = grid_search, scoring = "neg_mean_squared_error")

In [22]:
param_search.fit(X_train_scaled, y_train)

GridSearchCV(estimator=GradientBoostingRegressor(),
             param_grid={'learning_rate': [0.01, 0.5, 1],
                         'max_depth': [2, 3, 4], 'min_samples_leaf': [2, 3, 4],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [50, 75, 100]},
             scoring='neg_mean_squared_error')

In [23]:
param_search.best_params_

{'learning_rate': 0.5,
 'max_depth': 2,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 75}

In [24]:
gradient_model = GradientBoostingRegressor(n_estimators = 75, 
                                           learning_rate = .5, 
                                           max_depth = 2, 
                                           min_samples_leaf = 2, 
                                           min_samples_split = 3, 
                                           random_state = 78)

In [25]:
gradient_model.fit(X_train_scaled, y_train)

GradientBoostingRegressor(learning_rate=0.5, max_depth=2, min_samples_leaf=2,
                          min_samples_split=3, n_estimators=75,
                          random_state=78)

In [26]:
predictions = gradient_model.predict(X_test_scaled)

In [27]:
train_score = gradient_model.score(X_train_scaled, y_train, sample_weight = None)
test_score = gradient_model.score(X_test_scaled, y_test, sample_weight = None)

In [28]:
print(f"The training score is {train_score}.")
print(f"The test score is {test_score}.")

The trainning score is 0.9773132469714249.
The test score is 0.8659497114267447.


In [76]:
# scale the training data which is all of the training set
train_scaler = scaler.fit(train_df)

# use the train scaler to scale the testing data
train_scaled = train_scaler.transform(train_df)
test_scaled = train_scaler.transform(test_df)

ValueError: X has 293 features, but StandardScaler is expecting 306 features as input.