In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from pathlib import Path
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

In [130]:
# Read in training data
train_data = Path("data/train.csv")
train_df = pd.read_csv(train_data, delimiter=",", index_col='Id')


In [132]:
train_df = pd.get_dummies(train_df, dummy_na=True)
train_df = train_df.fillna('0')
train_df

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65,8450,7,5,2003,2003,196,706,0,...,0,1,0,0,0,0,0,1,0,0
2,20,80,9600,6,8,1976,1976,0,978,0,...,0,1,0,0,0,0,0,1,0,0
3,60,68,11250,7,5,2001,2002,162,486,0,...,0,1,0,0,0,0,0,1,0,0
4,70,60,9550,7,5,1915,1970,0,216,0,...,0,1,0,1,0,0,0,0,0,0
5,60,84,14260,8,5,2000,2000,350,655,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,62,7917,6,5,1999,2000,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1457,20,85,13175,6,6,1978,1988,119,790,163,...,0,1,0,0,0,0,0,1,0,0
1458,70,66,9042,7,9,1941,2006,0,275,0,...,0,1,0,0,0,0,0,1,0,0
1459,20,68,9717,5,6,1950,1996,0,49,1029,...,0,1,0,0,0,0,0,1,0,0


In [133]:
# Create the features (X) and target (y) sets
X = train_df.drop(['SalePrice'], axis=1)
y = train_df["SalePrice"]

In [134]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [135]:
# Creating StandardScaler instance
scaler = sklearn.preprocessing.StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [136]:
max_feat = round(len(train_df.columns)*.75)

In [137]:
max_feat

249

In [138]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    regressor = GradientBoostingRegressor(n_estimators=200,
                                            learning_rate=learning_rate,
                                            max_features=max_feat,
                                            max_depth=5,
                                            random_state=0)

    # Fit the model
    regressor.fit(X_train_scaled, y_train.ravel())
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.5f}".format(
        regressor.score(
            X_train_scaled,
            y_train.ravel())))
    print("Accuracy score (test): {0:.5f}".format(
        regressor.score(
            X_test_scaled,
            y_test.ravel())))
    print()

Learning rate:  0.05
Accuracy score (training): 0.99424
Accuracy score (test): 0.88930

Learning rate:  0.1
Accuracy score (training): 0.99783
Accuracy score (test): 0.89690

Learning rate:  0.25
Accuracy score (training): 0.99991
Accuracy score (test): 0.88958

Learning rate:  0.5
Accuracy score (training): 1.00000
Accuracy score (test): 0.88842

Learning rate:  0.75
Accuracy score (training): 1.00000
Accuracy score (test): 0.83399

Learning rate:  1
Accuracy score (training): 1.00000
Accuracy score (test): 0.81782



In [139]:
# Choose a learning rate and create classifier
regressor = GradientBoostingRegressor(n_estimators=200,
                                        learning_rate=0.1,
                                        max_features=5,
                                        max_depth=5,
                                        random_state=0)

# Fit the model
regressor.fit(X_train_scaled, y_train.ravel())

# Make Prediction
predictions = regressor.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test.ravel()}).tail(20)

Unnamed: 0,Prediction,Actual
345,166067.479779,179000
346,212396.083887,226000
347,254196.827152,222000
348,133920.185048,134500
349,94949.825886,101800
350,106370.59283,110000
351,198290.122995,197000
352,100912.030387,113000
353,127170.606581,143750
354,103097.739612,100000


In [155]:
test_data = Path("data/test.csv")
test_df = pd.read_csv(test_data, delimiter=",", index_col='Id')
test_df = pd.get_dummies(test_df, dummy_na=True)
test_df = test_df.fillna('0')
test_df.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,80,11622,5,6,1961,1961,0,468,144,...,0,1,0,0,0,0,0,1,0,0
1462,20,81,14267,6,6,1958,1958,108,923,0,...,0,1,0,0,0,0,0,1,0,0
1463,60,74,13830,5,5,1997,1998,0,791,0,...,0,1,0,0,0,0,0,1,0,0
1464,60,78,9978,6,6,1998,1998,20,602,0,...,0,1,0,0,0,0,0,1,0,0
1465,120,43,5005,8,5,1992,1992,0,263,0,...,0,1,0,0,0,0,0,1,0,0


In [159]:
test_scaled = X_scaler.transform(test_df)

In [160]:
# Make Prediction
final_predictions = regressor.predict(test_scaled)
final = pd.DataFrame({"id":test_df.index,"SalePrice": final_predictions})

In [161]:
final

Unnamed: 0,id,SalePrice
0,1461,104991.796437
1,1462,147982.992545
2,1463,127350.217023
3,1464,155435.522379
4,1465,155312.865637
...,...,...
1454,2915,69917.659553
1455,2916,83329.288797
1456,2917,126671.029375
1457,2918,99374.823636


In [162]:
final.to_csv('submission.csv', index= False)

In [None]:
X_train

In [None]:
train_df

In [None]:
train_not_test = test_cols.difference(train_cols)

In [None]:
train_cols = train_df.columns
test_cols = test_df.columns

In [None]:
train_not_test

In [None]:
train_cols

In [None]:
test_cols

In [None]:
train_df.dtypes

In [None]:
test_df.dtypes

In [53]:
train_df.columns.to_series().groupby(train_df.dtypes).groups

{dtype('uint8'): Index(['MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL',
        'MSZoning_RM', 'Street_Grvl', 'Street_Pave', 'Alley_0', 'Alley_Grvl',
        'Alley_Pave',
        ...
        'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
        'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
        'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
        'SaleCondition_Partial'],
       dtype='object', length=268),
 dtype('int64'): Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
        'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
        'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
        'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
        'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
        'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'Scr

In [54]:
a = train_df.columns.to_series().groupby(test_df.dtypes).groups

In [70]:
print(a[dtype('uint8')])

NameError: name 'dtype' is not defined

In [44]:
wrong_types_int64 = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                    'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea','LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [22]:
# wrong_types_float64 = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [50]:
for wrong in wrong_types_int64:
    test_df =test_df.astype({wrong: 'int64'})

In [47]:
for wrong in wrong_types_int64:
    train_df =train_df.astype({wrong: 'int64'})

In [40]:
for wrong2 in wrong_types_float64:
    test_df = test_df.astype({wrong2: 'float64'})

In [91]:
test_df = pd.get_dummies(test_df)
test_df.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,0,0,0,1,0,0,0,0,1,0
1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,0,0,0,1,0,0,0,0,1,0


In [52]:
test_df.columns.to_series().groupby(test_df.dtypes).groups

{dtype('uint8'): Index(['MSZoning_0', 'MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH',
        'MSZoning_RL', 'MSZoning_RM', 'Street_Grvl', 'Street_Pave', 'Alley_0',
        'Alley_Grvl',
        ...
        'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
        'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
        'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
        'SaleCondition_Partial'],
       dtype='object', length=256),
 dtype('int64'): Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
        'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
        'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
        'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
        'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
        'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'Scr

In [None]:
wrong_types2 = ['GarageCars', 'GarageArea']

In [76]:
mergedStuff = pd.merge(test_df, train_df, how='outer')
mergedStuff.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Exterior1st_ImStucc,Exterior1st_Stone,Exterior2nd_Other,Heating_Floor,Heating_OthW,Electrical_0,Electrical_Mix,GarageQual_Ex,PoolQC_Fa,MiscFeature_TenC
0,20,80,11622,5,6,1961,1961,0,468,144,...,,,,,,,,,,
1,20,81,14267,6,6,1958,1958,108,923,0,...,,,,,,,,,,
2,60,74,13830,5,5,1997,1998,0,791,0,...,,,,,,,,,,
3,60,78,9978,6,6,1998,1998,20,602,0,...,,,,,,,,,,
4,120,43,5005,8,5,1992,1992,0,263,0,...,,,,,,,,,,


In [156]:
missing_columns = list(set(X_train.columns).difference(set(test_df.columns)))

In [157]:
for miss in missing_columns:
    test_df[miss] = 0 

In [158]:
test_df

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,RoofMatl_ClyTile,PoolQC_Fa,Utilities_NoSeWa,HouseStyle_2.5Fin,Condition2_RRNn,Condition2_RRAe,RoofMatl_Membran,MiscFeature_TenC,Condition2_RRAn,Exterior1st_ImStucc
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,80,11622,5,6,1961,1961,0,468,144,...,0,0,0,0,0,0,0,0,0,0
1462,20,81,14267,6,6,1958,1958,108,923,0,...,0,0,0,0,0,0,0,0,0,0
1463,60,74,13830,5,5,1997,1998,0,791,0,...,0,0,0,0,0,0,0,0,0,0
1464,60,78,9978,6,6,1998,1998,20,602,0,...,0,0,0,0,0,0,0,0,0,0
1465,120,43,5005,8,5,1992,1992,0,263,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,21,1936,4,7,1970,1970,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2916,160,21,1894,4,5,1970,1970,0,252,0,...,0,0,0,0,0,0,0,0,0,0
2917,20,160,20000,5,7,1960,1996,0,1224,0,...,0,0,0,0,0,0,0,0,0,0
2918,85,62,10441,5,5,1992,1992,0,337,0,...,0,0,0,0,0,0,0,0,0,0


In [143]:
len(set(train_df.columns).difference(set(test_df.columns)))

19

In [144]:
set(test_df.columns).difference(set(train_df.columns))

set()

In [153]:
X_train

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1230,80,70,7910,5,5,1960,1960,0,666,0,...,0,1,0,0,0,0,0,1,0,0
73,60,74,10141,7,5,1998,1998,40,0,0,...,0,1,0,0,0,0,0,1,0,0
879,85,88,11782,5,7,1961,1995,0,899,0,...,0,1,0,0,0,0,0,1,0,0
265,30,30,5232,5,5,1925,2004,0,0,0,...,0,1,0,0,0,0,0,1,0,0
407,50,51,10480,6,5,1936,1950,0,0,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,60,134,19378,7,5,2005,2006,456,57,0,...,0,0,0,0,0,0,0,0,1,0
875,50,52,5720,5,6,1941,1950,0,0,0,...,0,1,0,1,0,0,0,0,0,0
281,60,82,11287,7,6,1989,1989,340,421,0,...,0,1,0,0,0,0,0,1,0,0
801,60,79,12798,6,5,1997,1997,0,462,0,...,0,1,0,0,0,0,0,1,0,0
