In [35]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# this extension properly formats a cell after it is run
# NOTE: `sys` is not imported in the import block above, so add `import sys`
# before uncommenting the install line below.
# !{sys.executable} -m pip install nb_black # UNCOMMENT TO INSTALL
%load_ext nb_black
%matplotlib inline

# Set the maximum number of rows to 200
pd.set_option("display.max_rows", 200)

# Set the maximum number of columns to 200
pd.set_option("display.max_columns", 200)

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [36]:
# The csv files were renamed in the last push from stat_train_set to train_set

# Load the train and test splits; both paths are relative to the directory
# the notebook is run from.
train_set = pd.read_csv("../data/train_set.csv")
test_set = pd.read_csv("../data/test_set.csv")

<IPython.core.display.Javascript object>

## The below is copied from Creating_test_train with a few changes ##

In [38]:
# Groups the "Neighborhood_st" values (street within a neighborhood) of the training df
# into quantiles of mean sale price and adds the group as a new column.  The same mapping
# is then applied to the testing df.  This will result in missing values in the testing df
# if there is a street/neighborhood combination that is in the testing dataset but
# not in the training dataset.


def group_neighbor_streets_by_saleprice(
    traindf=train_set, testdf=test_set, num_quantiles=10  # notice the difference in this line!!
):
    """Add a "StreetPriceGroup" column to both dataframes and return the mapping.

    The mapping is learned from `traindf` only, so no sale-price information
    from `testdf` is used.

    Parameters
    ----------
    traindf : pandas.DataFrame
        Must contain "Neighborhood_st" and "SalePrice".  Modified in place:
        a "StreetPriceGroup" column is added (or overwritten).
    testdf : pandas.DataFrame
        Must contain "Neighborhood_st".  Modified in place: a
        "StreetPriceGroup" column is added (or overwritten); rows whose
        "Neighborhood_st" value was not seen in `traindf` get NaN.
    num_quantiles : int
        Number of quantile groups; groups are labelled 1..num_quantiles,
        with 1 being the lowest mean sale price.

    Returns
    -------
    dict
        Maps each "Neighborhood_st" value seen in `traindf` to its group label.

    Notes
    -----
    `pd.qcut` is called with its default handling of duplicate bin edges, so
    it raises if the quantile edges of the street means are not unique.
    """
    # Calculate the mean sale price for each street in the training df
    street_prices = traindf.groupby("Neighborhood_st")["SalePrice"].mean()
    # Group the streets into the specified number of quantiles based on sale price
    groups = pd.qcut(street_prices, q=num_quantiles, labels=range(1, num_quantiles + 1))
    # Create a dictionary that maps each street name to its corresponding sale price group label
    street_group_dict = dict(zip(street_prices.index, groups))
    # Add a new column to the training dataframe with the street price groups
    traindf["StreetPriceGroup"] = traindf["Neighborhood_st"].map(street_group_dict)
    # Add a new column to the testing dataframe with the street price groups
    testdf["StreetPriceGroup"] = testdf["Neighborhood_st"].map(street_group_dict)
    return street_group_dict


<IPython.core.display.Javascript object>

In [39]:
# the num of quantiles can be changed and the result is assigned to d, which is the
# dictionary that will be used to fill in the missing values.
# Side effect: the call also adds a "StreetPriceGroup" column to train_set and test_set.

d = group_neighbor_streets_by_saleprice(
    traindf=train_set,  # the difference here as well!!
    testdf=test_set,  # the difference here as well!!
    num_quantiles=10,
)

<IPython.core.display.Javascript object>

In [40]:
# this will use the dictionary created to fill in the missing values in the test df with
# the group of another street in the same neighborhood


def fill_na(testdf=test_set, d=None):  # DIFFERENT HERE TOO
    """Fill missing "StreetPriceGroup" values in `testdf` by neighborhood.

    The neighborhood is the part of "Neighborhood_st" before the first "_".
    Each neighborhood is mapped to the group of whichever of its streets comes
    last in `d`, so the fill value is *a* group from that neighborhood, not an
    average over it.

    Parameters
    ----------
    testdf : pandas.DataFrame
        Must contain "Neighborhood_st" and "StreetPriceGroup".  Modified in
        place; only rows whose "StreetPriceGroup" is missing are touched.
    d : dict, optional
        Maps "Neighborhood_st" values to group labels (as returned by
        `group_neighbor_streets_by_saleprice`).  None or an empty dict leaves
        the missing values as they are.

    Returns
    -------
    None

    Notes
    -----
    Rows whose neighborhood does not appear in `d` at all stay missing.
    """
    # Collapse street-level keys to neighborhood-level keys (last street wins)
    neighborhood_groups = {
        street.split("_")[0]: group for street, group in (d or {}).items()
    }
    # Select the rows to fill directly from the NaN mask
    missing = testdf["StreetPriceGroup"].isna()
    # Extract the neighborhood only for the rows that need filling
    neighborhoods = testdf.loc[missing, "Neighborhood_st"].str.split("_").str[0]
    # Apply the dictionary mapping only to the filtered rows
    testdf.loc[missing, "StreetPriceGroup"] = neighborhoods.map(neighborhood_groups)


<IPython.core.display.Javascript object>

In [41]:
# Fill the StreetPriceGroup values still missing in test_set using the dictionary d
# (fill_na modifies test_set in place)
fill_na(test_set, d)

<IPython.core.display.Javascript object>

In [42]:
# Remove the column that was used to create groupings.
# errors="ignore" makes the cell safe to re-run after the column is already gone.
train_set.drop(columns="Neighborhood_st", inplace=True, errors="ignore")
test_set.drop(columns="Neighborhood_st", inplace=True, errors="ignore")

<IPython.core.display.Javascript object>

In [43]:
# Looks like PID should be removed and SalePrice moved to the first column
# (the bare expression on the last line makes the notebook display the dataframe)
train_set

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,Fence,MoSold,YrSold,SaleType,SaleCondition,Age,RemodAge,TotalSF,Remodeled,TotalPorchSF,TotalBath,MSSubClass_cat,Street_type,StreetPriceGroup
0,534477270,1661,165500,80,RL,80.0,9600,Pave,No_Alley,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,SLvl,6,6,1955,1996,Hip,CompShg,AsbShng,AsbShng,,0.0,2,2,CBlock,3,3,4,4,831.0,1,0.0,161.0,992.0,GasA,3,Y,SBrkr,1661,0,1.0,0.0,1,0,3,1,3,8,Typ,1,3,BuiltIn,1955.0,2,1.0,377.0,3,3,Y,0,28,0,178,3,10,2008,WD,Normal,53,12,2653.0,1,206,2.0,subclass80,AVE,5
1,903233140,854,132000,45,RM,51.0,6120,Pave,No_Alley,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Norm,Norm,1Fam,1.5Unf,7,8,1929,2001,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,2,2,BrkTil,3,3,1,1,0.0,1,0.0,832.0,832.0,GasA,4,Y,FuseA,854,0,0.0,0.0,1,0,2,1,2,5,Typ,0,0,Detchd,1991.0,1,2.0,576.0,3,3,Y,48,112,0,0,4,7,2007,WD,Normal,78,6,1686.0,1,112,1.0,subclass45,AVE,3
2,923228370,1092,85500,160,RM,21.0,1890,Pave,No_Alley,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,2Story,4,6,1972,1972,Gable,CompShg,CemntBd,CmentBd,,0.0,2,2,CBlock,3,3,1,3,294.0,1,0.0,252.0,546.0,GasA,2,Y,SBrkr,546,546,0.0,0.0,1,1,3,1,2,5,Typ,0,0,Attchd,1972.0,1,1.0,286.0,3,3,Y,0,0,64,0,0,6,2010,WD,Normal,38,38,1638.0,0,64,1.5,subclass160,DR,1
3,904101070,3086,200500,50,RL,138.0,18030,Pave,No_Alley,IR1,Bnk,AllPub,Inside,Gtl,ClearCr,Norm,Norm,1Fam,1.5Fin,5,6,1946,1994,Gable,CompShg,MetalSd,MetalSd,,0.0,2,2,CBlock,3,3,1,3,152.0,4,469.0,977.0,1598.0,GasA,2,Y,SBrkr,1636,971,0.0,0.0,3,0,3,1,4,12,Maj1,1,4,No_Garage,-1.0,0,0.0,0.0,0,0,Y,122,0,0,0,3,3,2007,WD,Normal,61,13,4684.0,1,0,3.0,subclass50,RD,7
4,533236040,1441,174000,160,FV,24.0,2645,Pave,Pave,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,Twnhs,2Story,8,5,1999,2000,Gable,CompShg,MetalSd,MetalSd,BrkFace,456.0,3,2,PConc,4,3,1,1,0.0,1,0.0,776.0,776.0,GasA,4,Y,SBrkr,764,677,0.0,0.0,2,1,2,1,3,5,Typ,0,0,Detchd,1999.0,1,2.0,492.0,3,3,Y,206,0,0,0,0,11,2007,WD,Normal,8,7,2217.0,1,0,2.5,subclass160,DR,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2041,904100190,704,88750,20,RL,50.0,4280,Pave,No_Alley,IR1,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,1Fam,1Story,4,9,1946,2001,Gable,CompShg,MetalSd,MetalSd,,0.0,2,3,CBlock,2,3,1,1,0.0,1,0.0,560.0,560.0,GasA,4,Y,FuseA,704,0,0.0,1.0,1,0,2,1,1,4,Typ,0,0,CarPort,1946.0,1,1.0,220.0,3,3,Y,0,0,24,0,0,9,2009,WD,Normal,63,8,1264.0,1,24,1.5,subclass20,AVE,1
2042,534128190,2640,256900,60,RL,80.0,10400,Pave,No_Alley,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,PosA,Norm,1Fam,2Story,6,7,1967,1997,Gable,CompShg,MetalSd,MetalSd,BrkFace,256.0,2,2,PConc,3,3,1,1,0.0,1,0.0,932.0,932.0,GasA,3,Y,SBrkr,1271,1369,0.0,0.0,2,1,5,1,3,8,Typ,1,3,Attchd,1967.0,2,2.0,515.0,3,3,Y,0,120,0,168,0,5,2009,WD,Normal,42,12,3572.0,1,288,2.5,subclass60,AVE,8
2043,528142110,1944,360000,20,RL,90.0,12378,Pave,No_Alley,IR1,Lvl,AllPub,Inside,Gtl,NridgHt,Norm,Norm,1Fam,1Story,9,5,2003,2004,Gable,CompShg,VinylSd,VinylSd,,0.0,3,2,PConc,5,3,3,6,1274.0,1,0.0,622.0,1896.0,GasA,4,Y,SBrkr,1944,0,1.0,0.0,2,0,3,1,4,8,Typ,3,5,Attchd,2003.0,3,3.0,708.0,3,3,Y,208,175,0,0,0,11,2006,WD,Normal,3,2,3840.0,1,175,3.0,subclass20,RD,9
2044,534127140,1082,162500,85,RL,0.0,8723,Pave,No_Alley,IR1,Lvl,AllPub,Inside,Gtl,NWAmes,PosN,Norm,1Fam,SFoyer,6,6,1969,1969,Gable,CompShg,HdBoard,HdBoard,,0.0,2,2,PConc,4,3,4,4,973.0,1,0.0,0.0,973.0,GasA,4,Y,SBrkr,1082,0,1.0,0.0,1,0,3,1,2,6,Typ,0,0,Attchd,1969.0,1,2.0,480.0,3,3,Y,160,0,0,0,0,1,2007,WD,Normal,38,38,2055.0,0,0,2.0,subclass85,DR,7


<IPython.core.display.Javascript object>

In [44]:
# Remove PID and move SalePrice to the first column.
# TODO: do this once upstream and save the csv's after so we dont have to do this each time
def _target_first(df, target="SalePrice", drop=("PID",)):
    """Return a copy of `df` with `target` as the first column and `drop` removed.

    All other columns keep their original relative order.  Columns named in
    `drop` that are not present are simply skipped, so the cell can be re-run
    after PID has already been removed.
    """
    remaining = [col for col in df.columns if col != target and col not in drop]
    return df[[target] + remaining].copy()


train_set = _target_first(train_set)
# same with test set
test_set = _target_first(test_set)

<IPython.core.display.Javascript object>

In [None]:
# Inspect the column dtypes to see which columns are still object (string) typed
train_set.dtypes

## You will now need the optional step below to dummify the columns since ##
## Gradient Boosting doesn't work with strings ##

In [45]:
# The below is an optional step for models that perform better with dummified data
# to be used dummify categorical columns after the functions above are
# used to add the Neighborhood_st groupings


# Creating a list of categorical columns to be dummified
cat_cols = train_set.select_dtypes(include=["object"]).columns.tolist()

# Dummify the categorical columns in the train set
train_dummies = pd.get_dummies(train_set, columns=cat_cols)

# Dummify the categorical columns in the test set
test_dummies = pd.get_dummies(test_set, columns=cat_cols)

# Ensure the test set has exactly the train set's columns, in the same order:
# dummy columns that only exist in train are added to test filled with zeros,
# and dummy columns that only exist in test are dropped.
# Only the added columns are zero-filled; genuine missing values in existing
# test columns are left as NaN so they stay visible instead of silently becoming 0.
test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)


<IPython.core.display.Javascript object>

## The below can be removed and it is just to check if there are strings after columns are dummified ##

In [51]:
# Sanity check: list any columns that are still dtype 'object' after dummifying
# (both printed indexes should be empty)
train_strings, test_strings = (
    frame.select_dtypes(include=["object"]).columns
    for frame in (train_dummies, test_dummies)
)
print(train_strings, test_strings, sep="\n")

Index([], dtype='object')
Index([], dtype='object')


<IPython.core.display.Javascript object>

## Modified your gradient Boosting Regression to apply above changes ##

In [52]:
# Model, cross-validation splitter, and feature/target split.
# Every source of randomness is seeded so a Restart & Run All reproduces the same numbers.
gbr = GradientBoostingRegressor(random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Features are every column except the target
X = train_dummies.drop(columns="SalePrice")
y = train_dummies["SalePrice"]
# 70/30 hold-out split of the training data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


<IPython.core.display.Javascript object>

In [53]:
# FIT
# Note: this fits on the full X / y, not on the X_train / y_train hold-out split
gbr.fit(X, y)

GradientBoostingRegressor()

<IPython.core.display.Javascript object>

In [54]:
# k-fold cross-validation


# Step 3: Create the Gradient Boosted Regression model and evaluate it using k-fold cross-validation
# A fresh model is fitted for every fold.  Fold-local names are used so the hold-out
# split (X_train, X_test, y_train, y_test) is not overwritten by the last fold.
mse_scores = []
r2_scores = []

for train_index, test_index in kf.split(X):
    X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[test_index]

    model = GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    )
    model.fit(X_fold_train, y_fold_train)

    # Score on the fold that was held out from fitting
    y_pred = model.predict(X_fold_val)
    mse = mean_squared_error(y_fold_val, y_pred)
    r2 = r2_score(y_fold_val, y_pred)

    mse_scores.append(mse)
    r2_scores.append(r2)

print("MSE scores:", mse_scores)
print("R-squared scores:", r2_scores)

MSE scores: [322746634.2791484, 465856217.6223381, 369490124.33649755, 949124370.7377828, 325101453.40383786]
R-squared scores: [0.9352264122721555, 0.921725186226128, 0.9312188870117949, 0.8501043521734962, 0.9400063021503235]


<IPython.core.display.Javascript object>

## The output shows the MSE and R-squared scores for each of the 5 folds of the cross-validation. The MSE scores range from 322746634.2791484 to 949124370.7377828 (an RMSE of roughly 18,000 to 30,800 in SalePrice units), while the R-squared scores range from 0.8501043521734962 to 0.9400063021503235. Each score measures how well the model predicts the held-out fold it was not trained on, with higher R-squared scores and lower MSE scores indicating better performance. Four of the five folds have an R-squared above 0.92, which suggests that the model is performing well on the data; the fourth fold is noticeably weaker (R-squared of about 0.85, MSE of about 949 million), so performance is somewhat sensitive to which rows are held out. ##

## Your code is below if you want to look at the changes that were made: ##

## Fitting GradientBoostingRegression

In [9]:
# Original version, kept unchanged for comparison with the modified cell.
# NOTE(review): `ames_train` is not defined in the visible part of this notebook, and
# "column_to_exclude" / "column to include" look like unfilled placeholders — confirm.
gbr = GradientBoostingRegressor()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
X = ames_train.loc[:, ames_train.columns != "column_to_exclude"]
y = ames_train.loc[:, ames_train.columns == "column to include"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

<IPython.core.display.Javascript object>

In [10]:
# FIT
# Original version: this call raised the ValueError shown in the output below
# ("could not convert string to float: 'RL'").
gbr.fit(X, y)

ValueError: could not convert string to float: 'RL'

<IPython.core.display.Javascript object>

In [None]:
# k-fold cross-validation
# Original version, kept unchanged for comparison with the modified cell.


# Step 3: Create the Gradient Boosted Regression model and evaluate it using k-fold cross-validation
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Per-fold scores, appended in fold order
mse_scores = []
r2_scores = []

for train_index, test_index in kf.split(X):
    # Note: these names overwrite the X_train/X_test/y_train/y_test hold-out split
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh model is fitted for every fold
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    model.fit(X_train, y_train)

    # Score on the fold that was held out from fitting
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mse_scores.append(mse)
    r2_scores.append(r2)

print("MSE scores:", mse_scores)
print("R-squared scores:", r2_scores)
