In [1]:
# Initial imports
import pandas as pd
import hvplot.pandas
from sklearn.linear_model import LinearRegression
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import skew
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error

import seaborn as sns
import holoviews as hv

train_path = 'Resources/train.csv'
test_path = 'Resources/test.csv'
trained_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)


In [2]:
from pyspark.sql import SparkSession
# Create a Spark session
spark = SparkSession.builder \
    .appName("YourAppNameHere") \
    .getOrCreate()

# Load the training and test data as Spark DataFrames
train_path = 'Resources/train.csv'
test_path = 'Resources/test.csv'

# You can specify the format and options if needed
train_data = spark.read.csv(train_path, header=True, inferSchema=True)
test_data = spark.read.csv(test_path, header=True, inferSchema=True)

# You can perform various Spark operations on these DataFrames
# For example, to display the first few rows of the training data
train_data.show()



+---+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+
| Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condition

In [3]:
test_data.show()

+----+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+
|  Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condition2|BldgTy

In [4]:
print(trained_data.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
trained_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
columns_to_remove = ['FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'MasVnrType', 'Alley']

In [7]:
X = trained_data.copy()
X = X.loc[(X['YrSold'] >= 2006) & (X['YrSold'] <= 2010)] #not necessary, the dataset is already 2006-2010
X.drop(columns=columns_to_remove, inplace=True, axis=1)
Xf = X.dropna()
Xf.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [8]:
dummies = pd.get_dummies(Xf, columns=['MSZoning','LotConfig','LotFrontage','Street','LotShape','LandContour',
                       'Utilities','LandContour','LandSlope','Neighborhood',
                       'Condition1','Condition2','BldgType','HouseStyle','SaleType','RoofStyle',
                      'RoofMatl','Exterior1st','Exterior2nd','ExterQual','ExterCond',
                      'Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1',
                        'BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical','KitchenQual',
                 'Functional','GarageType','GarageFinish','GarageQual',
                  'GarageCond','PavedDrive','SaleType','SaleCondition'], dtype=float)
dummies

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,8450,7,5,2003,2003,196.0,706,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,20,9600,6,8,1976,1976,0.0,978,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,60,11250,7,5,2001,2002,162.0,486,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,70,9550,7,5,1915,1970,0.0,216,0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5,60,14260,8,5,2000,2000,350.0,655,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,7917,6,5,1999,2000,0.0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1456,1457,20,13175,6,6,1978,1988,119.0,790,163,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1457,1458,70,9042,7,9,1941,2006,0.0,275,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1458,1459,20,9717,5,6,1950,1996,0.0,49,1029,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# Define target vector
y = Xf["SalePrice"].ravel()
y[:5]

array([208500, 181500, 223500, 140000, 250000], dtype=int64)

In [10]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(dummies, y, random_state=78)


In [11]:
# Create the StandardScaler instance
scaler = StandardScaler()


In [12]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)


In [13]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [14]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78) #500 decision trees putting a vote
rf_model

In [15]:
rf_model = rf_model.fit(X_train_scaled, y_train)

In [16]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [17]:
# Get the feature importance array
importances = rf_model.feature_importances_

In [18]:
# List the top 10 most important features
sorted(zip(rf_model.feature_importances_, X.columns[1:74]), reverse=True)

[(0.03750765996833717, 'Heating'),
 (0.025706363360004723, 'MSSubClass'),
 (0.024825549011448865, 'OverallQual'),
 (0.02437659956591732, 'Foundation'),
 (0.024210708923580435, 'Condition2'),
 (0.02376271606316851, 'LotFrontage'),
 (0.023164891970868382, 'Condition1'),
 (0.0230456082757239, 'Neighborhood'),
 (0.020744002930578688, 'ExterQual'),
 (0.02022417635358984, 'LotShape'),
 (0.020027281140110677, 'BsmtUnfSF'),
 (0.020011099896513982, 'LandContour'),
 (0.019103574035868726, 'LotConfig'),
 (0.017083688488116033, 'BsmtCond'),
 (0.01584390352205448, 'BsmtQual'),
 (0.015116499647095443, 'TotalBsmtSF'),
 (0.015106243096804224, 'Exterior2nd'),
 (0.014203415593575826, 'Utilities'),
 (0.014095449794237401, 'BldgType'),
 (0.01265088365141831, 'LotArea'),
 (0.01018537779363353, 'RoofMatl'),
 (0.010012557293500706, 'MSZoning'),
 (0.009897688913151799, 'Street'),
 (0.008853728614390181, 'MasVnrArea'),
 (0.007464253352539215, 'OverallCond'),
 (0.006607740379641935, 'ExterCond'),
 (0.0062922393

In [19]:
# Make predictions on the test set
y_pred = rf_model.predict(X_test)  # X_test is your test set features

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)  # y_test is the actual target values from the test set

print(f"Mean Absolute Error (MAE): {mae}")

Mean Absolute Error (MAE): 114285.01459854015




In [20]:
# Features to remove based on low importance scores
features_to_remove = [
    'Street', 'MasVnrArea', 'OverallCond', 'ExterCond', 'BsmtExposure',
    'RoofStyle', 'YearRemodAdd', 'LandSlope', 'YearBuilt', 'HouseStyle', 'Exterior1st','Id'
]

# Drop the low-importance features from the dummies DataFrame
dummies = Xf.drop(columns=features_to_remove)
# Define target vector
y = Xf["SalePrice"].ravel()
y[:5]
dummies = pd.get_dummies(dummies, columns=['MSZoning','LotConfig','LotFrontage','LotShape','LandContour',
                       'Utilities','LandContour','Neighborhood',
                       'Condition1','Condition2','BldgType','SaleType',
                      'RoofMatl','Exterior2nd','ExterQual',
                      'Foundation','BsmtQual','BsmtCond','BsmtFinType1',
                        'BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical','KitchenQual',
                 'Functional','GarageType','GarageFinish','GarageQual',
                  'GarageCond','PavedDrive','SaleType','SaleCondition'], dtype=float)


# Split your data into train and test sets (assuming you have them)
X_train, X_test, y_train, y_test = train_test_split(dummies, y, test_size=0.2, random_state=42)

# Train a Random Forest model on the updated dataset
rf_model_updated = RandomForestRegressor(random_state=42)
rf_model_updated.fit(X_train, y_train)

# Make predictions on the test set
y_pred_updated = rf_model_updated.predict(X_test)

# Calculate the Mean Absolute Error (MAE) for the updated model
mae_updated = mean_absolute_error(y_test, y_pred_updated)
print(f"Updated Model MAE: {mae_updated}")


Updated Model MAE: 932.6097716894977


In [21]:
# List the top 10 most important features with their importance scores
feature_importances = list(zip(rf_model_updated.feature_importances_, dummies.columns))
top_10_features = sorted(feature_importances, reverse=True)[:80]

# Print the top 10 most important features in a similar format
print("# List the top 10 most important features")
for i, (importance, feature) in enumerate(top_10_features, start=1):
    print(f"{i}. {feature}: {importance:.6f}")


# List the top 10 most important features
1. SalePrice: 0.994882
2. 2ndFlrSF: 0.000552
3. 1stFlrSF: 0.000546
4. PoolArea: 0.000383
5. SaleCondition_Abnorml: 0.000380
6. GrLivArea: 0.000373
7. LotConfig_Corner: 0.000346
8. LotFrontage_160.0: 0.000315
9. Neighborhood_NoRidge: 0.000282
10. OpenPorchSF: 0.000186
11. Fireplaces: 0.000148
12. TotalBsmtSF: 0.000147
13. Neighborhood_StoneBr: 0.000123
14. MoSold: 0.000091
15. BsmtFinSF1: 0.000085
16. LotArea: 0.000072
17. ScreenPorch: 0.000067
18. BsmtUnfSF: 0.000060
19. GarageArea: 0.000057
20. ExterQual_Ex: 0.000056
21. Exterior2nd_ImStucc: 0.000054
22. LotShape_Reg: 0.000052
23. TotRmsAbvGrd: 0.000039
24. GarageYrBlt: 0.000036
25. LotFrontage_50.0: 0.000036
26. WoodDeckSF: 0.000036
27. LotConfig_CulDSac: 0.000035
28. BedroomAbvGr: 0.000034
29. Foundation_BrkTil: 0.000033
30. YrSold: 0.000027
31. GarageType_BuiltIn: 0.000024
32. FullBath: 0.000024
33. GarageQual_Ex: 0.000023
34. SaleType_New: 0.000018
35. LotFrontage_63.0: 0.000018
36. LotSha

In [22]:
# Features to remove based on low importance scores
features_to_remove = [
     'Exterior1st',
    'Id', 'GarageYrBlt', 'WoodDeckSF', 'GarageQual', 'OpenPorchSF', 'GarageFinish',
    'CentralAir', 'BsmtFullBath', 'GarageCars', 'HouseStyle', 'MoSold', 'Exterior1st',
    'YrSold', 'BedroomAbvGr', 'MiscVal', 'SaleCondition', 'Electrical',
    'GarageCond', 'PavedDrive', 'Fireplaces', 'GarageArea', 'TotRmsAbvGrd',
    'KitchenAbvGr', 'HalfBath', 'Functional', 'BsmtFinType2', 'EnclosedPorch',
    'SaleType', 'HeatingQC', 'BsmtHalfBath', 'GarageType',
    'ScreenPorch', 'GarageFinish','BsmtFinSF2']


# Drop the low-importance features from the dummies DataFrame
dummies = Xf.drop(columns=features_to_remove)
# Define target vector
y = Xf["SalePrice"].ravel()
y[:5]
dummies = pd.get_dummies(dummies, columns=['MSZoning','LotConfig','LotFrontage','LotShape','LandContour',
                       'Utilities','LandContour','Neighborhood','RoofStyle','KitchenQual',
                       'Condition1','Condition2','BldgType','ExterCond','LandSlope',
                      'RoofMatl','Exterior2nd','ExterQual','MasVnrArea','Street',
                      'Foundation','BsmtQual','BsmtCond','BsmtFinType1','Heating','BsmtExposure'
                  ], dtype=float)


# Split your data into train and test sets (assuming you have them)
X_train, X_test, y_train, y_test = train_test_split(dummies, y, test_size=0.2, random_state=42)

# Train a Random Forest model on the updated dataset
rf_model_updated = RandomForestRegressor(random_state=42)
rf_model_updated.fit(X_train, y_train)

# Make predictions on the test set
y_pred_updated = rf_model_updated.predict(X_test)

# Calculate the Mean Absolute Error (MAE) for the updated model
mae_updated = mean_absolute_error(y_test, y_pred_updated)
print(f"Updated Model MAE: {mae_updated}")



Updated Model MAE: 932.2128310502286


In [23]:
# Features to remove based on low importance scores
features_to_remove = [
     'YearRemodAdd', 'YearBuilt', 'HouseStyle', 'Exterior1st',
    'Id', 'GarageYrBlt', 'WoodDeckSF', 'GarageQual', 'OpenPorchSF', 'GarageFinish',
    'CentralAir', 'BsmtFullBath', 'GarageCars', 'HouseStyle', 'MoSold',
    'YrSold', 'BedroomAbvGr', 'MiscVal', 'SaleCondition', '3SsnPorch', 'Electrical',
    'GarageCond', 'PavedDrive', 'Fireplaces', 'GarageArea', 'TotRmsAbvGrd',
    'KitchenAbvGr', 'HalfBath', 'Functional', 'BsmtFinType2', 'EnclosedPorch',
    'SaleType', 'HeatingQC', 'BsmtHalfBath', 'GarageType','MasVnrArea','BsmtExposure',
    'ScreenPorch', 'GarageFinish','BsmtFinSF2']
#'Street', 'MasVnrArea*', 'OverallCond*', 'ExterCond', 'BsmtExposure*','YearRemodAdd*'
#    'RoofStyle', 'LandSlope', 'YearBuilt', 'HouseStyle', 'Exterior1st','Id'

# Drop the low-importance features from the dummies DataFrame
dummies = Xf.drop(columns=features_to_remove)
# Define target vector
y = Xf["SalePrice"].ravel()
y[:5]
dummies = pd.get_dummies(dummies, columns=['MSZoning','LotConfig','LotFrontage','LotShape','LandContour',
                       'Utilities','LandContour','Neighborhood','RoofStyle','KitchenQual',
                       'Condition1','Condition2','BldgType','ExterCond','LandSlope',
                      'RoofMatl','Exterior2nd','ExterQual','Street',
                      'Foundation','BsmtQual','BsmtCond','BsmtFinType1','Heating'
                  ], dtype=float)


# Split your data into train and test sets (assuming you have them)
X_train, X_test, y_train, y_test = train_test_split(dummies, y, test_size=0.2, random_state=74)

# Train a Random Forest model on the updated dataset
rf_model_updated = RandomForestRegressor(random_state=42)
rf_model_updated.fit(X_train, y_train)

# Make predictions on the test set
y_pred_updated = rf_model_updated.predict(X_test)

# Calculate the Mean Absolute Error (MAE) for the updated model
mae_updated = mean_absolute_error(y_test, y_pred_updated)
print(f"Updated Model MAE: {mae_updated}")



Updated Model MAE: 264.4312328767123


In [24]:
# List the top 10 most important features with their importance scores
feature_importances = list(zip(rf_model_updated.feature_importances_, dummies.columns))
top_10_features = sorted(feature_importances, reverse=True)[:80]

# Print the top 10 most important features in a similar format
print("# List the top 10 most important features")
for i, (importance, feature) in enumerate(top_10_features, start=1):
    print(f"{i}. {feature}: {importance:.6f}")


# List the top 10 most important features
1. SalePrice: 0.992742
2. GrLivArea: 0.001646
3. LotConfig_Corner: 0.001383
4. Neighborhood_NoRidge: 0.000875
5. 2ndFlrSF: 0.000692
6. 1stFlrSF: 0.000648
7. TotalBsmtSF: 0.000231
8. RoofStyle_Gable: 0.000168
9. Exterior2nd_ImStucc: 0.000156
10. RoofMatl_CompShg: 0.000142
11. BsmtFinSF1: 0.000127
12. LotFrontage_104.0: 0.000123
13. LotArea: 0.000117
14. LotFrontage_160.0: 0.000117
15. BsmtUnfSF: 0.000080
16. Neighborhood_StoneBr: 0.000078
17. RoofStyle_Hip: 0.000070
18. MSSubClass: 0.000051
19. PoolArea: 0.000043
20. OverallCond: 0.000040
21. LotShape_Reg: 0.000039
22. Exterior2nd_CmentBd: 0.000036
23. LotFrontage_50.0: 0.000035
24. BsmtFinType1_Unf: 0.000030
25. ExterQual_Gd: 0.000024
26. FullBath: 0.000021
27. BsmtFinType1_GLQ: 0.000019
28. LotConfig_CulDSac: 0.000019
29. MSZoning_C (all): 0.000018
30. LotShape_IR1: 0.000017
31. OverallQual: 0.000014
32. LotFrontage_63.0: 0.000013
33. LandContour_HLS: 0.000010
34. LotFrontage_107.0: 0.000010
3

In [25]:
best_mae = float('inf')  # Initialize with a high value
best_random_state = None

for random_state in range(100):  # Try different random states
    # Split your data into train and test sets using the current random state
    X_train, X_test, y_train, y_test = train_test_split(dummies, y, test_size=0.2, random_state=random_state)

    # Train a Random Forest model on the updated dataset
    rf_model_updated = RandomForestRegressor(random_state=42)
    rf_model_updated.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred_updated = rf_model_updated.predict(X_test)

    # Calculate the Mean Absolute Error (MAE) for the updated model
    mae_updated = mean_absolute_error(y_test, y_pred_updated)
    
    # Check if this MAE is better than the current best
    if mae_updated < best_mae:
        best_mae = mae_updated
        best_random_state = random_state

print(f"Best Random State: {best_random_state}")
print(f"Best Model MAE: {best_mae}")


KeyboardInterrupt: 

In [26]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="LotConfig",
    y="SalePrice",
    title="Home Sale Price Based on LotConfig"
    
).opts(xrotation=45)

home_sale_plot


In [27]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.scatter(
    x="GrLivArea",
    y="SalePrice",
    title="Home Sale Price Based on Living Area Size"
    
).opts(xrotation=45)

home_sale_plot


In [42]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.scatter(
    x="2ndFlrSF",
    y="SalePrice",
    title="Home Sale Price Based on second floor size"
    
).opts(xrotation=45)

home_sale_plot


In [44]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.scatter(
    x="TotalBsmtSF",
    y="SalePrice",
    title="Home Sale Price Based on Basement Size"
    
).opts(xrotation=45)

home_sale_plot


In [43]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.scatter(
    x="1stFlrSF",
    y="SalePrice",
    title="Home Sale Price Based on first floor size"
    
).opts(xrotation=45)

home_sale_plot


In [46]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="RoofStyle",
    y="SalePrice",
    title="Home Sale Price Based on RoofStyle"
    
)

home_sale_plot


In [28]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="Neighborhood",
    y="SalePrice",
    title="Home Sale Price Based on Neighborhood"
    
).opts(xrotation=45)

home_sale_plot


In [29]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="MSSubClass",
    y="SalePrice",
    title="Home Sale Price Based on Building Type"
)
home_sale_plot


In [30]:
trained_data_sorted = trained_data.sort_values(by="OverallQual", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="OverallQual",
    y="SalePrice",
    title="Home Sale Price Based on Overall Quality"
)
home_sale_plot


In [31]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="Foundation",
    y="SalePrice",
    title="Home Sale Price Based on Foundation"
)
home_sale_plot


In [32]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="Condition2",
    y="SalePrice",
    title="Home Sale Price Based on Proximity to various conditions"
)
home_sale_plot


In [33]:
trained_data_cleaned = trained_data.dropna(subset = ['LotFrontage', 'SalePrice'])

In [34]:
trained_data_cleaned.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [35]:
#trained_data_sorted = trained_data.sort_values(by="LotFrontage", ascending=False)

home_sale_plot = trained_data_cleaned.hvplot.scatter(
    x="LotFrontage",
    y="SalePrice",
    title="Home Sale Price Based on Lot"
)
home_sale_plot


In [36]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="Condition1",
    y="SalePrice",
    title="Home Sale Price Based on proximity to main road or railroad"
)
home_sale_plot


In [37]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="Neighborhood",
    y="SalePrice",
    title="Home Sale Price Based on Neighborhood",
)
home_sale_plot.opts(xrotation=45)

home_sale_plot



In [38]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="ExterQual",
    y="SalePrice",
    title="Home Sale Price Based on Exterior Quality",
)

home_sale_plot


In [39]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="LotShape",
    y="SalePrice",
    title="Home Sale Price Based on Shape of Property",
)

home_sale_plot


In [40]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="LandContour",
    y="SalePrice",
    title="Home Sale Price Based on Flatness of property",
)

home_sale_plot


In [41]:
trained_data_sorted = trained_data.sort_values(by="SalePrice", ascending=False)

home_sale_plot = trained_data_sorted.hvplot.bar(
    x="Heating",
    y="SalePrice",
    title="Home Sale Price Based on Heating type",
)

home_sale_plot
