In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import numpy as np

filepath = './CleanData/NYC_MLdata.csv'
data = pd.read_csv(filepath)


In [13]:
#Rename column
data.rename(columns={'New Building Class Category': 'BUILDING CLASS'}, inplace=True)

#Use Label Encoder to transform categories into numbers
print(data['BUILDING CLASS'].unique())
label_encoder = LabelEncoder()
data['BUILDING CLASS'] = label_encoder.fit_transform(data['BUILDING CLASS'])
print(data['BUILDING CLASS'].unique())

print(data.info())

['A' 'B' 'C' 'D' 'R']
[0 1 2 3 4]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55742 entries, 0 to 55741
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   BOROUGH                    55742 non-null  int64  
 1   ZIP CODE                   55742 non-null  float64
 2   RESIDENTIAL UNITS          55742 non-null  float64
 3   COMMERCIAL UNITS           55742 non-null  float64
 4   TOTAL UNITS                55742 non-null  float64
 5   LAND SQUARE FEET           55742 non-null  float64
 6   GROSS SQUARE FEET          55742 non-null  float64
 7   TAX CLASS AT TIME OF SALE  55742 non-null  int64  
 8   SALE PRICE                 55742 non-null  int64  
 9   SALE PRICE_log             55742 non-null  float64
 10  LAND SQUARE FEET_log       55742 non-null  float64
 11  GROSS SQUARE FEET_log      55742 non-null  float64
 12  BUILDING CLASS             55742 non-null  int32  
 13  BUILDING AGE

In [14]:
# Select the categorical columns
categorical_columns = ['BOROUGH', 'TAX CLASS AT TIME OF SALE', 'BUILDING CLASS']

# Create a new DataFrame with the selected columns
data_categorical = data[categorical_columns]

# Perform one-hot encoding
onehot_encoder = OneHotEncoder(sparse=False)
encoded_features = onehot_encoder.fit_transform(data_categorical)

# Create a DataFrame from the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=onehot_encoder.get_feature_names_out(categorical_columns))

# Concatenate the encoded DataFrame with the original data
data_encoded = pd.concat([data.drop(columns=categorical_columns), encoded_df], axis=1)

# Print the updated DataFrame information
print(data_encoded.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55742 entries, 0 to 55741
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ZIP CODE                     55742 non-null  float64
 1   RESIDENTIAL UNITS            55742 non-null  float64
 2   COMMERCIAL UNITS             55742 non-null  float64
 3   TOTAL UNITS                  55742 non-null  float64
 4   LAND SQUARE FEET             55742 non-null  float64
 5   GROSS SQUARE FEET            55742 non-null  float64
 6   SALE PRICE                   55742 non-null  int64  
 7   SALE PRICE_log               55742 non-null  float64
 8   LAND SQUARE FEET_log         55742 non-null  float64
 9   GROSS SQUARE FEET_log        55742 non-null  float64
 10  BUILDING AGE                 55742 non-null  int64  
 11  BOROUGH_1                    55742 non-null  float64
 12  BOROUGH_2                    55742 non-null  float64
 13  BOROUGH_3       

In [15]:
#Splitting data into train/test split

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error,r2_score

X = data_encoded.drop(columns=['SALE PRICE', 'SALE PRICE_log', 'GROSS SQUARE FEET_log', 'LAND SQUARE FEET_log'])  # Drop the target column from the features
y = data_encoded['SALE PRICE']  # Extract the target column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform cross-validation on a linear regression model
from sklearn.linear_model import LinearRegression

model = LinearRegression()
#5 fold cross validation
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Calculate RMSE from negative MSE scores
rmse_scores = np.sqrt(-scores)


# Print the cross-validation scores
print("Cross-Validation Scores:")
print(rmse_scores)
print("Mean RMSE:", rmse_scores.mean())


# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print('R-squared:', r2)


Cross-Validation Scores:
[2604614.83731608 2599024.76034581 2822905.0472763  2682220.51429791
 2701326.53983896]
Mean RMSE: 2682018.33981501
R-squared: 0.5560344759287444


In [16]:
#Try scaling X_train using standardscalar

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

model = LinearRegression()

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE:', rmse)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print('R-squared:', r2)


RMSE: 2690560.4393006493
R-squared: 0.11766180649612568


In [17]:
#Testing for polynomial regression
from sklearn.preprocessing import PolynomialFeatures


X = data_encoded.drop(columns=['SALE PRICE', 'SALE PRICE_log', 'GROSS SQUARE FEET_log', 'LAND SQUARE FEET_log'])  # Drop the target column from the features
y = data_encoded['SALE PRICE']  # Extract the target column

poly_features = PolynomialFeatures(degree=2)  # Set the degree of the polynomial
X_poly = poly_features.fit_transform(X)  # Transform the input features to polynomial features

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE:', rmse)


# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print('R-squared:', r2)

#slightly better but not much than linear model.

RMSE: 2567104.756271735
R-squared: 0.19677585112650975


In [18]:
#Random Forest + Cross Validation

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

X = data_encoded.drop(columns=['SALE PRICE', 'SALE PRICE_log', 'GROSS SQUARE FEET_log', 'LAND SQUARE FEET_log'])
y = data_encoded['SALE PRICE']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor()

# Perform cross-validation
#scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Calculate the RMSE
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)


# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print('R-squared:', r2)


# Retrieve the feature importances
importances = model.feature_importances_

# Create a dataframe to display the feature importances
feature_importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
feature_importances.sort_values(by='Importance', ascending=False, inplace=True)

# Print the feature importances
print(feature_importances)

# Find the feature with the highest variance
highest_variance_feature = feature_importances['Feature'].iloc[0]
print(f"The feature with the highest variance is: {highest_variance_feature}")




Root Mean Squared Error: 1723583.4582745591
R-squared: 0.6379119357770606
                        Feature  Importance
5             GROSS SQUARE FEET    0.282606
0                      ZIP CODE    0.189567
4              LAND SQUARE FEET    0.157737
6                  BUILDING AGE    0.123315
3                   TOTAL UNITS    0.087493
1             RESIDENTIAL UNITS    0.077107
18             BUILDING CLASS_3    0.025085
9                     BOROUGH_3    0.012009
15             BUILDING CLASS_0    0.009014
2              COMMERCIAL UNITS    0.008907
19             BUILDING CLASS_4    0.005763
10                    BOROUGH_4    0.005516
13  TAX CLASS AT TIME OF SALE_2    0.004892
14  TAX CLASS AT TIME OF SALE_4    0.003200
17             BUILDING CLASS_2    0.002290
12  TAX CLASS AT TIME OF SALE_1    0.001985
8                     BOROUGH_2    0.001860
7                     BOROUGH_1    0.000827
16             BUILDING CLASS_1    0.000753
11                    BOROUGH_5    0.000073
Th

In [22]:
#Training XGBoost Model
import xgboost as xgb

# Create an XGBoost Regressor model
xgb_model = xgb.XGBRegressor()

# Fit the model on the training data
xgb_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xgb_model.predict(X_test)

# Calculate the mean squared error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("XGBoost MSE:", rmse)


# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print('R-squared:', r2)

#XGBOOst produces result as good as Random Forest

XGBoost MSE: 1773555.3341575922
R-squared: 0.6166115192734376


In [20]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler


# Create the KNN regression model
model = KNeighborsRegressor(n_neighbors=5)

# Create the StandardScaler
scaler = StandardScaler()

# Scale the features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the model on the scaled training data
model.fit(X_train_scaled, y_train)

# Make predictions on the scaled test data
y_pred = model.predict(X_test_scaled)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE:', rmse)



# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print('R-squared:', r2)


RMSE: 1908533.7139365417
R-squared: 0.5560344759287444


In [None]:
#After comparing the RMSE and R-squared of Linear Regression, Polynomial Regression, Random Forest, XGBoost and KNNRegressor
# The optimal method producing the smallest RMSE and highest R-squared is Random Forest Regressor.