In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Load the cleaned NYC rolling-sales extract into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/nyc-rolling-sales-cleaned.csv')

In [16]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,APARTMENT NUMBER,ZIP CODE,...,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,TAX CLASS AT PRESENT_Categorized
0,Manhattan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,,392,6,,C2,,10009,...,0,5,1633.0,6440.0,1900,2,C2,6625000.0,2017-07,"Class 2: Larger Residential Properties (e.g., ..."
1,Manhattan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,,402,21,,C4,,10009,...,0,10,2272.0,6794.0,1913,2,C4,3936272.0,2016-09,"Class 2: Larger Residential Properties (e.g., ..."
2,Manhattan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,,404,55,,C2,,10009,...,0,6,2369.0,4615.0,1900,2,C2,8000000.0,2016-11,"Class 2: Larger Residential Properties (e.g., ..."
3,Manhattan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,,406,32,,C4,,10009,...,0,8,1750.0,4226.0,1920,2,C4,3192840.0,2016-09,"Class 2: Larger Residential Properties (e.g., ..."
4,Manhattan,ALPHABET CITY,14 RENTALS - 4-10 UNIT,,391,19,,S3,,10009,...,1,4,1520.0,3360.0,1910,2,S3,3300000.0,2016-11,"Class 2: Larger Residential Properties (e.g., ..."


In [17]:
# 1

# Per the author's note, the column dtypes were already converted in an
# earlier step, so this cell only prints the resulting schema
# (column names, non-null counts and dtypes).

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27586 entries, 0 to 27585
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   BOROUGH                           27586 non-null  object 
 1   NEIGHBORHOOD                      27586 non-null  object 
 2   BUILDING CLASS CATEGORY           27586 non-null  object 
 3   TAX CLASS AT PRESENT              0 non-null      float64
 4   BLOCK                             27586 non-null  int64  
 5   LOT                               27586 non-null  int64  
 6   EASE-MENT                         27586 non-null  object 
 7   BUILDING CLASS AT PRESENT         27586 non-null  object 
 8   APARTMENT NUMBER                  27586 non-null  object 
 9   ZIP CODE                          27586 non-null  int64  
 10  RESIDENTIAL UNITS                 27586 non-null  int64  
 11  COMMERCIAL UNITS                  27586 non-null  int64  
 12  TOTA

In [18]:
#2

# Regression target: natural log of 'SALE PRICE'.
# np.log yields -inf for 0 and NaN for negative or missing values; those would
# only surface later as an opaque error when the model is fitted, so fail fast.
if not (df['SALE PRICE'] > 0).all():
    raise ValueError("'SALE PRICE' must be non-null and strictly positive to take its log")

df['target'] = np.log(df['SALE PRICE'])



# #2

If you keep 'SALE PRICE' in the model, the model will most likely overfit: the target is log('SALE PRICE'), so that feature would leak the target into the predictors, break the no-collinearity rule, and add redundant information.

In [26]:
#3
# Predictor columns used to build the design matrix
# (neither 'SALE PRICE' nor 'target' appears in this list).
features = [
    'BOROUGH',
    'NEIGHBORHOOD',
    'BUILDING CLASS CATEGORY',
    'BLOCK',
    'LOT',
    'BUILDING CLASS AT PRESENT',
    'RESIDENTIAL UNITS',
    'COMMERCIAL UNITS',
    'TOTAL UNITS',
    'LAND SQUARE FEET',
    'GROSS SQUARE FEET',
    'YEAR BUILT',
    'TAX CLASS AT PRESENT_Categorized',
]

# Select every row, restricted to the predictor columns
X = df.loc[:, features]




In [27]:
#4
# One-hot encode the non-numeric predictors, dropping the first level of each,
# then report how many columns the encoded matrix has.
X = pd.get_dummies(data=X, drop_first=True)
print(len(X.columns))


377


In [28]:
#5
from sklearn.model_selection import train_test_split

# Response vector: the log sale price stored in the 'target' column
y = df.loc[:, 'target']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1000,
)


In [29]:
#6

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit ordinary least squares on the training split (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Predicted log prices for the held-out rows
y_hat_test = model.predict(X_test)

# Actual vs. predicted values, indexed like y_test, plus residuals (actual - predicted)
log_predictions = pd.DataFrame({'y_test': y_test, 'y_hat_test': y_hat_test})
log_predictions['resid'] = log_predictions['y_test'] - log_predictions['y_hat_test']

# Root mean squared error on the test split
RMSE_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_hat_test))

print(f"RMSE_test: {RMSE_test}")


RMSE_test: 0.5280240478653218


**Improvement**

In [30]:
#7
from sklearn.model_selection import cross_val_score, KFold

# Fresh, unfitted estimator passed to both cross-validation runs
model = LinearRegression()

# Shuffled splitters with a fixed seed so the folds are reproducible
kf5 = KFold(n_splits=5, shuffle=True, random_state=1000)
kf10 = KFold(n_splits=10, shuffle=True, random_state=1000)

# Per-fold scores; scikit-learn reports MSE negated under this scoring name
cv_scores5 = cross_val_score(
    estimator=model, X=X, y=y, scoring='neg_mean_squared_error', cv=kf5
)
cv_scores10 = cross_val_score(
    estimator=model, X=X, y=y, scoring='neg_mean_squared_error', cv=kf10
)

# Average the fold scores, flip the sign back to a positive MSE, then take the root
RMSE_CV5 = np.sqrt(-np.mean(cv_scores5))
RMSE_CV10 = np.sqrt(-np.mean(cv_scores10))

print(f"RMSE_CV5: {RMSE_CV5}", f"RMSE_CV10: {RMSE_CV10}", sep="\n")


RMSE_CV5: 0.5776914910440117
RMSE_CV10: 0.580598102703626


In [38]:
# Number of observations in the data set
print(len(df))



27586


In [31]:
#8
# Side-by-side comparison of the three error estimates, one per line
print(
    f"RMSE_test: {RMSE_test}",
    f"RMSE_CV5: {RMSE_CV5}",
    f"RMSE_CV10: {RMSE_CV10}",
    sep="\n",
)



RMSE_test: 0.5280240478653218
RMSE_CV5: 0.5776914910440117
RMSE_CV10: 0.580598102703626


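As we add more folds we see a small increase in the RMSE (RMSE_CV5 ≈ 0.578 vs. RMSE_CV10 ≈ 0.581), and both cross-validated values are higher than RMSE_test (≈ 0.528). RMSE_test comes from a single 70-30 split, whereas RMSE_CV5 and RMSE_CV10 are averages over 5 and 10 folds in which each model is trained on 80% or 90% of the data and scored on a much smaller held-out fold (20% or 10%). Because the cross-validation training sets are actually larger than the 70% used for the single split, the lower RMSE_test most likely reflects that one particular split rather than the amount of training data. My takeaway is that with a large dataset that contains a diverse and complete makeup of your observations, a single train/test split is often a reasonable and cheaper choice, but its score depends on which rows happen to land in the test set, so the cross-validated RMSE is the more conservative estimate of model performance.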