In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

In [2]:
rent_cast_data = pd.read_csv('rent_cast_data.csv')

rent_cast_data.head()

Unnamed: 0,city,zipCode,county,latitude,longitude,propertyType,bedrooms,bathrooms,squareFootage,lotSize,...,cooling_present,fireplace_present,garage_present,heating_present,pool_present,floorCount,garageSpaces,city_encoded,county_encoded,propertyType_encoded
0,Fort Myers,33916,Lee,26.652413,-81.855762,Condo,2.0,2.0,1003.0,44417.0,...,0,0,0,0,0,7.0,,42,17,1
1,Jacksonville,32277,Duval,30.363895,-81.610043,Single Family,3.0,2.0,1921.0,4464.0,...,1,0,1,1,0,2.0,2.0,61,9,4
2,Homestead,33032,Miami-Dade,25.532831,-80.379346,Single Family,3.0,2.5,1740.0,1870.0,...,0,0,0,0,0,2.0,,57,23,4
3,Jacksonville,32208,Duval,30.415227,-81.705227,Single Family,3.0,2.0,1196.0,42268.0,...,1,0,1,1,0,1.0,,61,9,4
4,Homestead,33033,Miami-Dade,25.486028,-80.423809,Single Family,3.0,2.0,1577.0,5032.0,...,0,0,1,0,1,1.0,2.0,57,23,4


In [3]:
cols = [ 
     'zipCode',
     'latitude',
    'longitude',
     'bedrooms',
     'bathrooms',
     'squareFootage',
     'lotSize',
     'yearBuilt',    
     'lastSalePrice',
     'm_rate',
    'cooling_present',
    'fireplace_present',
    'garage_present',
    'heating_present',
    'pool_present',
    'floorCount',
    'garageSpaces',
    'city_encoded',
    'county_encoded',
    'propertyType_encoded',
]

# # Applying pd.get_dummies to the selected columns and overwrite rent_cast_data
rent_cast_data= pd.get_dummies(rent_cast_data[cols])

# # Display the first few rows of the resulting dataframe
rent_cast_data.head()

Unnamed: 0,zipCode,latitude,longitude,bedrooms,bathrooms,squareFootage,lotSize,yearBuilt,lastSalePrice,m_rate,cooling_present,fireplace_present,garage_present,heating_present,pool_present,floorCount,garageSpaces,city_encoded,county_encoded,propertyType_encoded
0,33916,26.652413,-81.855762,2.0,2.0,1003.0,44417.0,2006.0,315000.0,0.0682,0,0,0,0,0,7.0,,42,17,1
1,32277,30.363895,-81.610043,3.0,2.0,1921.0,4464.0,1992.0,345000.0,0.0682,1,0,1,1,0,2.0,2.0,61,9,4
2,33032,25.532831,-80.379346,3.0,2.5,1740.0,1870.0,2016.0,445000.0,0.0682,0,0,0,0,0,2.0,,57,23,4
3,32208,30.415227,-81.705227,3.0,2.0,1196.0,42268.0,2022.0,236017.0,0.0682,1,0,1,1,0,1.0,,61,9,4
4,33033,25.486028,-80.423809,3.0,2.0,1577.0,5032.0,2006.0,380000.0,0.0682,0,0,1,0,1,1.0,2.0,57,23,4


In [4]:
X = rent_cast_data[[
    'zipCode', 'latitude', 'longitude', 'bedrooms', 'bathrooms', 'squareFootage',
    'lotSize', 'floorCount', 'yearBuilt', 'pool_present', 'cooling_present',
    'heating_present', 'fireplace_present', 'garage_present', 'garageSpaces',
    'm_rate', 'city_encoded', 'county_encoded', 'propertyType_encoded'  # these must exist
]]
y = rent_cast_data['lastSalePrice']  # Target (price)

In [5]:
#X = rent_cast_data.drop('lastSalePrice', axis=1) 
#y = rent_cast_data['lastSalePrice']  # Target (price)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

In [7]:
X_train_encoded = pd.get_dummies(X_train)

X_test_encoded = pd.get_dummies(X_test)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [8]:
xgb_model = XGBRegressor(random_state = 42)
xgb_model.fit(X_train_encoded, y_train)

In [9]:
y_pred_xgb = xgb_model.predict(X_test_encoded)

In [10]:
print(f'R^2 score: {r2_score(y_test, y_pred_xgb)}')
print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred_xgb)}')

R^2 score: 0.7276336577720799
Mean Squared Error: 7884656827.32213


In [11]:
from joblib import dump

dump(xgb_model, open("xgb_model_default_42.sav", "wb"))

print("Model saved successfully using joblib.")

Model saved successfully using joblib.
