In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from google.colab import files
import io

# Open Colab's upload widget; the result maps each uploaded file name to its
# raw bytes.
uploaded = files.upload()
if not uploaded:
    # With an empty result (e.g. if the upload dialog is cancelled) the
    # next(iter(...)) below would fail with a bare, message-less StopIteration.
    raise ValueError('No file was uploaded; re-run this cell and select the dataset CSV.')

# Only the first uploaded file is read; any additional files are ignored.
filename = next(iter(uploaded))
# The bytes are decoded as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
data = pd.read_csv(io.BytesIO(uploaded[filename]), encoding='ISO-8859-1')

Saving MSCI436_Dataset.csv to MSCI436_Dataset.csv


In [4]:
# Label to predict and the input columns fed to the model.
target = 'price'
features = [
    'city', 'province', 'lease_term', 'type',
    'beds', 'baths', 'sq_feet',
    'furnishing', 'Pets',
]

# A row without a price has no label, so it cannot be used for training or scoring.
data = data.dropna(subset=[target])

# Remove the column whose own header says not to use it. errors='ignore' makes
# this a no-op when the column is absent from the uploaded file.
data = data.drop(columns=['sq_feet (don’t use)'], errors='ignore')

def remove_outliers(df, column, factor=1.5, keep_na=False):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``column`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences. Larger values
        keep more rows.
    keep_na : bool, default False
        A missing value is never "inside" the fences, so by default rows with
        NaN in ``column`` are removed together with the outliers. Pass
        ``True`` to keep those rows (e.g. so a later imputation step can fill
        them).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass the filter, with their original index.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = factor * (q3 - q1)

    # between() is inclusive on both ends and evaluates to False for NaN.
    inside = values.between(q1 - spread, q3 + spread)
    if keep_na:
        inside = inside | values.isna()
    return df[inside]

# Trim IQR outliers on the label first, then on square footage. Each pass
# recomputes its quartiles on whatever rows survived the previous one.
# Note: rows whose value in the filtered column is NaN fail the range check
# and are removed as well.
for outlier_column in ('price', 'sq_feet'):
    data = remove_outliers(data, outlier_column)

# Split the cleaned frame into the label vector and the feature matrix.
y = data[target]
X = data[features]

# Column groups; each group gets its own preprocessing branch below.
categorical_features = ['city', 'province', 'lease_term', 'type']
numerical_features = ['beds', 'baths', 'sq_feet']
boolean_features = ['furnishing', 'Pets']

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are encoded as all zeros instead of
# raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numeric columns: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Two-valued flag columns: fill gaps with the most frequent value and encode
# as a single 0/1 column (drop='if_binary'). handle_unknown='ignore' mirrors
# the categorical branch: without it, a value that was absent from the
# training split makes transform()/predict() raise ValueError. Output for
# values seen during fit is unchanged.
# Combining `drop` with handle_unknown='ignore' requires scikit-learn >= 1.0.
boolean_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='if_binary', handle_unknown='ignore'))
])

# Route each column group through its branch. Columns not listed here are
# dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features),
        ('bool', boolean_transformer, boolean_features)
    ]
)

# Full model: preprocessing followed by ordinary least-squares regression.
# Keeping both in one Pipeline means the preprocessing is fit on training
# data only and is reapplied automatically at predict time.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pipeline.fit returns the fitted pipeline itself, so fitting on the training
# split and predicting on the hold-out split can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Average absolute difference between actual and predicted price on the
# hold-out rows.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 213.2743600133048


In [5]:
import joblib

# Serialize the whole fitted pipeline (preprocessing + regressor) so it can be
# reloaded with joblib.load and used for prediction without refitting.
model_path = 'lease_price_model.joblib'
joblib.dump(model, model_path)

['lease_price_model.joblib']