# Plaintext linear regression

This Jupyter notebook implements a plaintext linear regression model on the dataset "Apartment rental offers in Germany" from Kaggle (https://www.kaggle.com/datasets/corrieaar/apartment-rental-offers-in-germany). The code is based on the exercises of the module Advanced Machine Learning by Solange Emmenegger (Solange Emmenegger, Hochschule Luzern, Module Advanced Machine Learning, accessed on 19 April 2024 at https://gitlab.renku.hslu.ch/solange.emmenegger/ml-adml-hslu/-/tree/master/notebooks/03A%20Supervised%20Learning, and https://gitlab.renku.hslu.ch/solange.emmenegger/ml-adml-hslu/-/blob/master/notebooks/04B%20Gradient%20Descent/Gradient%20Descent.ipynb) and has been modified where necessary.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from tqdm.notebook import tqdm
from collections import defaultdict
from ipywidgets import interact

%matplotlib inline

In [None]:
# Load the raw listings.
df = pd.read_csv("immo_data.csv")

# Columns that are discarded before modelling.
columns_to_drop = [
    'serviceCharge', 'telekomTvOffer', 'noParkSpaces', 'pricetrend',
    'regio3', 'heatingType', 'telekomUploadSpeed',
    'telekomHybridUploadSpeed', 'lastRefurbish', 'newlyConst',
    'picturecount', 'firingTypes', 'hasKitchen', 'geo_bln', 'cellar',
    'yearConstructedRange', 'baseRent', 'houseNumber', 'geo_krs',
    'interiorQual', 'petsAllowed', 'street', 'streetPlain',
    'baseRentRange', 'geo_plz', 'thermalChar', 'floor', 'numberOfFloors',
    'noRoomsRange', 'garden', 'livingSpaceRange', 'description',
    'facilities', 'heatingCosts', 'energyEfficiencyClass',
    'electricityBasePrice', 'electricityKwhPrice', 'date',
]
df = df.drop(columns=columns_to_drop)

# Cast the remaining non-numeric columns: region/condition/flat type become
# categoricals, the balcony/lift flags become integers.
# NOTE: regio2 is converted from its own values; it was previously
# overwritten with a copy of regio1, which discarded the regio2 data.
df = df.astype({
    'regio1': 'category',
    'regio2': 'category',
    'balcony': int,
    'lift': int,
    'condition': 'category',
    'typeOfFlat': 'category',
})

# Drop every row with a missing value in any remaining column.
df = df.dropna()

# Keep rows with a non-zero rent and living space whose construction year
# lies strictly between 1940 and 2021.
df = df[
    (df['totalRent'] != 0)
    & (df['livingSpace'] != 0)
    & (df['yearConstructed'] > 1940)
    & (df['yearConstructed'] < 2021)
]
numerical_cols = ['totalRent', 'yearConstructed', 'livingSpace', 'noRooms']

# Remove outliers: a row is kept only if none of its numerical values exceeds
# the column's upper fence (third quartile + 1.5 * interquartile range).
# Only the upper side is filtered; no lower fence is applied.
summary = df.loc[:, numerical_cols].describe()
q3 = summary.loc['75%']
iqr = q3 - summary.loc['25%']
upper_boundary = q3 + 1.5*iqr

# Compare each numerical column against its own fence and require all four
# comparisons to hold for a row to survive.
within_fence = (df[numerical_cols] <= upper_boundary).all(axis=1)
df = df[within_fence]

# One-hot encode the categorical columns. Each dummy column is prefixed with
# the name of its source column (e.g. "regio1_<value>"), which keeps the
# column labels unique even when two categorical columns share values.
# The encoded source columns are removed by get_dummies itself, and the
# dummies are appended after the remaining columns in the order listed here.
categorical_cols = ['regio1', 'regio2', 'condition', 'typeOfFlat']
df = pd.get_dummies(df, columns=categorical_cols)

# The listing identifier is not used as a feature.
df = df.drop(columns=['scoutId'])

# Hold out 40 % of the rows as test set; the fixed seed makes the shuffled
# split reproducible.
train_rents, test_rents = train_test_split(
    df, test_size=0.4, shuffle=True, random_state=42
)

# Standardise every column (features and target alike). The scaler's
# statistics are fitted on the training rows only and then reused unchanged
# for the test rows.
scaler = StandardScaler()
scaler.fit(train_rents)
train_rents = pd.DataFrame(
    scaler.transform(train_rents),
    index=train_rents.index,
    columns=train_rents.columns,
)
test_rents = pd.DataFrame(
    scaler.transform(test_rents),
    index=test_rents.index,
    columns=test_rents.columns,
)

# Separate the (scaled) target column from the feature matrix as NumPy arrays.
y_train_rents = train_rents["totalRent"].values
y_test_rents = test_rents["totalRent"].values
X_train_rents = train_rents.drop(columns=["totalRent"]).values
X_test_rents = test_rents.drop(columns=["totalRent"]).values

