# Draft analysis 

---

Group name: Lena Breitberg, Ji Huixiao, Paraskevas Papadopoulos

---


## Introduction

*This section includes an introduction to the project motivation, data, and research question. Include a data dictionary.*

## Setup

In [355]:
import pandas as pd
import altair as alt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Data

### Import data

In [356]:
# Raw listings CSV, read directly from the project's GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/hdm-statistik-lpj/project/main/data/external/immobilienNuernberg.csv"

df = pd.read_csv(DATA_URL)

### Data structure

In [357]:
# Show the raw listings table as loaded.
df

Unnamed: 0,General.EstateTypeKey,General.ReferenceNumber,General.DistributionTypeKey,General.ConstructionYear,General.LivingSpace,Tealium.object_locationid,Tealium.object_address_is_visible,Tealium.object_zip,Tealium.object_currency,Tealium.object_features,...,USAGE,SUITABILITY,CONDITION,HOUSECONDITION,PARKINGSLOT,ENERGY,BUILDINGTYPE,HEATING,ELEVATOR,EQUIPMENTS
0,WOHNUNG,001/G1/E114,ZUM_KAUF,2023.0,247.07,493480,True,90482,EUR,"['Neubau', 'WANNE', 'gaestewc', 'Bad/WC getren...",...,,,Erstbezug,Neubau,Tiefgarage,,,Fußbodenheizung,Personenaufzug,
1,WOHNUNG,,ZUM_KAUF,1900.0,69.00,493430,False,90461,EUR,"['Dach ausgebaut', 'WANNE', 'gaestewc', 'Bad/W...",...,vermietet,WG-geeignet,renoviert / saniert,,,Gas,,Etagenheizung,,
2,WOHNUNG,,ZUM_KAUF,1972.0,69.46,493465,False,90473,EUR,"['WANNE', 'GEPFLEGT', 'FERN', 'LAMINAT', 'FLIE...",...,vermietet,,gepflegt,,Tiefgarage,Fernwärme,,,,
3,WOHNUNG,X4_301,ZUM_KAUF,2023.0,82.81,493524,True,90429,EUR,"['Neubau', 'DUSCHE', 'Personenaufzug', 'ERSTBE...",...,,,Erstbezug,Neubau,,Blockheizkraftwerk,KfW 55,Fußbodenheizung,Personenaufzug,
4,WOHNUNG,X4_204,ZUM_KAUF,2023.0,75.52,493524,True,90429,EUR,"['Neubau', 'DUSCHE', 'Personenaufzug', 'Erdges...",...,,,Erstbezug,Neubau,,Blockheizkraftwerk,KfW 55,Fußbodenheizung,Personenaufzug,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,WOHNUNG,001/G1/E119,ZUM_KAUF,2023.0,110.32,493480,True,90482,EUR,"['Neubau', 'WANNE', 'gaestewc', 'DUSCHE', 'Kel...",...,,,Erstbezug,Neubau,Tiefgarage,,,Fußbodenheizung,Personenaufzug,
957,WOHNUNG,001/G1/E72,ZUM_KAUF,2023.0,103.11,493480,True,90482,EUR,"['Neubau', 'barriefrei', 'WANNE', 'gaestewc', ...",...,,barrierefrei,Erstbezug,Neubau,Tiefgarage,,,Fußbodenheizung,Personenaufzug,
958,WOHNUNG,,ZUM_KAUF,1965.0,91.00,493430,True,90441,EUR,"['Personenaufzug', 'Zentralheizung', 'LINOLEUM...",...,,,,,,,,Zentralheizung,Personenaufzug,
959,WOHNUNG,64415430,ZUM_KAUF,1958.0,34.00,493479,True,90409,EUR,"['DUSCHE', 'Kelleranteil', 'GAS', 'Zentralheiz...",...,frei,,,,,Gas,,Zentralheizung,,möbliert


In [358]:
# Column names, non-null counts and dtypes of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 961 entries, 0 to 960
Data columns (total 49 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   General.EstateTypeKey                                  961 non-null    object 
 1   General.ReferenceNumber                                832 non-null    object 
 2   General.DistributionTypeKey                            961 non-null    object 
 3   General.ConstructionYear                               961 non-null    float64
 4   General.LivingSpace                                    961 non-null    float64
 5   Tealium.object_locationid                              961 non-null    int64  
 6   Tealium.object_address_is_visible                      961 non-null    bool   
 7   Tealium.object_zip                                     961 non-null    int64  
 8   Tealium.object_currency                           

### Data corrections

In [359]:
# Raw column -> analysis column name, listed in the order the columns should
# appear in df_correct. The target spellings are reproduced exactly as they
# are used elsewhere in this notebook.
selected_columns = {
    "General.ConstructionYear": "contructionYear",
    "General.LivingSpace": "livingSpace",
    "Tealium.object_count_photos": "photosCount",
    "LocalRatings.scores.local_amenities": "localAmenetiesScore",
    "LocalRatings.scores.mobility": "localMobilityScore",
    "Latitude": "latitude",
    "Longitude": "longitude",
    "ROOMS": "rooms",
    "PRICE_COMMONCHARGE": "priceCommoncharge",
    "Value": "energyConsumption",
    "ELEVATOR": "elevator",
    "EQUIPMENTS": "equipments",
    "PRICE": "price",
    "Class": "energyClass",
}

df_correct = (
    df[list(selected_columns)]
    .rename(columns=selected_columns)
    .assign(
        # Nullable integer dtype, so a missing photo count does not force floats.
        photosCount=lambda frame: frame["photosCount"].astype("Int64"),
        # Both local-rating scores are divided by 10.
        localAmenetiesScore=lambda frame: frame["localAmenetiesScore"] / 10,
        localMobilityScore=lambda frame: frame["localMobilityScore"] / 10,
    )
)

In [360]:
# Check non-null counts and dtypes of the selected columns.
df_correct.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 961 entries, 0 to 960
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   contructionYear      961 non-null    float64
 1   livingSpace          961 non-null    float64
 2   photosCount          960 non-null    Int64  
 3   localAmenetiesScore  958 non-null    float64
 4   localMobilityScore   958 non-null    float64
 5   latitude             961 non-null    float64
 6   longitude            961 non-null    float64
 7   rooms                961 non-null    float64
 8   priceCommoncharge    423 non-null    float64
 9   energyConsumption    727 non-null    float64
 10  elevator             464 non-null    object 
 11  equipments           84 non-null     object 
 12  price                961 non-null    float64
 13  energyClass          622 non-null    object 
dtypes: Int64(1), float64(10), object(3)
memory usage: 106.2+ KB


In [361]:
# Ordinal encoding of the energy class labels: "A_PLUS" -> 1 up to "H" -> 9.
energyClasses = {
    "A_PLUS": 1,
    "A": 2,
    "B": 3,
    "C": 4,
    "D": 5,
    "E": 6,
    "F": 7,
    "G": 8,
    "H": 9
}
# Encode from the untouched raw column (df["Class"]) instead of rewriting
# df_correct["energyClass"] from itself. That keeps the cell re-runnable: once
# the column holds numbers, the `.str` accessor is no longer available and the
# previous in-place version raised on a second run.
# `.map` produces a numeric column directly; listings without a class (or with
# a label that is not in the mapping) end up as NaN.
df_correct["energyClass"] = (
    df["Class"]
    .str.removeprefix("DE_ENV2014_CLASS_")
    .map(energyClasses)
)

In [362]:
# One-hot encode the "equipments" column, including an indicator column for
# missing values, and append the indicator columns after the remaining columns.
equipment_indicators = pd.get_dummies(
    df_correct["equipments"], prefix="equipments", dummy_na=True
)
df_correct = pd.concat(
    [df_correct.drop(columns=["equipments"]), equipment_indicators], axis=1
)

In [363]:
# Binary elevator flag: 1 when the raw ELEVATOR field is filled in, otherwise 0.
# Read from the raw frame rather than from df_correct["elevator"] itself: after
# the first run that column no longer contains nulls, so re-running the
# in-place version silently set every row to 1.
df_correct["elevator"] = df["ELEVATOR"].notna().astype(int)

### Variable lists

In [364]:
# For the plots only: keep listings priced below 80,000,000 so the extreme
# outlier does not stretch the price axis.
is_below_price_cap = df_correct["price"].lt(80_000_000)
df_correct_vis = df_correct[is_below_price_cap]

In [365]:
# Columns plotted on the y-axis; every other column of df_correct is treated
# as an x-axis variable, in its existing column order.
target_columns = ("price", "energyClass")
y_labels = list(target_columns)
x_labels = [name for name in df_correct.columns if name not in target_columns]


### Data splitting

## Analysis

### Descriptive statistics

In [366]:
# Summary statistics, transposed so that each variable is a row.
df_correct.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
contructionYear,961.0,1979.03538,36.413922,1872.0,1957.0,1976.0,2019.0,2025.0
livingSpace,961.0,77.198096,35.495184,19.35,56.58,72.51,90.0,532.0
photosCount,960.0,12.927083,9.509278,1.0,7.0,11.0,16.0,54.0
localAmenetiesScore,958.0,0.672192,0.166368,0.1,0.57,0.7,0.8,0.95
localMobilityScore,958.0,0.917996,0.09166,0.15,0.91,0.96,0.97,0.97
latitude,961.0,49.443274,0.024613,49.34839,49.42828,49.44504,49.46145,49.52838
longitude,961.0,11.082636,0.036428,11.00723,11.05372,11.08505,11.10224,11.26159
rooms,961.0,2.761186,1.215971,1.0,2.0,3.0,3.0,22.0
priceCommoncharge,423.0,259.189835,111.251378,65.0,179.5,246.0,320.0,1078.0
energyConsumption,727.0,102.342352,54.312412,0.0,67.6,103.3,135.25,326.0


### Exploratory data analysis

In [367]:
# Use Altair's HTML renderer for chart output.
alt.renderers.enable("html")

# Both axes are quantitative and do not force the scale to start at zero; the
# plotted field is filled in per panel by the column/row repeaters.
x_axis = alt.X(
    alt.repeat("column"),
    type="quantitative",
    scale=alt.Scale(zero=False),
)
y_axis = alt.Y(
    alt.repeat("row"),
    type="quantitative",
    scale=alt.Scale(zero=False),
)

# A single 150x150 scatter panel on the outlier-filtered data.
scatter_panel = (
    alt.Chart(df_correct_vis)
    .mark_circle()
    .encode(x=x_axis, y=y_axis)
    .properties(width=150, height=150)
)

# Grid of panels: one row per entry in y_labels, one column per entry in x_labels.
chart = scatter_panel.repeat(row=y_labels, column=x_labels)
chart

In [368]:
# Pairwise correlations of the numeric columns, shaded by value.
# NOTE(review): "Blues" is a sequential colormap, so strong negative
# correlations get the lightest shading — consider a diverging colormap
# centred on 0 if negative relationships should stand out as well.
df_correct.corr(numeric_only=True).style.background_gradient(cmap="Blues")

Unnamed: 0,contructionYear,livingSpace,photosCount,localAmenetiesScore,localMobilityScore,latitude,longitude,rooms,priceCommoncharge,energyConsumption,elevator,price,energyClass,equipments_möbliert,"equipments_möbliert, neuwertig",equipments_neuwertig,equipments_teilweise möbliert,"equipments_teilweise möbliert, neuwertig",equipments_nan
contructionYear,1.0,0.097712,0.000769,-0.159441,-0.173897,-0.050416,0.007825,-0.018218,0.236135,-0.59325,0.513437,0.067206,-0.650261,0.030853,0.023775,0.114398,-0.012416,0.004301,-0.091922
livingSpace,0.097712,1.0,0.158488,-0.054711,-0.045252,0.017941,0.022719,0.85347,0.588188,-0.126201,0.00566,0.224992,-0.110865,-0.091239,-0.096197,0.136161,-0.006778,0.021306,-0.011936
photosCount,0.000769,0.158488,1.0,-0.116974,-0.177677,-0.093733,0.159093,0.142482,0.083856,0.023861,0.094929,0.014377,0.014337,0.053944,-0.042078,0.038058,0.102877,-0.00249,-0.090422
localAmenetiesScore,-0.159441,-0.054711,-0.116974,1.0,0.668888,0.413507,-0.077323,-0.116746,-0.12566,0.038588,-0.07981,0.024274,0.086239,0.031409,0.046091,-0.016076,-0.014454,0.064373,-0.027851
localMobilityScore,-0.173897,-0.045252,-0.177677,0.668888,1.0,0.317165,-0.268571,-0.062603,-0.078038,0.061654,-0.005793,-0.00559,0.080427,0.042922,0.037954,-0.051735,0.010853,0.026155,-0.012408
latitude,-0.050416,0.017941,-0.093733,0.413507,0.317165,1.0,0.014648,-0.019756,0.054763,0.006024,0.018318,0.025015,-0.017397,0.091731,0.005873,-0.042056,0.030412,0.031802,-0.049354
longitude,0.007825,0.022719,0.159093,-0.077323,-0.268571,0.014648,1.0,0.025649,0.015183,0.172112,-0.000359,-0.042768,0.211484,0.001144,0.002836,-0.09436,-0.001717,-0.03455,0.068141
rooms,-0.018218,0.85347,0.142482,-0.116746,-0.062603,-0.019756,0.025649,1.0,0.462023,0.001152,-0.089424,0.04972,0.010492,-0.078055,-0.104801,0.074647,-0.001198,0.014211,0.021038
priceCommoncharge,0.236135,0.588188,0.083856,-0.12566,-0.078038,0.054763,0.015183,0.462023,1.0,-0.112491,0.218071,0.102348,-0.103634,0.03341,,0.164151,-0.029625,0.006194,-0.114512
energyConsumption,-0.59325,-0.126201,0.023861,0.038588,0.061654,0.006024,0.172112,0.001152,-0.112491,1.0,-0.374444,-0.058976,0.931486,-0.000418,-0.062438,-0.168554,0.016576,-0.061671,0.146112


### Relationships

## Model

### Select model

### Training and validation

### Fit model

### Evaluation on test set

### Save model



Save your model in the folder `models/`. Use a meaningful name and a timestamp.

## Conclusions