# ANALIZA DANYCH

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

In [None]:
# Load the listings table and discard its "id" column in one step.
df = pd.read_csv("data.csv").drop(columns=["id"])

# Mapbox access token passed to the map layout; add mapbox token here.
MAPBOX_ACCESS_TOKEN = ""

### Rozmiar zbioru danych

In [None]:
# Report the table dimensions: rows are observations, columns are variables.
n_observations, n_variables = df.shape
print("Liczba obserwacji:", n_observations, "\nLiczba zmiennych:", n_variables)

In [None]:
# Map-centre coordinates for every city slug, stored as (latitude, longitude).
centers = {
    "warszawa": (52.247160, 21.023756),
    "krakow": (50.054067, 19.939634),
    "poznan": (52.407567, 16.933073),
    "wroclaw": (51.109861, 17.027047),
    "gdansk": (54.366607, 18.626162),
    "gdynia": (54.510521, 18.506044),
    "szczecin": (53.424296, 14.578268),
    "bialystok": (53.133166, 23.159154),
    "katowice": (50.220179, 19.019888),
    "lublin": (51.237375, 22.564449),
    "lodz": (51.764111, 19.481958),
}

# Integer identifier assigned to every city slug.
# NOTE(review): presumably the `city_id` used in the boundaries data -- confirm.
cities = {
    "warszawa": 1,
    "krakow": 6,
    "poznan": 7,
    "wroclaw": 5,
    "gdansk": 2,
    "gdynia": 3,
    "szczecin": 9,
    "bialystok": 13,
    "katowice": 14,
    "lublin": 12,
    "lodz": 8,
}

In [None]:
from shapely import wkb

def get_map_shapes(df_shapes):
    """Expand hex-WKB boundary geometries into one row per outline vertex.

    Args:
        df_shapes: DataFrame with the columns ``boundaries`` (hex-encoded
            WKB geometry), ``geo_object_type``, ``slug``, ``city_id`` and
            ``id``. Any index is accepted; rows are read in positional
            order.

    Returns:
        A DataFrame with the columns ``longitude``, ``latitude``,
        ``geo_object_type``, ``slug``, ``id`` and ``city_id``. Each input
        row contributes one output row per coordinate of its geometry's
        exterior ring, with the remaining columns repeated. An empty input
        yields an empty frame with the same columns.
    """
    columns = [
        "longitude",
        "latitude",
        "geo_object_type",
        "slug",
        "id",
        "city_id",
    ]
    frames = []
    # Iterate over the column values directly instead of `.at[i, ...]`, so
    # the function does not depend on `df_shapes` having a 0..n-1 index.
    for boundary, geo_object_type, slug, city_id, shape_id in zip(
        df_shapes["boundaries"],
        df_shapes["geo_object_type"],
        df_shapes["slug"],
        df_shapes["city_id"],
        df_shapes["id"],
    ):
        shape = wkb.loads(str(boundary), hex=True)
        # NOTE(review): only `.exterior` is read, so a geometry without that
        # attribute would raise here -- confirm the data holds plain polygons.
        df_coord = pd.DataFrame(
            list(shape.exterior.coords), columns=["longitude", "latitude"]
        )
        df_coord["geo_object_type"] = geo_object_type
        df_coord["slug"] = slug
        df_coord["city_id"] = city_id
        df_coord["id"] = shape_id
        frames.append(df_coord)

    if not frames:
        return pd.DataFrame(columns=columns)
    # A single concat at the end avoids re-copying the accumulated frame on
    # every iteration and keeps the per-column dtypes of the pieces.
    return pd.concat(frames, ignore_index=True)[columns]


# Read the raw boundaries table from districts_boundaries.csv and expand it
# with get_map_shapes; the result is kept in the module-level BOUNDARIES.
boundaries = pd.read_csv("districts_boundaries.csv")
BOUNDARIES = get_map_shapes(boundaries)

In [None]:
# Boundary `id` values (compared against BOUNDARIES["id"]) that get a filled
# outline on the map. The keys are the same integers as the values of the
# `cities` mapping.
# NOTE(review): presumably city id -> id of that city's central district.
# This mapping deliberately has its own name: binding it to `centers` would
# replace the (latitude, longitude) table that map_offers reads below.
CENTER_DISTRICTS = {
    1: 5,
    6: 139,
    7: 184,
    5: 130,
    2: 56,
    3: 59,
    8: 201,
    14: 278,
    9: 206,
    12: 208,
    13: 235,
}

# Map centre (latitude, longitude) used when no single city is selected.
_DEFAULT_MAP_CENTER = (52.065221, 19.252482)


def map_offers(df, city, color):
    """Build a Mapbox figure with boundary outlines and listing markers.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns; every row
            is drawn as a red marker.
        city: Key of the module-level ``centers`` mapping used to centre the
            map, or ``"all"`` for the fixed default centre. A falsy value
            (``None``, ``""``) skips the boundary outlines and also uses the
            default centre.
        color: Fill colour applied to the outlines whose id is listed in
            ``CENTER_DISTRICTS``.

    Returns:
        The configured ``plotly.graph_objects.Figure``.

    Raises:
        KeyError: If ``city`` is truthy, not ``"all"`` and missing from
            ``centers``.
    """
    fig = go.Figure()
    if city:
        highlighted_ids = set(CENTER_DISTRICTS.values())
        for district_id in BOUNDARIES["id"].unique():
            # Filter once per district instead of once per coordinate axis.
            outline = BOUNDARIES[BOUNDARIES["id"] == district_id]
            is_highlighted = district_id in highlighted_ids
            fig.add_trace(
                go.Scattermapbox(
                    lat=outline["latitude"],
                    lon=outline["longitude"],
                    mode="lines",
                    fill="toself" if is_highlighted else None,
                    fillcolor=color if is_highlighted else None,
                    line={"color": "grey"},
                    showlegend=False,
                ),
            )
    fig.add_trace(
        go.Scattermapbox(
            name=city,
            lat=df["latitude"],
            lon=df["longitude"],
            mode="markers",
            marker=go.scattermapbox.Marker(
                size=5, color="red", opacity=0.8
            ),
        )
    )

    if city and city != "all":
        center_lat, center_lon = centers[city]
    else:
        center_lat, center_lon = _DEFAULT_MAP_CENTER

    return fig.update_layout(
        hovermode="closest",
        mapbox=dict(
            accesstoken=MAPBOX_ACCESS_TOKEN,
            bearing=0,
            center=go.layout.mapbox.Center(lat=center_lat, lon=center_lon),
            pitch=0,
            # NOTE(review): the zoom level is the same for a single city and
            # for the whole-country view.
            zoom=5,
        ),
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
        mapbox_style="light",
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
    )

In [None]:
def _row_major_grid(names, n_cols):
    """Map each name to its 1-based ``[row, col]`` cell, filling row by row."""
    return {
        name: [index // n_cols + 1, index % n_cols + 1]
        for index, name in enumerate(names)
    }


# Columns plotted as histograms, placed on a two-column subplot grid.
HISTOGRAMS_VARIABLES = _row_major_grid(
    ("price", "size", "floor", "floors", "rooms", "year_built"),
    n_cols=2,
)

# Columns plotted as bar charts, placed on a three-column subplot grid.
BAR_VARIABLES = _row_major_grid(
    (
        "standard",
        "balcony",
        "furnished",
        "parking",
        "elevator",
        "separate_kitchen",
        "storage",
        "internet",
        "tv",
    ),
    n_cols=3,
)

In [None]:
def graph_listings_histograms(df, variables):
    """Draw one histogram per column, each with a dashed line at its mean.

    Args:
        df: DataFrame containing every column named in ``variables``.
        variables: Mapping of column name to its 1-based ``[row, col]``
            subplot position.

    Returns:
        A ``plotly`` figure whose subplot grid is just large enough for the
        positions in ``variables``; each subplot's x axis is titled with its
        column name.
    """
    # Size the grid from the requested positions so that no empty row is
    # rendered and any number of variables is supported.
    n_rows = max((row for row, _ in variables.values()), default=1)
    n_cols = max((col for _, col in variables.values()), default=1)
    fig = make_subplots(rows=n_rows, cols=n_cols)

    for var, (row, col) in variables.items():
        # Rows holding an empty string in this column are left out.
        values = df.loc[df[var] != "", var]
        mean = values.mean()
        fig.add_trace(
            go.Histogram(
                x=values,
                autobinx=False,
                nbinsx=50,
            ),
            row=row,
            col=col,
        )
        fig.add_vline(
            x=mean,
            annotation_text="Mean: " + str(round(mean, 2)),
            annotation_position="top",
            line_width=2,
            line_dash="dash",
            line_color="black",
            row=row,
            col=col,
        )
        # Title the axis through its grid position rather than a hard-coded
        # xaxisN key, so the title always lands on the matching subplot.
        fig.update_xaxes(title_text=var, row=row, col=col)

    fig.update_layout(
        height=1000,
        margin={"r": 0, "t": 20, "l": 0, "b": 0},
        showlegend=False,
        bargap=0.1,
    )
    return fig

In [None]:
def graph_listings_bar(df, variables):
    """Draw one bar chart of value counts per column.

    The ``"standard"`` column is counted over the categories ``"niski"``,
    ``"dobry"`` and ``"wysoki"``; every other column is counted over
    ``True`` and ``False``.

    Args:
        df: DataFrame containing every column named in ``variables``.
        variables: Mapping of column name to its 1-based ``[row, col]``
            subplot position.

    Returns:
        A ``plotly`` figure whose subplot grid is just large enough for the
        positions in ``variables``; each subplot's x axis is titled with its
        column name.
    """
    n_rows = max((row for row, _ in variables.values()), default=1)
    n_cols = max((col for _, col in variables.values()), default=1)
    fig = make_subplots(rows=n_rows, cols=n_cols)

    for var, (row, col) in variables.items():
        if var == "standard":
            labels = ["niski", "dobry", "wysoki"]
            counts = [int((df[var] == label).sum()) for label in labels]
        else:
            labels = ["True", "False"]
            # Element-wise comparison against the boolean values; rows that
            # are neither True nor False are not counted.
            counts = [int((df[var] == flag).sum()) for flag in (True, False)]
        fig.add_trace(go.Bar(x=labels, y=counts), row=row, col=col)
        # Title the axis through its grid position rather than a hard-coded
        # xaxisN key, so the title always lands on the matching subplot.
        fig.update_xaxes(title_text=var, row=row, col=col)

    # The layout is applied once, after all traces have been added.
    fig.update_layout(
        height=1000,
        margin={"r": 0, "t": 20, "l": 0, "b": 0},
        showlegend=False,
        bargap=0.1,
    )
    return fig

## Interaktywna mapa

In [None]:
# Show the listings map; "all" makes map_offers use its fixed default centre,
# and "yellow" is the fill colour handed to it for highlighted outlines.
map_offers(df, "all", "yellow")

### Zmienne ciągłe

In [None]:
# Show the histograms for the columns listed in HISTOGRAMS_VARIABLES.
graph_listings_histograms(df, HISTOGRAMS_VARIABLES)

### Zmienne dyskretne

In [None]:
# Show the bar charts for the columns listed in BAR_VARIABLES.
graph_listings_bar(df, BAR_VARIABLES)

## Statystyki opisowe

In [None]:
# Summary statistics with pandas' default column selection for describe()
# (numeric columns when the frame has any).
df.describe()

In [None]:
# Summary statistics restricted to the object-dtype columns.
df.describe(include='object')