In [1]:
from typing import List, Tuple

import math
import pandas as pd
import numpy as np
import spacy
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
import os
data_path = "../data/"
keplergl_config_path = "../keplergl_config/"
os.listdir("../data")

['arc_wg_gesucht.csv',
 'gender_data.csv',
 'leipzigortsteile.json',
 'places.json',
 'places_world.json',
 'Summary_subpages_w_location3.csv',
 'Summary_subpages_w_location_Backup_06_06.txt',
 'wg_gesucht.csv']

In [3]:
# df = pd.read_csv(f"{data_path}wg_gesucht.csv")

In [19]:
df = pd.read_csv(f"{data_path}Summary_subpages_w_location3.csv", sep=',')
df = df.convert_dtypes()
for column in ["free_from"]:
    df[column] = df[column].astype(str).str.zfill(8)  # Füge führende Nullen hinzu, falls erforderlich
    df[column] = df[column] + f" 00:00:{np.random.randint(1, 59)}"
    df[column] = pd.to_datetime(df[column], format="%d.%m.%Y %H:%M:%S")



# NLP

In [21]:
nlp = spacy.load("de_core_news_lg")

In [22]:
# Erstelle eine leere Spalte mit dem Namen "full_description"
df['full_description'] = np.nan

# Kombiniere die Werte der ausgewählten Spalten zu einer neuen Spalte "full_description"
df['full_description'] = df[['room_description', 'location_description', 'flat_live', 'additional']].apply(
    lambda x: ' '.join([str(i) for i in x if not pd.isna(i)]), axis=1
)

## Places, Persons, Organisationen hinzufühgen

In [None]:
# Ausgabe der verfügbaren Labels
labels = nlp.get_pipe("ner").labels
print(labels)

In [23]:
def extract_entities(text: str)-> Tuple[List[str], List[str], List[str]]:
    """Extrahiert Orte, Personen und Organisationen aus dem gegebenen Text.

    Args:
        text (str): Der Text, aus dem die Entitäten extrahiert werden sollen.

    Returns:
        Tuple[List[str], List[str], List[str]]: Ein Tupel, das die extrahierten Entitäten enthält.
            Das erste Element ist eine Liste von Orten, das zweite Element eine Liste von Personen
            und das dritte Element eine Liste von Organisationen.
    """
    doc = nlp(text)
    places = [token.text for token in doc.ents if token.label_ == "LOC"]
    persons = [token.text for token in doc.ents if token.label_ == "PER"]
    organizations = [token.text for token in doc.ents if token.label_ == "ORG"]
    return places, persons, organizations

# Anwenden der Spacy-Pipeline auf die Spalte "full_description"
df[["places", "persons", "organizations"]] = df["full_description"].apply(extract_entities).apply(pd.Series)

### Places mit Koordinaten versehen

In [None]:
unique_places = set()

for places_list in df['places']:
    unique_places.update(places_list)
print(len(unique_places), unique_places)

In [None]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from time import sleep

# Erzeuge einen Nominatim-Geocoder
geolocator = Nominatim(user_agent='my_app')

# Funktion zum Geokodieren mit Rückversuchen bei Zeitüberschreitung
def geocode_with_retry(location):
    # city = "Leipzig"
    # country = "Deutschland"
    location = f"{location}"
    max_retries = 3
    retry_count = 0
    while retry_count < max_retries:
        try:
            return geolocator.geocode(location)
        except GeocoderTimedOut:
            print(f"Timeout{retry_count}: {location}")
            sleep(2 + retry_count)
            retry_count += 1
        except:
            print(f"Fehler{retry_count}: {location}")
            break
    return None


places_dict = {}
count1 = 0
count2 = 0
# for location in unique_places:

#     location_data = geocode_with_retry(location)

#     if location_data:
#         places_dict[location] = {
#             "status": True,
#             "lon": location_data.longitude,
#             "lat": location_data.latitude,
#             "raw": location_data.raw
#         }
#         count1 += 1
#     else:
#         places_dict[location] = {
#             "status": False,
#             "lon": 0.0,
#             "lat": 0.0,
#             "raw": {}
#         }
#         count2 += 1


In [25]:
with open(f"{data_path}places.json", "r", encoding='utf-8') as file:
    places_lpz = json.load(file)

In [26]:
clean_places = {}
for key, value in places_lpz.items():
    if value["stauts"] == True:
        clean_places[key] = value

In [None]:
for key, value in clean_places.items():
    if value['lat'] > 52 or value['lat'] < 50:
        print(f"{key}: {value['lat']}, {value['lon']}")
    elif value['lon'] > 13 or value['lon'] < 11:
        print(f"{key}: {value['lat']}, {value['lon']}")

In [27]:
my_dict = {
    "Hauptbahnhof": ["Leipziger Hbf.", "Hbf.", "CENTRALSTATION", "Hauptbahhof"],
    # "öpnv": ["Linie 90", "straßenbahn/bus", "linie", "tram", "bus", "bahn", "nahverkehr", "nahverkehrstechnisch", "S3", "Straßenbahn 14", "Straßenbahn Linie 14", "Straßenbahn 1", "Öffis", "S-Bahn/Straßenbahn", "Straßenbahn Linie 16"],
    "Völkerschlachtdenkmal": ["Völki"],
    # "staete": ["Mainz"],
    "Clara-Zetkin-Park": ["Clarapark", "Clara Park", "Clara-Zetkin\" city park", "Clara-Zetkin-Park befindetsich"],
    "Markkleeberg See": [],
    "Cospudener See": ["Cossi"],
    "Rosental": [],
    "Eisenbahnstraße": ["Eisi" ],
    "Wilhelm-Leuschner Platz": ["Wilhelm-Leuschner-Platz läufst du", "Wilhelm-Leuchner"],
    "Jahnallee": ["Jahnallee-Kampus", "Sport-Uni", "RedBull Arena"],
    "universität leipzig": ["Uni Hauptgebäude", "MDR Turm", "Mensa"],
    "Kulkwitzer": [],
    "Lene-Voigt-Park": ["Lenepark"],
    "universität leipzig": ["Zoo für Kulturprogramm", "Leipziger Zoo"],
    "Bayrischer Bahnhof": ["DerBayrische Bahnhof", "Bayrische Bahnhof"],
    # "Lebensmittel": ["Penny", "Norma", "Netto", "Rossman", "Edeka", "Aldi", "Konsum"],
 }


In [None]:
# Überprüfen ob auch alle keys in clean_places vorkommen
for key, value in my_dict.items():
    clean_places[key]

In [28]:
import difflib

def calculate_similarity(string1, string2):
    similarity = difflib.SequenceMatcher(None, string1.lower(), string2.lower()).ratio()
    return similarity

def map_similarity(string1, string2, threshold=0.8):
    similarity = calculate_similarity(string1, string2)
    if similarity >= threshold:
        return True
    else:
        return False

# Beispielaufruf
string1 = "Karl Heine Straße"
string2 = "Karl-Heine-Straße"
threshold = 0.8
result = map_similarity(string1, string2, threshold)
print(result)


True


In [29]:
def distance(lat1, lon1, lat2, lon2):
    # Umrechnung in Bogenmaß
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Deltas der Koordinaten
    delta_lat = lat2_rad - lat1_rad
    delta_lon = lon2_rad - lon1_rad

    # Haversine-Formel
    a = math.sin(delta_lat/2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    
    # Erdradius in Metern
    radius = 6371000
    
    # Berechnung der Distanz
    distance = radius * c

    return distance, 1/distance if distance > 0 else 0.0

# Beispielaufruf
lat1 = 51.338904
lon1 = 12.323058
lat2 = 51.340012
lon2 = 12.334702

result = distance(lat1, lon1, lat2, lon2)
print("Die Distanz zwischen den Koordinaten beträgt:", result, "Meter")

Die Distanz zwischen den Koordinaten beträgt: (818.168717734293, 0.0012222417923399) Meter


In [31]:
arc_data = []
unknown_places = []
for idx, row in df.iterrows():
    places: List = row["places"]
    if row["latitude"] > 52 or row["latitude"] < 50:
      continue
    for place in places:
      if place in clean_places.keys():
        d, rd = distance(row["latitude"], row["longitude"], clean_places[place]["lat"], clean_places[place]["lon"])
        arc_data.append({
                "name": place,
                "from_lat": row["latitude"],
                "from_lon": row["longitude"],
                "to_lat": clean_places[place]["lat"],
                "to_lon": clean_places[place]["lon"],
                "free_from": row["free_from"],
                "distance": d,
                "reverse_distance": rd
        })
      else:
        for key, value in my_dict.items():
          if place in value:
            d, rd = distance(row["latitude"], row["longitude"], clean_places[key]["lat"], clean_places[key]["lon"])
            arc_data.append({
                "name": key,
                "from_lat": row["latitude"],
                "from_lon": row["longitude"],
                "to_lat": clean_places[key]["lat"],
                "to_lon": clean_places[key]["lon"],
                "free_from": row["free_from"], 
                "distance": d,
                "reverse_distance": rd
            })
          elif any([map_similarity(place, v) for v in value]):
            d, rd = distance(row["latitude"], row["longitude"], clean_places[key]["lat"], clean_places[key]["lon"])
            arc_data.append({
                "name": key,
                "from_lat": row["latitude"],
                "from_lon": row["longitude"],
                "to_lat": clean_places[key]["lat"],
                "to_lon": clean_places[key]["lon"],
                "free_from": row["free_from"],
                "distance": d,
                "reverse_distance": rd
            })
          else:
            unknown_places.append(place)
unknown_places = set(unknown_places)
print(len(unknown_places), unknown_places)
arc_df = pd.DataFrame(data=arc_data)
arc_df

1385 {'29J', 'Kolmstrasse', 'Paris', 'Linie 9', 'Argentinien', 'Gretha', 'HInterhaus', 'Tageslicht-Wannenbad', 'Schlittschuh Laufen', 'Draußen', 'Located directly on Waldstraße', 'Ärzte', 'Berlin', 'Minutenan', 'Wanda', 'Albertina', 'Niedersächsischen Heide', 'Sharepic-Sachen', 'Leipzig for two months', 'Dresdner Str.', 'HBF', 'Straßenbahn/', 'only two minutes walk to the tram', 'Pia', 'Griechenland', 'Moritz-Hof', 'Südwest-Ausrichtung', 'MDR Turm', 'Bad putzen', 'Herrmann-Liebmann', 'a great neighborhood', 'RedBull Arena', 'läufst du  5', 'Freindenspark', 'A cupboard', 'Credestraße', 'Leipzig Lößnig', 'Wurzner Straße', 'Holsteinstraße', 'Asien', 'Potsdam', 'Balfolk', 'Ghent', 'Latzo', 'Goerdlerring', 'Frieda', 'Lützner-Straße', 'Karlotta', 'Bremen', 'Kaos-Villa', 'Lindenaus', 'Lehnstühle', 'Buslinien 72', 'Dm', 'Ilyass', 'Herd', 'Leipzig lindenau', 'groceries Aldi', 'Stadthafen', 'Ramdohrscher Park', 'ÖPVN', 'There is a large green area right', 'the house is friendly and quiet', 'suuu

Unnamed: 0,name,from_lat,from_lon,to_lat,to_lon,free_from,distance,reverse_distance
0,Plagwitz,51.326810,12.325284,51.328199,12.338300,2023-09-01 00:00:32,917.464884,0.001090
1,Gohlis,51.364652,12.360325,51.367060,12.376673,2023-07-01 00:00:32,1166.113810,0.000858
2,Karl-Heine-Straße,51.325779,12.327418,51.332158,12.348740,2023-08-01 00:00:32,1642.531082,0.000609
3,Clara-Zetkin-Park,51.321475,12.339788,51.330505,12.358851,2023-09-01 00:00:32,1662.168035,0.000602
4,Küchenholz,51.321475,12.339788,51.304778,12.337883,2023-09-01 00:00:32,1861.380804,0.000537
...,...,...,...,...,...,...,...,...
1272,Clara-Park,51.322204,12.367438,51.330505,12.358851,2023-09-01 00:00:32,1099.086932,0.000910
1273,Kolonnadenviertel,51.337929,12.364272,51.337856,12.365410,2023-08-01 00:00:32,79.444193,0.012587
1274,Johannapark,51.337929,12.364272,51.334636,12.363067,2023-08-01 00:00:32,375.602955,0.002662
1275,Rosental,51.345426,12.369663,51.349993,12.363226,2023-07-01 00:00:32,676.553433,0.001478


## Gender Sprache analysieren

In [32]:
def extract_gender_usage(text: str) -> Tuple[int, List[str], int, List[str], int, List[str]]:
    # Apply spaCy NLP pipeline to the text
    doc = nlp(text)

    # Extract male pronouns
    male_pronouns = [
        token.text for token in doc
        if token.pos_ == "PRON"
        and token.text.lower() in ["er", "ihn", "ihm", "sein", "seiner", "seinen", "seines", "dessen", "derjenige", "derjenige, der", "der", "seinerseits"]
   ]

    # Extract female pronouns
    female_pronouns = [
        token.text for token in doc
        if token.pos_ == "PRON"
        and token.text.lower() in ["sie", "ihr", "sie", "ihre", "ihrer", "ihren", "ihres", "deren", "diejenige", "diejenige, die", "die", "ihrerseits"]
    ]

    # Extract neutral pronouns and gender-neutral terms   token.text.lower().endswith("ees") "Menschis"
    neutral_pronouns = [
        token.text for token in doc
        if token.text.lower() in ["studierende", "teilnehmende", "lernende", "lehrende", "prüfende", "mitarbeitende", "beschäftigte", "studieninteressierte", "forschende", "absolventinnen und absolventen", "referierende", "promovierende", "suchende", "interessierte", "anfragende", "mieterinnen und mieter", "vermietende", "vertragspartner", "wohnungssuchende", "besichtigende", "maklerinnen und makler", "wohnungsanbieter", "umziehende", "wohnraumsuchende"]
    ]

    # Return the counts and lists of pronouns
    return len(male_pronouns), male_pronouns, len(female_pronouns), female_pronouns, len(neutral_pronouns), neutral_pronouns

df[["male_pronouns", "male_pronouns_",
    "female_pronouns", "female_pronouns_",
    "neutral_pronouns", "neutral_pronouns_"]
   ] = df["full_description"].apply(extract_gender_usage).apply(pd.Series)



In [33]:
def add_feature_count_coordinates(row:pd.Series, feature:str, idx:int) -> List[dict]:
    tmp_list = []
    for _ in range(row[feature]):
        tmp_list.append({
            "index": idx,
            "pronouns": feature,
            "lat": row["latitude"],
            "lon": row["longitude"],
            "free_from": row["free_from"]
            })
    return tmp_list

In [34]:
gender_data = []

for idx, row in df.iterrows():
    if row["latitude"] > 55 or row["latitude"] < 47:
        continue
    for feature in ["female_pronouns", "male_pronouns", "neutral_pronouns"]:
        row_data = add_feature_count_coordinates(row, feature, idx)
        if row_data:
            gender_data.extend(row_data)

gender_df = pd.DataFrame(data=gender_data)

In [None]:
df.iloc[[82,48,140,150,148], [30, 32, 34]]

## Clustering tf/idf

In [35]:
# Vektorisieren der Textdaten
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["full_description"])
# Anwenden des K-Means-Clustering-Algorithmus
num_clusters = 4  # Anzahl der gewünschten Cluster
kmeans = KMeans(n_clusters=num_clusters, random_state=2023)
kmeans.fit(X)

# Hinzufügen der Clusterlabels zum DataFrame
df["cluster_label"] = kmeans.labels_



# Output

In [None]:
df.info()

In [36]:
df.to_csv(f"{data_path}wg_gesucht.csv", index=False)

In [37]:
gender_df.to_csv(f"{data_path}gender_data.csv", index=False)

In [38]:
arc_df.to_csv(f"{data_path}arc_wg_gesucht.csv", index=False)

In [None]:
df[['full_description']].to_excel(f"{data_path}description.xlsx", index=False)
df.to_excel(f"{data_path}wg_gesucht.xlsx", index=False)

In [None]:
# Serializing json
json_object = json.dumps(places_dict, indent=4, ensure_ascii=False)

with open(f"{data_path}places.json", "w", encoding='utf-8') as outfile:
    outfile.write(json_object)