In [4]:
# imports
import ijson
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor

In [5]:
# Lift pandas' display limits: show every column, every row and the full cell text
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Dataset Extraction from JSON format

In [7]:
def extract_json_data(json_event, no_of_entries):
    """
    Flatten one parsed JSON event into a dataframe.

    :param json_event: parsed JSON event that is handed to ``pd.json_normalize``
    :param no_of_entries: not used by this function; kept so the signature stays unchanged
    :return: the dataframe ``pd.json_normalize`` builds from the event
    """
    flattened_event = pd.json_normalize(json_event)
    return flattened_event

def extract_json_data_into_dataframe(json_file_path: str, no_of_entries: int) -> pd.DataFrame:
    """
    Stream the elements of a top-level JSON array into a single dataframe.

    Each array element is flattened by ``extract_json_data`` on a thread pool;
    the resulting frames are concatenated once, in file order.

    :param json_file_path: path to a JSON file whose top level is an array
    :param no_of_entries: progress interval; a message is printed every
        ``no_of_entries`` submitted events and every ``no_of_entries * 50``
        collected events
    :return: dataframe holding the rows of all per-event frames
        (an empty dataframe if the array has no elements)
    """
    # Binary mode: the parser receives raw bytes, so reading does not depend on
    # the platform's default text encoding.
    with open(json_file_path, 'rb') as f:
        json_events = ijson.items(f, 'item')
        with ThreadPoolExecutor() as executor:
            futures = []
            for i, json_event in enumerate(json_events):
                futures.append(executor.submit(extract_json_data, json_event, no_of_entries))
                if i % no_of_entries == 0:
                    print(f'Extracted {i} events from Stuttgart.')

            # Collect the frames first and concatenate once afterwards; growing a
            # dataframe with concat inside the loop copies all rows on every step.
            event_frames = []
            for i, future in enumerate(futures):
                event_frames.append(future.result())
                # Parentheses are required: `i % no_of_entries*50` parses as
                # `(i % no_of_entries) * 50`.
                if i % (no_of_entries * 50) == 0:
                    print(f'Added {i} events to dataframe.')

    # pd.concat raises on an empty list, so an empty input yields an empty frame.
    if not event_frames:
        return pd.DataFrame()
    return pd.concat(event_frames, ignore_index=True)

# Load all events; 100 is the interval of the progress messages
df = extract_json_data_into_dataframe('stuttgart_events.json', 100)
# Preview only: display.max_rows is unlimited, so printing df would dump every row
df.head()


Extracted 0 events from Stuttgart.
Extracted 100 events from Stuttgart.
Extracted 200 events from Stuttgart.
Extracted 300 events from Stuttgart.
Extracted 400 events from Stuttgart.
Extracted 500 events from Stuttgart.
Extracted 600 events from Stuttgart.
Extracted 700 events from Stuttgart.
Extracted 800 events from Stuttgart.
Extracted 900 events from Stuttgart.
Extracted 1000 events from Stuttgart.
Extracted 1100 events from Stuttgart.
Extracted 1200 events from Stuttgart.
Extracted 1300 events from Stuttgart.
Extracted 1400 events from Stuttgart.
Extracted 1500 events from Stuttgart.
Extracted 1600 events from Stuttgart.
Extracted 1700 events from Stuttgart.
Extracted 1800 events from Stuttgart.
Extracted 1900 events from Stuttgart.
Extracted 2000 events from Stuttgart.
Extracted 2100 events from Stuttgart.
Extracted 2200 events from Stuttgart.
Extracted 2300 events from Stuttgart.
Extracted 2400 events from Stuttgart.
Extracted 2500 events from Stuttgart.
Extracted 2600 events fr

: 

# Data Preprocessing 🧑‍🦯

In [None]:
def remove_events_not_in_stuttgart(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the events whose city is Stuttgart.

    The city is read from the ``eventData.location.location.address.city`` column.

    :param df: dataframe containing the events
    :return: dataframe with only the rows whose city equals 'Stuttgart'
        (the same rows as the input if every event is already in Stuttgart)
    """
    city_column = 'eventData.location.location.address.city'
    is_in_stuttgart = df[city_column].eq('Stuttgart')
    return df.loc[is_in_stuttgart]

# Rebinds `df` to the filtered frame, so later cells only see rows whose city is Stuttgart
df = remove_events_not_in_stuttgart(df)

In [None]:
# number of rows and columns currently in df
print(df.shape)

### Check for NA values

In [None]:
# count the missing (NA) values in each column
print(df.isna().sum())

In [None]:
# Heatmap of the missing-value mask: one cell per dataframe entry
fig, ax = plt.subplots(figsize=(25, 6))  # wide canvas; adjust the size as needed
sns.heatmap(df.isnull(), cmap='viridis', cbar=False, ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

Interpretation: The yellow area shows the NA values. There is a lot of missing data 😨

In [None]:
# keep only the columns in which less than 10% of the values are missing
missing_share = df.isnull().mean()
columns_to_keep = df.columns[missing_share < 0.1]
df = df[columns_to_keep]
print(df.shape)