In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from IPython.display import display

In [2]:
# Load the user table from a Parquet file; the path is relative to the
# directory the notebook is executed from.
df_user_raw = pd.read_parquet('data/user_filtered.parquet')

In [3]:
# Work on a copy so that later column assignments on df_user do not alter
# the frame that was loaded into df_user_raw.
df_user = df_user_raw.copy()

In [8]:
# Display the first 25 rows of the working copy.
df_user.head(25)

Unnamed: 0,user_id,account_created_date,first_active_timestamp,first_booking_date,user_gender,user_age,signup_platform,signup_process,user_language,marketing_channel,marketing_provider,first_tracked_affiliate,signup_application,first_device,first_web_browser,destination_country,first_active_date
0,gxn3p5htnn,2010-06-28,2009-03-19 04:32:55,NaT,,,affiliate,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF,2009-03-19
1,820tgsjxq7,2011-05-25,2009-05-23 17:48:09,NaT,female,38.0,affiliate,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF,2009-05-23
2,bjjt8pjhuk,2011-12-05,2009-10-31 06:01:29,2012-09-08,male,42.0,affiliate,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other,2009-10-31
3,osr2jwljor,2010-01-01,2010-01-01 21:56:19,2010-01-02,,,web,0,en,other,other,omg,Web,Mac Desktop,Chrome,US,2010-01-01
4,lsw9q7uk0j,2010-01-02,2010-01-02 01:25:58,2010-01-05,male,46.0,web,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US,2010-01-02
5,0d01nltbrs,2010-01-03,2010-01-03 19:19:05,2010-01-13,male,47.0,web,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US,2010-01-03
6,a1vcnhxeij,2010-01-04,2010-01-04 00:42:11,2010-07-29,male,50.0,web,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US,2010-01-04
7,6uh8zyj2gn,2010-01-04,2010-01-04 02:37:58,2010-01-04,,46.0,web,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,US,2010-01-04
8,yuuqmid2rp,2010-01-04,2010-01-04 19:42:51,2010-01-06,male,36.0,web,0,en,other,craigslist,untracked,Web,Mac Desktop,Firefox,US,2010-01-04
9,om1ss59ys8,2010-01-05,2010-01-05 05:18:12,NaT,male,47.0,web,0,en,other,craigslist,untracked,Web,iPhone,,NDF,2010-01-05


In [9]:
# Browsers that occur fewer than this many times are folded into one bucket.
RARE_BROWSER_MIN_COUNT = 500
# Label of the bucket that replaces the rare browsers.
RARE_BROWSER_LABEL = 'Other'

# Count how often each value occurs in 'first_web_browser'
# (value_counts() does not count NaN by default).
browser_counts = df_user['first_web_browser'].value_counts()

# Collect the values whose frequency is below the threshold
rare_browsers = browser_counts[browser_counts < RARE_BROWSER_MIN_COUNT].index.tolist()

# Replace those values with the bucket label; every other entry stays as it is
df_user['first_web_browser'] = df_user['first_web_browser'].replace(rare_browsers, RARE_BROWSER_LABEL)

In [10]:
# Write a per-column summary of value frequencies to a text file

# Columns to summarise
columns_to_analyze = ['user_gender', 'signup_platform', 'signup_process', 'user_language',
                      'marketing_channel', 'marketing_provider', 'first_tracked_affiliate',
                      'signup_application', 'first_device', 'first_web_browser', 'destination_country']

# Report header
output_lines = [
    "Zusammenfassung der eindeutigen Werte pro Spalte in df_user_filtered",
    "=" * 60,
    "",
]

# Every percentage uses the full row count as denominator. It is the same for
# all columns, so it is computed once instead of once per column.
total_count = len(df_user)

for column in columns_to_analyze:
    # Columns that are not present in df_user are skipped without a message.
    if column not in df_user.columns:
        continue

    # value_counts() leaves NaN out by default: missing values get no line of
    # their own, and because the denominator still includes those rows the
    # listed percentages of a column with gaps add up to less than 100 %.
    value_counts = df_user[column].value_counts()

    output_lines.append(f"Spalte: {column}")
    output_lines.append(f"Anzahl eindeutiger Werte: {len(value_counts)}")
    output_lines.append("Wert - Anzahl - Prozent")

    # One line per distinct value: value, absolute count, share of all rows
    output_lines.extend(
        f"'{value}' - {count} - {(count / total_count) * 100:.2f}%"
        for value, count in value_counts.items()
    )

    # Separator between two column sections
    output_lines.extend(["", "-" * 60, ""])

# Save to file. The target folder is created first so that open() does not
# raise FileNotFoundError when 'scripts/outputs' does not exist yet.
output_file_path = 'scripts/outputs/df_user_filtered_unique_values_summary.txt'
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(output_lines))

print(f"Datei erfolgreich gespeichert: {output_file_path}")

Datei erfolgreich gespeichert: scripts/outputs/df_user_filtered_unique_values_summary.txt


In [None]:
# Data preparation: keep only rows with a known gender that is not 'other',
# and drop rows whose destination is 'NDF'
# (presumably "no destination found", i.e. no booking -- TODO confirm).
df_filtered = df_user[
    df_user['user_gender'].notna()
    & (df_user['user_gender'] != 'other')
    & (df_user['destination_country'] != 'NDF')
].copy()

# 1. Comparison: US vs. rest of the world
# Vectorised two-way split instead of calling a Python lambda once per row.
df_filtered['destination_group'] = np.where(
    df_filtered['destination_country'] == 'US', 'US', 'rest_of_the_world'
)

# Share of each destination group within every gender (sums to 100 % per gender)
grouped_us_rest = (
    df_filtered.groupby(['user_gender', 'destination_group'])
    .size()
    .reset_index(name='count')
)
total_per_gender_us_rest = grouped_us_rest.groupby('user_gender')['count'].transform('sum')
grouped_us_rest['percentage'] = (grouped_us_rest['count'] / total_per_gender_us_rest) * 100

# First plot: US vs. rest of the world
fig_us_rest, ax_us_rest = plt.subplots(figsize=(10, 6))
sns.barplot(data=grouped_us_rest, x='destination_group', y='percentage',
            hue='user_gender', palette='Set2', ax=ax_us_rest)
ax_us_rest.set_xlabel('Destination Gruppe')
ax_us_rest.set_ylabel('Prozentsatz der Buchungen (%)')
ax_us_rest.set_title('Buchungsvergleich: US vs. Rest der Welt nach Geschlecht')
ax_us_rest.legend(title='Geschlecht')
fig_us_rest.tight_layout()
plt.show()

# 2. Comparison: all countries except the US
df_non_us = df_filtered[df_filtered['destination_country'] != 'US'].copy()

# Share of each non-US destination within every gender
# (the denominator is that gender's non-US rows only)
grouped_non_us = (
    df_non_us.groupby(['user_gender', 'destination_country'])
    .size()
    .reset_index(name='count')
)
total_per_gender_non_us = grouped_non_us.groupby('user_gender')['count'].transform('sum')
grouped_non_us['percentage'] = (grouped_non_us['count'] / total_per_gender_non_us) * 100

# Bar order for the second plot: most frequent destination first (both genders
# combined), with the catch-all 'other' category always placed last.
grouped_total_non_us = (
    grouped_non_us.groupby('destination_country')['count']
    .sum()
    .reset_index(name='total_count')
)
ranked_destinations = (
    grouped_total_non_us.sort_values('total_count', ascending=False)['destination_country'].tolist()
)
destination_order_non_us = [dest for dest in ranked_destinations if dest != 'other']
if 'other' in ranked_destinations:
    destination_order_non_us.append('other')

# Second plot: countries except the US
fig_non_us, ax_non_us = plt.subplots(figsize=(12, 6))
sns.barplot(data=grouped_non_us, x='destination_country', y='percentage',
            hue='user_gender', palette='Set2', order=destination_order_non_us, ax=ax_non_us)
ax_non_us.tick_params(axis='x', labelrotation=45)
ax_non_us.set_xlabel('Zieldestination (exkl. US)')
ax_non_us.set_ylabel('Prozentsatz der Buchungen (%)')
ax_non_us.set_title('Präferenzen bei der Wahl der Destination nach Geschlecht (exkl. US)')
# Same legend title as in the first plot, so both figures are labelled alike.
ax_non_us.legend(title='Geschlecht')
fig_non_us.tight_layout()
plt.show()