In [6]:
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython import get_ipython
from IPython.core.display import HTML
from IPython.core.interactiveshell import InteractiveShell
from publib import fix_style, set_style
from sklearn.linear_model import LinearRegression
from statsmodels.compat import lzip

# Optional Stata bridge, currently disabled.
# NOTE(review): if this is re-enabled, pass the install path as a raw string
# (r'c:\Program Files\Stata17'); '\P' is not a valid escape sequence.
# import stata_setup
# stata_setup.config('c:\Program Files\Stata17', 'mp', splash=False)
# from pystata import stata

# Select the publib style named "origin" (presumably applied to all matplotlib
# figures created afterwards -- confirm against the publib docs).
set_style(["origin"])
# plt.rcParams.update({'font.size': 10})

# Load the analysis dataset. The file is read as Parquet even though its
# extension is ".gzip" (presumably gzip-compressed Parquet -- TODO confirm).
df = pd.read_parquet("df_soep2.gzip")

In [7]:
# Convert log net income back to its actual values.
# The conversion overwrites the column in place, so running this cell a second
# time would exponentiate the already-converted values again. The flag is kept
# on the DataFrame object itself (not in a separate global), so a freshly
# loaded `df` does not carry it and gets converted exactly once.
if not df.attrs.get("gen_income_in_levels", False):
    df['gen_income'] = np.exp(df['gen_income'])
    df.attrs["gen_income_in_levels"] = True

In [8]:
from scipy.stats import mannwhitneyu


def calculate_weighted_median(data, weights=None):
    """Return the (lower) weighted median of ``data``.

    The weighted median is the smallest observed value whose cumulative
    weight, taken over the values sorted in ascending order, reaches at least
    half of the total weight. With equal weights and an even number of
    observations this is the lower of the two middle values.

    Parameters
    ----------
    data : array-like
        Numeric observations. Missing values (NaN) are dropped.
    weights : array-like, optional
        Non-negative weights, one per observation and in the same order as
        ``data``. An observation is dropped when either its value or its
        weight is missing, so values and weights always stay paired.
        Defaults to equal weights.

    Returns
    -------
    float
        The weighted median, or ``nan`` when no valid observation remains or
        the total weight is zero.

    Raises
    ------
    ValueError
        If ``weights`` has a different length than ``data`` or contains
        negative values.
    """
    values = pd.Series(data).astype(float).to_numpy()

    if weights is None:
        w = np.ones(values.shape[0])
    else:
        w = pd.Series(weights).astype(float).to_numpy()
        if w.shape[0] != values.shape[0]:
            raise ValueError(
                f"data and weights must have the same length "
                f"(got {values.shape[0]} and {w.shape[0]})"
            )

    # Drop value/weight pairs jointly; dropping NaNs from each side
    # independently would shift the weights onto the wrong observations.
    keep = ~(np.isnan(values) | np.isnan(w))
    values, w = values[keep], w[keep]

    if values.size == 0:
        return np.nan
    if (w < 0).any():
        raise ValueError("weights must be non-negative")

    order = np.argsort(values, kind="stable")
    values, w = values[order], w[order]

    cum_weights = np.cumsum(w)
    total_weight = cum_weights[-1]
    if total_weight <= 0:
        return np.nan

    # First position whose cumulative weight reaches half of the total weight.
    # (Taking the last position that is still <= the midpoint instead returns
    # the element *below* the median and fails when no such position exists,
    # e.g. for a single observation.)
    median_index = int(np.searchsorted(cum_weights, 0.5 * total_weight, side="left"))

    return values[median_index]

def plot_or_table_density(df1, df2, variables_dict, custom_labels=None, show_mean=True, show_median=True, output="plot"):
    """Compare two DataFrames variable by variable, as KDE plots or as a table.

    For every column listed in ``variables_dict`` the infinite and missing
    values are removed from both frames, a Mann-Whitney U test is run on the
    two samples, and the mean and median of each group are computed. An
    asterisk is appended to the variable's label when the test's p-value is
    below 0.05. Note that the Mann-Whitney U test compares the two
    distributions via ranks; it is not a test of equal means.

    Variables for which either group has no valid observation are skipped.
    Any error raised while processing a variable (for example a missing
    column) is printed and that variable is skipped as well.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        The two groups to compare ("Group 1" and "Group 2").
    variables_dict : dict
        Maps column name -> display label. Iteration order determines the
        row order of the table and of the plot grid.
    custom_labels : list of str, optional
        Replacement names for the five table columns (variable, group 1
        mean, group 1 median, group 2 mean, group 2 median). Ignored unless
        it has exactly five entries. Plot titles and legends always use
        "Group 1" / "Group 2".
    show_mean, show_median : bool
        Plot mode only: draw a dashed vertical line at the mean and a solid
        one at the median.
    output : str
        ``"plot"`` draws one row of two KDE panels per variable (left:
        ``df1``, right: ``df2``) and shows the figure. ``"table"`` displays
        and returns the summary table. Any other value computes the
        statistics but produces no output.

    Returns
    -------
    pandas.DataFrame or None
        The summary table when ``output == "table"``, otherwise ``None``.
    """
    columns = ['Variable', 'Group 1: Mean', 'Group 1: Median', 'Group 2: Mean', 'Group 2: Median']
    if custom_labels and len(custom_labels) == len(columns):
        columns = custom_labels

    summary_stats = []
    n = len(variables_dict)

    # plt.subplots rejects zero rows; with no variables there is nothing to draw.
    if output == "plot" and n > 0:
        # squeeze=False keeps ``axs`` two-dimensional even when n == 1, so the
        # ``axs[idx, 0]`` / ``axs[idx, 1]`` indexing below is always valid.
        fig, axs = plt.subplots(n, 2, figsize=(10, 3 * n), tight_layout=True, squeeze=False)
        colors = ["cornflowerblue", "lightcoral"]

    for idx, (var, label) in enumerate(variables_dict.items()):
        try:
            valid_df1 = df1[var].replace([np.inf, -np.inf], np.nan).dropna()
            valid_df2 = df2[var].replace([np.inf, -np.inf], np.nan).dropna()

            if valid_df1.empty or valid_df2.empty:
                continue

            # Mann-Whitney U test: mark the label with an asterisk when the
            # two samples differ significantly at the 5% level.
            _, p_value = mannwhitneyu(valid_df1, valid_df2)
            if p_value < 0.05:
                label += '*'

            mean1, median1 = np.mean(valid_df1), np.median(valid_df1)
            mean2, median2 = np.mean(valid_df2), np.median(valid_df2)

            summary_stats.append(dict(zip(columns, [label, mean1, median1, mean2, median2])))

            if output == "plot":
                for (data, ax, color, df_label, mean, median) in zip(
                        [valid_df1, valid_df2],
                        [axs[idx, 0], axs[idx, 1]],
                        colors,
                        ["Group 1", "Group 2"],
                        [mean1, mean2],
                        [median1, median2]):

                    sns.kdeplot(data, ax=ax, fill=True, color=color, label=df_label, bw_adjust=1)

                    if show_mean:
                        ax.axvline(mean, color="k", linestyle="--", label="Mean")

                    if show_median:
                        ax.axvline(median, color="k", linestyle="-", label="Median")

                    ax.set_title(f"{label} - {df_label}")
                    ax.legend()

        except Exception as e:
            # Deliberately best-effort: one bad variable must not abort the
            # whole comparison, but the failure is reported.
            print(f"An error occurred while processing the variable '{var}': {e}")
            continue

    if output == "plot":
        plt.show()
        return None

    if output == "table":
        summary_df = pd.DataFrame(summary_stats, columns=columns)
        # ``display`` is the rich-output helper injected by the IPython/Jupyter kernel.
        display(summary_df)
        return summary_df

    return None

# Column name -> display label for every variable that is compared.
# The pairs are kept in insertion order, which is the order in which
# plot_or_table_density walks through the variables.
variables_dict = dict(
    (
        # Continent columns
        ("continent_Africa", "CONT: Africa"),
        ("continent_Asia", "CONT: Asia"),
        ("continent_Europe", "CONT: Europe"),
        ("continent_North_America", "CONT: N. America"),
        ("continent_Oceania", "CONT: Oceania"),
        ("continent_South_America", "CONT: S. America"),
        # Personal characteristics
        ("age", "Age"),
        ("bad_health", "Bad Health"),
        ("sex", "Female"),
        ("satisfaction", "Life Satisfaction"),
        ("german", "German Citizenship"),
        ("gen_income", "Net Income"),
        ("num_children", "Number of Children"),
        ("refugee", "Refugee"),
        ("edu_years", "Years of Education"),
        # Migration and integration
        ("migr_age", "Age at Migration"),
        ("xenophobia", "Concern about Xenophobia"),
        ("discrimination", "Disadvantaged due to Origin"),
        ("dist_origin", "Distance to Origin Country"),
        ("feel_german", "Feeling German"),
        ("satisf_peers", "Life Satisfaction of Peers"),
        ("lang_profic", "Language Proficiency"),
        ("visit_germ", "Visited Germans in Last Year"),
        ("y_in_germany", "Years since Migration"),
        # Social contacts
        ("cl_friends", "Number of Close Friends"),
        ("lonely", "Experiencing Loneliness"),
        # Partnership status
        ("gen_wpartner", "Living with Legal Partner"),
        ("gen_seppart", "Separated from Legal Partner"),
        ("gen_single", "Single"),
        ("gen_wid_div", "Widowed or Divorced"),
        # Occupational status
        ("gen_vocation", "Student/in Training"),
        ("gen_employed", "Employed"),
        ("gen_retired", "Retired"),
        ("gen_regunempl", "Unemployed"),
        # Domain satisfaction
        ("sat_work", "Satisfaction with Work"),
        ("sat_hhinc", "Satisfaction with Income"),
        ("sat_dwell", "Satisfaction with Dwelling"),
        ("sat_leisure", "Satisfaction with Leisure"),
        ("sat_family", "Satisfaction with Family Life"),
        ("sat_sleep", "Satisfaction with Sleep"),
    )
)

# Comparison 1: Group 1 = all others (migback != 2), Group 2 = direct migrants (migback == 2)
is_direct_migrant = df["migback"] == 2

df1 = df.loc[~is_direct_migrant]
df2 = df.loc[is_direct_migrant]

table1 = plot_or_table_density(df1, df2, variables_dict, output='table')

# Comparison 2: direct migrants only, females (sex == 1) against males (sex == 0)
is_female = df['sex'] == 1
is_male = df['sex'] == 0

df1 = df.loc[is_direct_migrant & is_female]
df2 = df.loc[is_direct_migrant & is_male]

custom_labels = ['Variable', 'Females: Mean', 'Females: Median', 'Males: Mean', 'Males: Median']
table2 = plot_or_table_density(df1, df2, variables_dict, custom_labels=custom_labels, output="table")

# Export both summary tables as LaTeX, one after the other, with four decimals
latex_sources = {}
for tex_path, table in (('table1.tex', table1), ('table2.tex', table2)):
    latex_sources[tex_path] = table.to_latex(index=False, float_format="{:0.4f}".format)
    with open(tex_path, 'w') as file:
        file.write(latex_sources[tex_path])

latex_table1 = latex_sources['table1.tex']
latex_table2 = latex_sources['table2.tex']

Unnamed: 0,Variable,Group 1: Mean,Group 1: Median,Group 2: Mean,Group 2: Median
0,CONT: Africa*,0.0,0.0,0.032509,0.0
1,CONT: Asia*,0.0,0.0,0.403166,0.0
2,CONT: Europe*,1.0,1.0,0.496154,0.0
3,CONT: N. America*,0.0,0.0,0.00939,0.0
4,CONT: Oceania*,0.0,0.0,0.000684,0.0
5,CONT: S. America*,0.0,0.0,0.008052,0.0
6,Age*,47.037181,46.0,42.384825,41.0
7,Bad Health*,0.165157,0.0,0.172405,0.0
8,Female*,0.527983,1.0,0.496363,0.0
9,Life Satisfaction*,7.13424,8.0,7.201835,8.0


Unnamed: 0,Variable,Females: Mean,Females: Median,Males: Mean,Males: Median
0,CONT: Africa*,0.02592,0.0,0.039004,0.0
1,CONT: Asia*,0.376229,0.0,0.429714,0.0
2,CONT: Europe*,0.5275,1.0,0.465261,0.0
3,CONT: N. America,0.009843,0.0,0.008944,0.0
4,CONT: Oceania,0.00071,0.0,0.000657,0.0
5,CONT: S. America*,0.009292,0.0,0.006829,0.0
6,Age,42.466643,40.0,42.304188,41.0
7,Bad Health*,0.193123,0.0,0.15158,0.0
8,Female*,1.0,1.0,0.0,0.0
9,Life Satisfaction*,7.223913,8.0,7.180075,8.0


  latex_table1 = table1.to_latex(index=False, float_format="{:0.4f}".format)
  latex_table2 = table2.to_latex(index=False, float_format="{:0.4f}".format)
