In [None]:
import numpy as np
from numpy.core.fromnumeric import size

import pandas as pd

import plotly.express as px
import plotly.graph_objs as go

import matplotlib.pyplot as plt

In [None]:
def replace_classified_single(df: pd.Series, labeling, NaN_values):
    """Return a decoded copy of one coded survey column.

    Parameters
    ----------
    df : pd.Series
        Column of raw answer codes. It is not modified; every replacement
        is applied to a new Series.
    labeling
        Value compared with the ``ID`` column of the notebook-level
        ``decoding`` table to select the code rows of this variable.
    NaN_values
        Codes that stand for a missing answer; they are replaced by
        ``np.nan`` before decoding.

    Returns
    -------
    pd.Series
        New Series in which the missing codes are ``np.nan`` and every value
        equal to a selected code (4th column of ``decoding``) is replaced by
        the entry in the 2nd column of the same row.
    """
    # Replace missing values on a new Series so the caller's data stays intact
    # (the previous in-place replace ran before the copy was taken).
    decoded = df.replace(NaN_values, np.nan)
    codes = decoding.loc[decoding['ID'] == labeling]
    # NOTE(review): `decoding` is loaded with dtype=object, so the codes are
    # text; values in a numeric column only match if they compare equal to
    # that text - confirm that decoding actually happens for numeric columns.
    for code_row in codes.itertuples(index=False, name=None):
        # Positional access: element 3 is the raw code, element 1 its replacement.
        decoded = decoded.replace(code_row[3], code_row[1])
    return decoded

In [None]:
# Code book used to translate answer codes; every column is read as text
# (dtype=object).
decoding = pd.read_csv(
    'codes.csv',
    sep=';',
    skiprows=0,
    encoding='unicode_escape',
    dtype=object,
)

# All country files are ';'-separated and take their column names from the
# second line of the file (header=1).
country_csv_options = {'header': 1, 'sep': ';'}

df_korea = pd.read_csv('Data/Korea.csv', **country_csv_options)
df_canada = pd.read_csv('Data/Canada.csv', **country_csv_options)
df_china = pd.read_csv('Data/China.csv', **country_csv_options)
df_france = pd.read_csv('Data/France.csv', **country_csv_options)
df_india = pd.read_csv('Data/India.csv', **country_csv_options)
df_greatbritain = pd.read_csv('Data/GreatBritain.csv', **country_csv_options)
df_italy = pd.read_csv('Data/Italy.csv', **country_csv_options)
df_usa = pd.read_csv('Data/USA.csv', **country_csv_options)
df_germany = pd.read_csv('Data/Germany.csv', **country_csv_options)

In [None]:
# (frame, display name) pairs. Later cells address entries by position
# (e.g. all_dfs[0], all_dfs[4]), so the order must stay as it is.
all_dfs = list(zip(
    [df_korea, df_canada, df_china, df_france, df_germany,
     df_india, df_greatbritain, df_italy, df_usa],
    ['Korea', 'Canada', 'China', 'France', 'Germany',
     'India', 'Great Britain', 'Italy', 'USA'],
))

# Chapter 3 - Univariate Statistics {.unnumbered}

## 3.1 Frequencies {.unnumbered}

In [None]:
# Sexual-orientation distribution for Canada.
# Copy the column first: the replacements below must never reach df_canada,
# because later cells compare the raw codes in this column (== 1 / > 1).
df = df_canada['Sexual orientation'].copy()
# 999, -99 and 1 are blanked so that only the remaining codes are counted.
df = df.replace([999, -99, 1], np.nan)
df = df.sort_values(ascending=False)
codes = decoding.loc[decoding['ID'] == 'v9913d_demo_sexualorientation']
for i in range(len(codes)):
    # Column 3 holds the code (stored as text, hence int()), column 1 its replacement.
    df = df.replace(int(codes.iloc[i, 3]), codes.iloc[i, 1])

In [None]:
# Frequency of each distinct value in `df`; the bare name on the last line
# displays the resulting Series.
counts = df.value_counts()
counts

In [None]:
# Interactive bar chart of the frequencies held in `counts`.
px.bar(counts)

In [None]:
# One row per country: the country name followed by the number of
# respondents in each generation.
generation_labels = [
    'Baby Boomer (1946-1964)',
    'Generation X (Baby Bust) (1965-1979)',
    'Millennials / Generation Y (1980-1994)',
    'IGen / Gen Z (1995-2012)',
]
stacked_gen = pd.DataFrame(columns=['country', *generation_labels])
Nan_values = [999, 998]
for frame, country in all_dfs:
    print(country)
    generation_codes = frame['Age (generations)'].copy()
    dec_ages = replace_classified_single(generation_codes,
                                         'v0013g_demo_generation',
                                         Nan_values)
    # Counts are placed into the columns purely by position after sorting by
    # index value.
    # NOTE(review): this needs every generation to occur in every country and
    # the sorted order to match the column order above - confirm.
    values_ages = dec_ages.value_counts().sort_index()
    stacked_gen.loc[len(stacked_gen)] = [country, *values_ages.tolist()]

In [None]:
# Stacked bars: absolute number of participants per generation and country.
stacked_gen.plot(kind='bar', x='country', stacked=True)
plt.xlabel('country')
plt.ylabel('number of participants')

# Print each country's total slightly above its bar.
for bar_position, (_, country_row) in enumerate(stacked_gen.iterrows()):
    country_total = sum(country_row[1:])  # skip the leading 'country' cell
    plt.text(x=bar_position - 0.35,
             y=country_total + 500,
             s=str(country_total),
             size=8)
plt.show()

In [None]:
# Same layout as stacked_gen, but every count is divided by the country's
# total so that each row holds fractions.
generation_columns = [
    'Baby Boomer (1946-1964)',
    'Generation X (Baby Bust) (1965-1979)',
    'Millennials / Generation Y (1980-1994)',
    'IGen / Gen Z (1995-2012)',
]
stacked_gen_norm = pd.DataFrame(columns=['country', *generation_columns])
for _, country_row in stacked_gen.iterrows():
    generation_counts = country_row[1:]  # everything after 'country'
    stacked_gen_norm.loc[len(stacked_gen_norm)] = generation_counts.div(
        generation_counts.sum())
# The rows above carry no country value; copy the names over afterwards.
stacked_gen_norm['country'] = stacked_gen['country']

In [None]:
# Stacked bars of the generation shares per country.
ax = stacked_gen_norm.plot(kind='bar', x='country', stacked=True)
ax.set_xlabel('country')
ax.set_ylabel('fraction of participants')
plt.show()

In [None]:
# Data for the stacked chart: share of each sexual orientation per country
# (codes 999, -99 and 1 are excluded before counting).
orientation_labels = ['Asexual',
                      'Bisexual',
                      'Homosexual',
                      'Pansexual',
                      'Queer',
                      'Other']
stacked_lgbtq_norm = pd.DataFrame(columns=['country', *orientation_labels])
Nan_values = [999, -99, 1]
for data in all_dfs:
    print(data[1])
    genders = data[0]['Sexual orientation'].copy()
    dec_genders = replace_classified_single(genders,
                                            'v9913d_demo_sexualorientation',
                                            Nan_values)
    values_genders = dec_genders.value_counts().sort_index()
    # Normalise BEFORE the row is stored. The previous version wrote the
    # fractions through `frame.loc[row][1:] = ...`, a chained assignment that
    # targets a temporary row object and is not guaranteed to reach the frame.
    fractions = values_genders.div(values_genders.sum())
    # NOTE(review): values are placed by position, so every orientation must
    # occur in every country and in the column order above - confirm.
    stacked_lgbtq_norm.loc[len(stacked_lgbtq_norm)] = [data[1],
                                                       *fractions.tolist()]

In [None]:
# Stacked bars of the sexual-orientation shares per country.
ax = stacked_lgbtq_norm.plot(kind='bar', x='country', stacked=True)
ax.set_xlabel('country')
ax.set_ylabel('fraction of participants')
plt.show()

In [None]:
# One row per country: the country name followed by the number of
# respondents in each 5-year age bracket.
age_bracket_labels = [
    '18 - 19 years',
    '20 - 24 years',
    '25 - 29 years',
    '30 - 34 years',
    '35 - 39 years',
    '40 - 44 years',
    '45 - 49 years',
    '50 - 54 years',
    '55 - 59 years',
    '60 - 64 years',
]
stacked_ages = pd.DataFrame(columns=['country', *age_bracket_labels])
Nan_values = [999, 998]
for frame, country in all_dfs:
    print(country)
    bracket_codes = frame['Age (5-year brackets)'].copy()
    dec_ages = replace_classified_single(bracket_codes, 'v0013e_demo_agecat', Nan_values)
    # Counts are placed into the columns purely by position after sorting by
    # index value.
    # NOTE(review): this needs every bracket to occur in every country and the
    # sorted order to match the column order above - confirm.
    values_ages = dec_ages.value_counts().sort_index()
    stacked_ages.loc[len(stacked_ages)] = [country, *values_ages.tolist()]

In [None]:
# Turn the per-bracket counts into each country's shares.
brackets = stacked_ages.columns.to_list()[1:]
bracket_counts = stacked_ages[brackets].astype(float)
stacked_ages_norm = bracket_counts.div(bracket_counts.sum(axis=1), axis=0)
# Keep 'country' as the FIRST column. Previously it was appended last, so the
# positional slice `iloc[i][1:]` dropped the first bracket and pushed the
# country name into the y-values.
stacked_ages_norm.insert(0, 'country', stacked_ages['country'])

# One line colour per country, in the row order of stacked_ages.
line_colors = ['lightblue',
               'lightcyan',
               'deeppink',
               'lightsteelblue',
               'lightslategray',
               'rebeccapurple',
               'lightpink',
               'lightseagreen',
               'lightgreen']

fig = go.Figure()
for position, (_, country_row) in enumerate(stacked_ages_norm.iterrows()):
    fig.add_trace(go.Scatter(
        x=brackets,
        # Select by bracket label so x and y always line up.
        y=country_row[brackets].tolist(),
        name=country_row['country'],
        # Colours repeat if there are ever more countries than colours.
        line=dict(color=line_colors[position % len(line_colors)], width=2),
    ))
# The y-axis shows fractions, so the title says "Share" rather than "Number".
fig.update_layout(title='Share of sample participants by 5-year age brackets in nine countries',
                  yaxis_title='fraction of participants',
                  xaxis_title='Age (5-year brackets)',
                  width=880,
                  height=600)
fig.update_yaxes(nticks=11)
fig.show()

## 3.2 Location and Dispersion Parameters {.unnumbered}

In [None]:
# Points added to `tech_consumer_personality` when a statement column holds
# code 1: innovation-minded statements add 10, cautious ones subtract 20.
tech_personality_weights = {
    # innovative options
    'I like trying out innovative products': 10,
    'I like staying up to date with new technology': 10,
    "Among my friends I'm usually the first to try out a new technology": 10,
    'Always owning the latest technology is important to me': 10,
    # traditional options
    'Products that I buy have to meet the highest standards': -20,
    'I only buy new technology when it has proven successful': -20,
}
# Codes that other cells of this notebook treat as "no answer" for
# 'Sexual orientation'. Without blanking them, 999 would satisfy `> 1` and be
# counted as an LGBTQ+ respondent.
orientation_missing_codes = [999, -99]

life_values_comparison = pd.DataFrame(columns=['country','non_lgbtq_mean','lgbtq_mean','delta'])
for frame, country in all_dfs:
    print(country)
    # Build the score in one pass instead of six copy-pasted np.where calls.
    score = pd.Series(0, index=frame.index)
    for statement, weight in tech_personality_weights.items():
        score = score + weight * (frame[statement] == 1).astype(int)
    frame['tech_consumer_personality'] = score

    orientation = frame['Sexual orientation'].replace(orientation_missing_codes,
                                                      np.nan)
    # Code 1 forms the non-LGBTQ+ group, every other valid code the LGBTQ+ group.
    lgbtq_mean = frame.loc[orientation > 1, 'tech_consumer_personality'].mean()
    non_lgbtq_mean = frame.loc[orientation == 1, 'tech_consumer_personality'].mean()
    delta = lgbtq_mean - non_lgbtq_mean
    life_values_comparison.loc[len(life_values_comparison)] = [
        country,
        round(non_lgbtq_mean, 3),
        round(lgbtq_mean, 3),
        round(delta, 3),
    ]

In [None]:
# Summary statistics of the score for the first entry of all_dfs (Korea).
all_dfs[0][0]['tech_consumer_personality'].describe()

In [None]:
# Horizontal box plot of the score for the first country in all_dfs.
first_frame, first_label = all_dfs[0]

fig, ax = plt.subplots(figsize=(9, 4))

# No tick marks on the y-axis; dotted grey guide lines along the x-axis.
ax.yaxis.set_ticks_position('none')
ax.grid(color='grey', axis='x', linestyle='dotted', linewidth=0.75, alpha=0.75)
ax.set_title('variable tech_consumer_personality')

# A single box, labelled with the country name.
ax.boxplot([first_frame['tech_consumer_personality']],
           widths=0.5,
           labels=[first_label],
           vert=False)
ax.set_xlabel('tech_consumer_personality')
plt.savefig('box_plot_singlex.svg')
plt.show()

In [None]:
korea = all_dfs[0][0]
# Blank the codes that other cells of this notebook treat as "no answer" for
# this column; otherwise 999 satisfies `> 1` and lands in the LGBTQ+ group.
orientation_korea = korea['Sexual orientation'].replace([999, -99], np.nan)
lgbtq_korea = korea[orientation_korea > 1]
non_lgbtq_korea = korea[orientation_korea == 1]

fig, ax = plt.subplots(figsize=(9, 4))
# Remove y-axis tick marks
ax.yaxis.set_ticks_position('none')
# Add major gridlines in the x-axis
ax.grid(color='grey',
        axis='x',
        linestyle='dotted',
        linewidth=0.75,
        alpha=0.75)
# Set plot title
ax.set_title('Korea tech_consumer_personality')
# One box per group, labelled with the group name
dataset = [lgbtq_korea['tech_consumer_personality'],
           non_lgbtq_korea['tech_consumer_personality']]
labels = ['LGBTQ+', 'non-LGBTQ+']
ax.boxplot(dataset,
           widths=0.6,
           labels=labels,
           vert=False)
ax.set_xlabel('tech_consumer_personality')
plt.savefig('box_plot_doublexy.svg')
plt.show()

In [None]:
# concatenate all df's
frames = [df[0].set_index('Sexual orientation') for df in all_dfs 
        if ('Sexual orientation' in df[0].columns)]
holistic = pd.concat(frames)
holistic = holistic.copy()
holistic.reset_index(inplace=True)

In [None]:
# Split the combined sample by sexual orientation. The previous version read
# from `holistic1`, a name that is never defined in this notebook.
# 999 and -99 are treated as "no answer" for this column elsewhere in the
# notebook; blank them so that 999 is not counted as LGBTQ+ by `> 1`.
orientation_all = holistic['Sexual orientation'].replace([999, -99], np.nan)
holistic_lgbtq = holistic[orientation_all > 1]
holistic_non_lgbtq = holistic[orientation_all == 1]

In [None]:
# Summary statistics of the score within the LGBTQ+ group.
holistic_lgbtq['tech_consumer_personality'].describe()

In [None]:
# Summary statistics of the score within the non-LGBTQ+ group.
holistic_non_lgbtq['tech_consumer_personality'].describe()

In [None]:
# `pyplot` is the same module that the notebook imports as `plt`, so both
# names below draw on the same current figure.
from matplotlib import pyplot

# Histogram of the score over the combined sample (default binning), also
# written to normal_x.svg.
pyplot.hist(holistic['tech_consumer_personality'].dropna())
plt.xlabel('tech_consumer_personality')
plt.ylabel('count')
plt.savefig('normal_x.svg')
pyplot.show()

In [None]:
# Levene test for equal variances of the score in the two groups; only the
# p-value is displayed.
from scipy.stats import levene
stat, p = levene(holistic_non_lgbtq['tech_consumer_personality'],
                    holistic_lgbtq['tech_consumer_personality'])
p

In [None]:
# Two-sample t-test on the score; equal_var=False selects the variant that
# does not assume equal group variances (Welch's t-test).
from scipy import stats
stats.ttest_ind(holistic_non_lgbtq['tech_consumer_personality'],
                holistic_lgbtq['tech_consumer_personality'],
                equal_var=False)

In [None]:
# Standardised mean difference between the two groups.
group1 = holistic_lgbtq['tech_consumer_personality'].dropna()
group2 = holistic_non_lgbtq['tech_consumer_personality'].dropna()
delta = group1.mean() - group2.mean()
# The difference is scaled by the standard deviation of the whole combined
# sample (np.std, i.e. ddof=0), not by a pooled group SD. The previous version
# read that sample from `holistic1`, a name never defined in this notebook.
effectsize = delta / np.std(holistic['tech_consumer_personality'].dropna())
round(effectsize, 5)

# Chapter 4 - Bivariate Statistics {.unnumbered}

## 4.1 Frequencies {.unnumbered}

In [None]:
# Bivariate frequencies: respondents associated with the LGBTQ+ community
# versus respondents with an LGBTQ+ sexual orientation, per country.
lgbtq = pd.DataFrame(columns=['country',
                              'lgbtq_associated',
                              '% of pop.',
                              'lgbtq_orientation',
                              '% of pop.',
                              'delta'])

for data in all_dfs:
    country = data[1]
    # replace() returns a new object; the previous version discarded it, so
    # the missing-value codes were still compared below (999 > 1 counted as
    # LGBTQ+ orientation, -99 < 2 counted as associated). Only the two columns
    # that are used are copied and cleaned.
    df = data[0][['LGBTQ+ community', 'Sexual orientation']].replace([999, -99],
                                                                   np.nan)
    lgbtq_associated = (df['LGBTQ+ community'] < 2)
    lgbtq_orientation = (df['Sexual orientation'] > 1)
    assoc_count = len(df[lgbtq_associated])
    # Rates are relative to all respondents, including those without an answer.
    assoc_rate = (assoc_count/len(df))*100
    orient_count = len(df[lgbtq_orientation])
    orient_rate = (orient_count/len(df))*100
    delta = assoc_count - orient_count
    sed = [country,
           assoc_count,
           round(assoc_rate,2),
           orient_count,
           round(orient_rate,2),
           delta]
    lgbtq.loc[len(lgbtq)] = sed
lgbtq

In [None]:
# Scoring-model happiness - ZIS based
# v9995_demo_incomepersonal
# v8880_demo_optimism
# Both items are recoded so that 1<->5 and 2<->4 swap (3 stays), after code
# 998 has been turned into a missing value.
reversed_scale = {5: 1, 1: 5, 4: 2, 2: 4}
happiness_items = ('Personal economic situation (detailed)',
                   'View on personal future (detailed)')
for frame, _ in all_dfs:
    for item in happiness_items:
        # Assign the result back to the frame. The previous in-place replace
        # on a selected column is a chained assignment and is not guaranteed
        # to update the frame itself.
        frame[item] = frame[item].replace([998], np.nan).replace(reversed_scale)
    # NOTE: the recoded values overwrite the raw columns, so running this cell
    # a second time swaps the scale back. Run it exactly once per kernel.
    future_score = 0.8 * frame['View on personal future (detailed)']
    monetary_score = 0.2 * frame['Personal economic situation (detailed)']
    frame['happiness_scale'] = future_score + monetary_score

In [None]:
# Rebuild the combined frame from the country frames so that it contains the
# columns added to them since it was last built (e.g. 'happiness_scale').
# Indexing by 'Sexual orientation' and resetting afterwards moves that column
# to the front and gives the result a fresh 0..n-1 index.
frames = [
    frame.set_index('Sexual orientation')
    for frame, _ in all_dfs
    if 'Sexual orientation' in frame.columns
]
holistic = pd.concat(frames).reset_index()

In [None]:
from empiricaldist import Cdf

# Empirical CDF of happiness_scale over the combined sample, drawn as a step
# function on the current axes and then labelled.
cdf_holistic = Cdf.from_seq(holistic['happiness_scale'], normalize= True)
cdf_holistic.step()
plt.title('Step graph of CDF based on happiness_scale')
plt.xlabel('happiness_scale')
plt.ylabel('CDF')
plt.show()

In [None]:
# Re-derive both groups from the CURRENT `holistic`. The group frames created
# earlier in the notebook were cut from a version of `holistic` that did not
# yet contain 'happiness_scale', so using them here raised a KeyError.
# 999 and -99 are treated as "no answer" for this column elsewhere in the
# notebook; blank them so that 999 is not counted as LGBTQ+ by `> 1`.
orientation_all = holistic['Sexual orientation'].replace([999, -99], np.nan)
holistic_lgbtq = holistic[orientation_all > 1]
holistic_non_lgbtq = holistic[orientation_all == 1]

cdf_lgbtq = Cdf.from_seq(holistic_lgbtq['happiness_scale'],
                         normalize= True)
cdf_non_lgbtq = Cdf.from_seq(holistic_non_lgbtq['happiness_scale'],
                             normalize= True)
plt.xlabel('happiness_scale')
plt.ylabel('CDF')
plt.title('CDF based on happiness_scale, separated by "Sexual orientation"')

cdf_lgbtq.plot(label='LGBTQ+')
cdf_non_lgbtq.plot(label='Non-LGBTQ+')
plt.legend()
plt.show()

## 4.2 Location and Dispersion Parameters {.unnumbered}

In [None]:
# Code-book rows of the German household-income variable.
codes = decoding.loc[decoding['ID']=='v9990b_demo_income3deu']

# Numeric code -> label, built once instead of re-parsing the code column on
# every call. setdefault keeps the first label if a code is listed twice,
# which matches a first-match lookup. Freezing the table here also makes
# lookup_class independent of later re-assignments of the global `codes`.
_income_label_by_code = {}
for _code, _label in zip(pd.to_numeric(codes['CodeSP']), codes['ValueLabel']):
    _income_label_by_code.setdefault(_code, _label)


def lookup_class(cell):
    """Translate one income code into its label.

    Parameters
    ----------
    cell
        A single raw value of the income column; it is converted with
        ``int()`` before the lookup.

    Returns
    -------
    The ``ValueLabel`` entry whose ``CodeSP`` equals ``int(cell)``. If the
    value is missing (NaN/None) or has no entry, the value is printed and
    ``0`` is returned.
    """
    # int(NaN) raises, so missing cells take the "no label" path directly.
    if pd.isna(cell):
        print(str(cell))
        return 0
    code = int(cell)
    if code not in _income_label_by_code:
        print(str(cell))
        return 0
    return _income_label_by_code[code]

In [None]:
# Sort by income code. sort_values returns a NEW frame, so from here on
# `df_germany` is a different object than the Germany frame stored in
# `all_dfs`; changes made to one are not visible in the other.
df_germany = df_germany.sort_values(by='Annual household income (in US$) - Germany')

In [None]:
# Replace every income code by the value lookup_class returns for it.
# The column is overwritten, so this cell is meant to run once per kernel.
df_germany['Annual household income (in US$) - Germany'] = df_germany['Annual household income (in US$) - Germany'].apply(lookup_class)

In [None]:
# Distinct values of the decoded income column, in order of first appearance.
df_germany['Annual household income (in US$) - Germany'].unique()

In [None]:
# Summary statistics of happiness_scale for every income class in Germany.
# describe() already skips missing values, so the two bare
# `...dropna()` statements of the previous version (whose results were
# discarded) have been removed.
values_d = df_germany['Annual household income (in US$) - Germany'].unique()
for value in values_d:
    cut = df_germany['Annual household income (in US$) - Germany']==value
    print(value)
    print(df_germany['happiness_scale'][cut].describe())

In [None]:
# One box of happiness_scale per income class (Germany).
fig = px.box(
    df_germany,
    x='Annual household income (in US$) - Germany',
    y='happiness_scale',
    title='grouped box plot of salary and happiness_scale',
    width=980,
    height=700,
)

# Same tick font size on both axes.
for update_axis in (fig.update_xaxes, fig.update_yaxes):
    update_axis(tickfont_size=14)
fig.show()

In [None]:
# Prepare the Germany frame held in all_dfs for the rank correlation.
# Looked up by name instead of by the position 4 in all_dfs.
# NOTE(review): this is the frame stored in all_dfs, not the sorted and
# decoded `df_germany` built in the cells above - confirm that is intended.
germany_frame = next(frame for frame, name in all_dfs if name == 'Germany')
income_column = 'Annual household income (in US$) - Germany'

# Assign the results back to the frame. The previous in-place replace on a
# selected column is a chained assignment and is not guaranteed to update
# the frame itself.
germany_frame[income_column] = germany_frame[income_column].replace([999],
                                                                    np.nan)
happiness_germany = germany_frame['happiness_scale'].replace([999], np.nan)
# Keep only scores within [2, 4.5]; everything outside becomes missing
# (same effect as the two consecutive np.where filters used before).
# NOTE(review): the bounds 2 and 4.5 look like an outlier cut - confirm.
germany_frame['happiness_scale'] = happiness_germany.where(
    happiness_germany.between(2, 4.5))

In [None]:
import scipy
from scipy import signal

# Spearman rank correlation between income and happiness_scale for the
# Germany entry of all_dfs; pairs with a missing value are left out.
germany_entry = all_dfs[4][0]
income_ger = germany_entry['Annual household income (in US$) - Germany']
happiness_ger = germany_entry['happiness_scale']
scipy.stats.spearmanr(income_ger, happiness_ger, nan_policy='omit')

## 4.3 Regressions {.unnumbered}

In [None]:
# Fill missing values of both variables with the respective column mean so
# that the two series used for the regression have no gaps.
happiness_mean = holistic['happiness_scale'].mean()
happiness_scale = holistic['happiness_scale'].fillna(happiness_mean)

tech_mean = holistic['tech_consumer_personality'].mean()
tech_consumer_personality = holistic['tech_consumer_personality'].fillna(tech_mean)

print(happiness_scale.describe())
print(tech_consumer_personality.describe())

In [None]:
# Scatter of happiness_scale against the tech score. The points are almost
# transparent (alpha=0.01), so only heavily populated spots become visible.
scatter_ax = plt.gca()
scatter_ax.plot(tech_consumer_personality, happiness_scale, 'o',
                markersize=3, alpha=0.01)
scatter_ax.set_xlabel('tech_consumer_personality')
scatter_ax.set_ylabel('happiness_scale')

In [None]:
# Least-squares line with happiness_scale as x and tech_consumer_personality
# as y (linregress takes x first, then y). Note that the scatter plot in this
# notebook has the two variables on the opposite axes.
from scipy.stats import linregress
res = linregress(happiness_scale, tech_consumer_personality)
res

In [None]:
# Fitted line of `res` across the observed happiness range. `res` was fitted
# with happiness_scale as x, so x is happiness and y the predicted tech score.
fx = np.array([happiness_scale.min(),
               happiness_scale.max()])
fy = res.intercept + (res.slope * fx)
plt.plot(fx, fy, '-')
# Axis labels added so the figure can be read on its own.
plt.xlabel('happiness_scale')
plt.ylabel('tech_consumer_personality (fitted)')
plt.show()

In [None]:
from scipy.stats import linregress

# Fitted line across the observed tech-score range. This needs a regression
# with the tech score as x: the previous version reused `res`, whose slope and
# intercept belong to a fit with happiness_scale as x, so the plotted line was
# wrong. A separate result name keeps `res` untouched for later cells.
res_by_tech = linregress(tech_consumer_personality, happiness_scale)
fx = np.array([tech_consumer_personality.min(),
               tech_consumer_personality.max()])
fy = res_by_tech.intercept + (res_by_tech.slope * fx)
plt.plot(fx, fy, '-')
plt.xlabel('tech_consumer_personality')
plt.ylabel('happiness_scale (fitted)')
plt.show()

In [None]:
# Summary statistics of the mean-imputed happiness_scale series.
happiness_scale.describe()

In [None]:
# Summary statistics of the mean-imputed tech_consumer_personality series.
tech_consumer_personality.describe()

In [None]:
# Pearson correlation (coefficient and p-value) between the two imputed series.
from scipy import stats
stats.pearsonr(happiness_scale, tech_consumer_personality)

In [None]:
# Coefficient of determination of the linear fit stored in `res`
# (square of its correlation coefficient).
print(f"R-squared: {res.rvalue**2:.6f}")

### _Source: Own_ {.unnumbered}