In [1]:
from semopy import Model, report
from semopy import semplot
import pandas as pd

from tools.preprocessing_data import encode_data_to_numeric, get_data_since_date, load_data, min_max_scale_data, fill_nan_individually
from tools.add_external_data import add_external_data

In [25]:
model = """
    # Latent Variables:
    Information_Awareness =~ F5aA1_1 + F5aA2_1 + F5aA3_1 + F5bA1_1 + F5bA2_1 + F5bA3_1 + F5bA4_1 + F5bA5_1 + F5A10_1 + F5A11_1 + F5A12_1 + F5A13_1 + F5A14_1

    Investment_Behavior =~ F3A21_1 + F5A10_2
    
    Energy_Crisis_Sentiment =~ F1A13_1 + F1A14_1
    
    Economical_Indices =~ inflation_rate + interest_rate + dax_points + MSCI_world

    Ukraine_Sentiment =~ F2A14 + F2A6

    Corona_Sentiment =~ F3A16_1 + F3A17_1
        
    # Regression:
    Investment_Behavior ~ Information_Awareness + Energy_Crisis_Sentiment + Ukraine_Sentiment + Corona_Sentiment
    Economical_Indices ~ Energy_Crisis_Sentiment + Ukraine_Sentiment + Corona_Sentiment
   
    # Correlations:
    inflation_rate ~~ dax_points
    inflation_rate ~~ interest_rate
    dax_points ~~ interest_rate
    MSCI_world ~~ dax_points
    MSCI_world ~~ interest_rate
    MSCI_world ~~ inflation_rate
    F3A21_1 ~~ F5A10_2
    F5bA4_1 ~~ F5aA1_1
    F5bA2_1 ~~ F5aA3_1
    F5bA3_1 ~~ F5bA2_1
    F5A11_1 ~~ F5A10_1
    F5A10_1 ~~ F5A13_1
    F5A13_1 ~~ F5A11_1
    F3A17_1 ~~ F3A16_1
    F1A13_1 ~~ F1A14_1
    
"""

data = load_data()
data = add_external_data(data)
data = get_data_since_date(data, '2023-04-05')

cutoff_date = pd.Timestamp('2024-04-06')

# Current year
current_year = 2024

# Update F7cA1 values where I_START is on or after the cutoff date
data.loc[data['i_START'] >= cutoff_date, 'F7cA1'] = (current_year - data.loc[data['i_START'] >= cutoff_date, 'F7cA1'])

# Calculate quantiles for splitting
q1, q2 = data['F7cA1'].quantile([1/3, 2/3])

# Assign groups based on the quantiles
data['Group'] = pd.cut(
    data['F7cA1'], 
    bins=[-float('inf'), q1, q2, float('inf')], 
    labels=['1', '2', '3']
)

data = encode_data_to_numeric(data)

relevant_columns = ['inflation_rate', 'interest_rate', 'dax_points', 'MSCI_world', 'F3A21_1', 'F5A10_2', 'F5aA1_1', 'F5aA2_1', 'F5aA3_1', 'F5bA1_1', 'F5bA2_1', 'F5bA3_1', 'F5bA4_1', 'F5bA5_1', 'F5A10_1', 'F5A11_1', 'F5A12_1', 'F5A13_1', 'F5A14_1', 'F3A16_1', 'F3A17_1', 'F2A6', 'F2A14', 'F1A13_1', 'F1A14_1', 'Group']

data = data[relevant_columns]
data['inflation_rate'] = -data['inflation_rate']
data['F1A14_1'] = -data['F1A14_1']

data = fill_nan_individually(data)

i = 1

ages = sorted(data['Group'].unique())

for age in ages:
    # Filter the data for the current income class
    class_data = data[data['Group'] == age]
    mod = Model(model)
    class_data = min_max_scale_data(class_data)

    class_data.drop('Group', axis=1, inplace=True)

    r=mod.fit(class_data)
    
    if i == 1:
        param = mod.inspect()[['lval', 'op', 'rval', 'Estimate']]
        df = pd.DataFrame(param[param['op'] != '~~'])
        df.rename(columns={'Estimate': 'Estimate_' + str(i)}, inplace=True)
    else:
        new = mod.inspect()[['Estimate', 'op']]
        new_df = pd.DataFrame(new[new['op'] != '~~'])
        df['Estimate' + str(i)] = new_df['Estimate']

    g = semplot(mod, "../results/sem_age/model_" + str(i) + ".png")
    report(mod, "../results/sem_age/model_" + str(i) + "_report")
    i = i + 1

df.to_csv('/Users/inagege/Documents/00_Uni/SeminarSocialSentimentInTimesOfCrisis/results/sem_age/all_estimates', index=False)
print(df.head())

  return pd.read_csv("../Data/data_sample_700_SOSEC_dataset_germany.csv")
[*********************100%***********************]  1 of 1 completed


                  lval op                     rval  Estimate_1  Estimate2  \
0  Investment_Behavior  ~    Information_Awareness   -0.258604   0.471526   
1  Investment_Behavior  ~  Energy_Crisis_Sentiment   -0.109848  -0.559867   
2  Investment_Behavior  ~        Ukraine_Sentiment    0.069605  -0.247489   
3  Investment_Behavior  ~         Corona_Sentiment    1.969169  -0.825346   
4   Economical_Indices  ~  Energy_Crisis_Sentiment    0.843091   0.224386   

   Estimate3  
0   9.867430  
1  -0.358676  
2  -0.029332  
3   0.023937  
4  -0.013331  


In [6]:
model = """
    # Latent Variables:
    Information_Awareness =~ F5aA1_1 + F5aA2_1 + F5aA3_1 + F5bA1_1 + F5bA2_1 + F5bA3_1 + F5bA4_1 + F5bA5_1 + F5A10_1 + F5A11_1 + F5A12_1 + F5A13_1 + F5A14_1

    Investment_Behavior =~ F3A21_1 + F5A10_2
    
    Energy_Crisis_Sentiment =~ F1A13_1 + F1A14_1
    
    Economical_Indices =~ inflation_rate + interest_rate + dax_points + MSCI_world

    Ukraine_Sentiment =~ F2A14 + F2A6

    Corona_Sentiment =~ F3A16_1 + F3A17_1
        
    # Regression:
    Investment_Behavior ~ Information_Awareness + Energy_Crisis_Sentiment + Ukraine_Sentiment + Corona_Sentiment
    Economical_Indices ~ Energy_Crisis_Sentiment + Ukraine_Sentiment + Corona_Sentiment
   
    # Correlations:
    inflation_rate ~~ dax_points
    inflation_rate ~~ interest_rate
    dax_points ~~ interest_rate
    MSCI_world ~~ dax_points
    MSCI_world ~~ interest_rate
    MSCI_world ~~ inflation_rate
    F3A21_1 ~~ F5A10_2
    F5bA4_1 ~~ F5aA1_1
    F5bA2_1 ~~ F5aA3_1
    F5bA3_1 ~~ F5bA2_1
    F5A11_1 ~~ F5A10_1
    F5A10_1 ~~ F5A13_1
    F5A13_1 ~~ F5A11_1
    F3A17_1 ~~ F3A16_1
    F1A13_1 ~~ F1A14_1
    
"""

data = load_data()
data = add_external_data(data)
data = get_data_since_date(data, '2023-04-05')
data = encode_data_to_numeric(data)

relevant_columns = ['inflation_rate', 'interest_rate', 'dax_points', 'MSCI_world', 'F3A21_1', 'F5A10_2', 'F5aA1_1', 'F5aA2_1', 'F5aA3_1', 'F5bA1_1', 'F5bA2_1', 'F5bA3_1', 'F5bA4_1', 'F5bA5_1', 'F5A10_1', 'F5A11_1', 'F5A12_1', 'F5A13_1', 'F5A14_1', 'F3A16_1', 'F3A17_1', 'F2A6', 'F2A14', 'F1A13_1', 'F1A14_1', 'state']

data = data[relevant_columns]
data['inflation_rate'] = -data['inflation_rate']
data['F1A14_1'] = -data['F1A14_1']

data = load_data()
data = add_external_data(data)
data = get_data_since_date(data, '2023-04-05')

data = fill_nan_individually(data)

states_new = ['Sachsen-Anhalt', 'Thüringen', 'Sachsen', 'Mecklenburg-Vorpommern', 'Brandenburg']

states_old = ['Bayern', 'Baden-Württemberg', 'Nordrhein-Westfalen', 'Niedersachsen', 'Schleswig-Holstein', 'Berlin',
              'Rheinland-Pfalz', 'Bremen', 'Hessen', 'Saarland', 'Hamburg']

data['old_new'] = data['state'].apply(lambda x: 'new' if x in states_new else 'old' if x in states_old else 'unknown')

data = encode_data_to_numeric(data)

relevant_columns = ['inflation_rate', 'interest_rate', 'dax_points', 'MSCI_world', 'F3A21_1', 'F5A10_2', 'F5aA1_1',
                    'F5aA2_1', 'F5aA3_1', 'F5bA1_1', 'F5bA2_1', 'F5bA3_1', 'F5bA4_1', 'F5bA5_1', 'F5A10_1', 'F5A11_1',
                    'F5A12_1', 'F5A13_1', 'F5A14_1', 'F3A16_1', 'F3A17_1', 'F2A6', 'F2A14', 'F1A13_1', 'F1A14_1',
                    'old_new']

data = data[relevant_columns]
data['inflation_rate'] = -data['inflation_rate']
data['F1A14_1'] = -data['F1A14_1']

states = data['old_new'].unique()

i = 1

for state in states:
    # Filter the data for the current income class
    class_data = data[data['old_new'] == state]
    class_data.drop('old_new', inplace=True, axis=1)
    mod = Model(model)
    class_data = min_max_scale_data(class_data)

    r=mod.fit(class_data)
    
    if i == 1:
        param = mod.inspect()[['lval', 'op', 'rval', 'Estimate']]
        df = pd.DataFrame(param[param['op'] != '~~'])
        df.rename(columns={'Estimate': 'Estimate_new'}, inplace=True)
    else:
        new = mod.inspect()[['Estimate', 'op']]
        new_df = pd.DataFrame(new[new['op'] != '~~'])
        df['Estimate_old'] = new_df['Estimate']

    g = semplot(mod, "../results/sem_states/model_" + state + ".png")
    report(mod, "../results/sem_states/model_" + state + "_report")
    i = i + 1

df.to_csv('/Users/inagege/Documents/00_Uni/SeminarSocialSentimentInTimesOfCrisis/results/sem_states/all_estimates', index=False)
print(df.head())

  return pd.read_csv("../Data/data_sample_700_SOSEC_dataset_germany.csv")
[*********************100%***********************]  1 of 1 completed
  return pd.read_csv("../Data/data_sample_700_SOSEC_dataset_germany.csv")
[*********************100%***********************]  1 of 1 completed
  data.fillna({col: data[col].mode()[0]}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  class_data.drop('old_new', inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  class_data.drop('old_new', inplace=True, axis=1)


                  lval op                     rval  Estimate_new  Estimate_old
0  Investment_Behavior  ~    Information_Awareness      0.296403      0.397266
1  Investment_Behavior  ~  Energy_Crisis_Sentiment     -1.385547     -0.764664
2  Investment_Behavior  ~        Ukraine_Sentiment     -0.534650     -0.278712
3  Investment_Behavior  ~         Corona_Sentiment     -1.022713     -0.131047
4   Economical_Indices  ~  Energy_Crisis_Sentiment      0.573383      0.078301


In [8]:
for state in states:
    print(data[data['old_new'] == state].shape, state)

(21097, 26) old
(3227, 26) new


In [51]:
model = """
    # Latent Variables:
    Information_Awareness =~ F5aA1_1 + F5aA2_1 + F5aA3_1 + F5bA1_1 + F5bA2_1 + F5bA3_1 + F5bA4_1 + F5bA5_1 + F5A10_1 + F5A11_1 + F5A12_1 + F5A13_1 + F5A14_1

    Investment_Behavior =~ F3A21_1 + F5A10_2
    
    Energy_Crisis_Sentiment =~ F1A13_1 + F1A14_1
    
    Economical_Indices =~ inflation_rate + interest_rate + dax_points + MSCI_world

    Ukraine_Sentiment =~ F2A14 + F2A6

    Corona_Sentiment =~ F3A16_1 + F3A17_1
        
    # Regression:
    Investment_Behavior ~ Information_Awareness + Energy_Crisis_Sentiment + Ukraine_Sentiment + Corona_Sentiment
    Economical_Indices ~ Energy_Crisis_Sentiment + Ukraine_Sentiment + Corona_Sentiment
   
    # Correlations:
    inflation_rate ~~ dax_points
    inflation_rate ~~ interest_rate
    dax_points ~~ interest_rate
    MSCI_world ~~ dax_points
    MSCI_world ~~ interest_rate
    MSCI_world ~~ inflation_rate
    F3A21_1 ~~ F5A10_2
    F5bA4_1 ~~ F5aA1_1
    F5bA2_1 ~~ F5aA3_1
    F5bA3_1 ~~ F5bA2_1
    F5A11_1 ~~ F5A10_1
    F5A10_1 ~~ F5A13_1
    F5A13_1 ~~ F5A11_1
    F3A17_1 ~~ F3A16_1
    F1A13_1 ~~ F1A14_1
    
"""

data = load_data()
data = add_external_data(data)
data = get_data_since_date(data, '2023-04-05')
data = encode_data_to_numeric(data)

relevant_columns = ['inflation_rate', 'interest_rate', 'dax_points', 'MSCI_world', 'F3A21_1', 'F5A10_2', 'F5aA1_1', 'F5aA2_1', 'F5aA3_1', 'F5bA1_1', 'F5bA2_1', 'F5bA3_1', 'F5bA4_1', 'F5bA5_1', 'F5A10_1', 'F5A11_1', 'F5A12_1', 'F5A13_1', 'F5A14_1', 'F3A16_1', 'F3A17_1', 'F2A6', 'F2A14', 'F1A13_1', 'F1A14_1', 'einkommen']

data = data[relevant_columns]
data['inflation_rate'] = -data['inflation_rate']
data['F1A14_1'] = -data['F1A14_1']

data = fill_nan_individually(data)

income_classes = sorted(data['einkommen'].unique())

# Define custom group boundaries
group1 = income_classes[:4]   
group2 = income_classes[4:7]   
group3 = income_classes[7:]

# Create a mapping for each income class to its group
new_income_classes = {}
for income in group1:
    new_income_classes[income] = 1
for income in group2:
    new_income_classes[income] = 2
for income in group3:
    new_income_classes[income] = 3
    
data['einkommen'] = data['einkommen'].map(new_income_classes)

income_classes = sorted(data['einkommen'].unique())

print(income_classes)

i = 1

for income_class in income_classes:
    # Filter the data for the current income class
    class_data = data[data['einkommen'] == income_class]
    mod = Model(model)
    class_data = min_max_scale_data(class_data)

    r=mod.fit(class_data)
    
    if i == 1:
        param = mod.inspect()[['lval', 'op', 'rval', 'Estimate']]
        df = pd.DataFrame(param[param['op'] != '~~'])
        df.rename(columns={'Estimate': 'Estimate1'}, inplace=True)
    else:
        new = mod.inspect()[['Estimate', 'op']]
        new_df = pd.DataFrame(new[new['op'] != '~~'])
        df['Estimate' + str(i)] = new_df['Estimate']

    g = semplot(mod, "../results/sem_income/model" + str(i) + ".png")
    report(mod, "../results/sem_income/model" + str(i) + "_report")
    i = i + 1

df.to_csv('/Users/inagege/Documents/00_Uni/SeminarSocialSentimentInTimesOfCrisis/results/sem_income/all_estimates', index=False)
print(df.head())

  return pd.read_csv("../Data/data_sample_700_SOSEC_dataset_germany.csv")
[*********************100%***********************]  1 of 1 completed


[np.int64(1), np.int64(2), np.int64(3)]




                  lval op                     rval  Estimate1  Estimate2  \
0  Investment_Behavior  ~    Information_Awareness   0.745242   0.463720   
1  Investment_Behavior  ~  Energy_Crisis_Sentiment  -0.448005  -0.020900   
2  Investment_Behavior  ~        Ukraine_Sentiment  -0.178904  -0.066613   
3  Investment_Behavior  ~         Corona_Sentiment  -0.993970  -1.385248   
4   Economical_Indices  ~  Energy_Crisis_Sentiment   0.702140  -0.012221   

   Estimate3  
0   2.044937  
1  -2.589970  
2  -1.498866  
3  -9.225078  
4   0.721632  
