In [1]:
import pandas as pd
import statsmodels.formula.api as smf
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from missforest.missforest import MissForest
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error

In [2]:
qog = pd.read_csv('https://www.qogdata.pol.gu.se/data/qog_std_cs_jan23.csv') 

In [3]:
qog.shape

(194, 1685)

In [4]:
qog['target_death'] = qog['jht_ccd'] / qog['wdi_pop']
qog['target_case'] = qog['jht_ccc'] / qog['wdi_pop']

In [5]:
drop_condition = (qog['jht_ccc'].isna()) | (qog['jht_ccd'].isna()) | (qog['target_case'].isna()) | (qog['target_death'].isna())
dropped_rows = qog[drop_condition]
dropped_rows

Unnamed: 0,ccode,cname,ccode_qog,cname_qog,ccodealp,ccodecow,version,aii_acc,aii_aio,aii_cilser,...,yri_fem61,yri_meanage,yri_medianage,yri_mp30,yri_mp35,yri_mp40,yri_mp4160,yri_mp61,target_death,target_case
37,158,Taiwan (Province of China),158,Taiwan,TWN,713.0,QoGStdCSjan23,,,,...,23.4,54.900002,55.0,0.9,0.6,6.2,66.400002,27.4,,
178,795,Turkmenistan,795,Turkmenistan,TKM,701.0,QoGStdCSjan23,,,,...,,,,,,,,,,


In [6]:
qog = qog.drop(qog[drop_condition].index)

In [7]:
qog.shape

(192, 1687)

In [8]:
# Only keep the non-zero states
drop_condition = (qog['jht_ccc'] == 0) | (qog['jht_ccd'] == 0) | (qog['target_case'] == 0) | (qog['target_death'] == 0)
dropped_rows = qog[drop_condition]
# Display the 'cname' column from the dropped_rows DataFrame
dropped_rows['cname']

16                                          Bhutan
22                                 Solomon Islands
28                                        Cambodia
49                                        Dominica
66                                        Kiribati
68                                         Grenada
89     Korea (the Democratic People's Republic of)
93          Lao People's Democratic Republic (the)
119                                          Nauru
122                                        Vanuatu
128               Micronesia (Federated States of)
129                               Marshall Islands
130                                          Palau
140                                    Timor-Leste
145                          Saint Kitts and Nevis
147               Saint Vincent and the Grenadines
153                                     Seychelles
173                                          Tonga
179                                         Tuvalu
191                            

In [9]:
qog = qog.drop(qog[drop_condition].index)

In [10]:
qog.shape

(172, 1687)

In [11]:
qog_comp = qog

In [12]:
# Drop out columns with any missing data 
qog_comp_cleaned = qog_comp.dropna(axis=1, how='any')

In [13]:
qog_comp_cleaned

Unnamed: 0,ccode,cname,ccode_qog,cname_qog,ccodealp,version,bmr_dem,bmr_dembr,bmr_demdur,bmr_demfsuf,...,wdi_popden,wdi_popf,wdi_popgr,wdi_poprul,wdi_popurb,wdi_popurbagr,wdi_tele,wdi_wip,target_death,target_case
0,4,Afghanistan,4,Afghanistan,AFG,QoGStdCSjan23,0,0,220,0,...,57.908253,49.455486,2.908529,74.246002,25.754000,3.919287,0.356468,27.868853,0.000058,0.001386
1,8,Albania,8,Albania,ALB,QoGStdCSjan23,1,1,23,1,...,104.167557,49.939331,-0.426007,38.771000,61.229000,1.071414,8.450550,29.508196,0.000414,0.020432
2,12,Algeria,12,Algeria,DZA,QoGStdCSjan23,0,0,58,0,...,17.930315,49.053471,1.839445,26.811001,73.189003,2.607530,10.853945,25.757576,0.000065,0.002332
3,20,Andorra,20,Andorra,AND,QoGStdCSjan23,1,0,26,1,...,162.431915,49.123035,1.757491,12.016000,87.984001,1.669358,51.945824,46.428570,0.001100,0.105432
4,24,Angola,24,Angola,AGO,QoGStdCSjan23,0,0,45,0,...,25.951382,50.598881,3.395278,33.823002,66.177002,4.402191,0.385509,30.000000,0.000013,0.000543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,858,Uruguay,858,Uruguay,URY,QoGStdCSjan23,1,2,35,1,...,19.588671,51.605919,0.039881,4.574000,95.426003,0.136357,33.991665,19.191919,0.000053,0.005577
189,860,Uzbekistan,860,Uzbekistan,UZB,QoGStdCSjan23,0,0,29,0,...,76.222832,49.979244,1.876470,49.567001,50.432999,1.787284,10.775133,16.000000,0.000018,0.002295
190,862,Venezuela (Bolivarian Republic of),862,Venezuela,VEN,QoGStdCSjan23,0,1,15,0,...,32.845852,50.414093,-2.904996,11.760000,88.239998,-2.868725,18.470837,22.155689,0.000035,0.003920
192,887,Yemen,887,Yemen,YEM,QoGStdCSjan23,0,0,30,0,...,59.750916,49.428680,2.426208,62.727001,37.272999,4.133613,3.930682,0.332226,0.000019,0.000067


In [14]:
qog_comp_cleaned.to_csv('qog_complete.csv')

In [15]:
from sklearn.linear_model import Lasso, MultiTaskLasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Data Preprocessing
# Assuming no missing values as per initial inspection
# Selecting features (excluding target variables and non-numeric columns)
features = qog_comp_cleaned.select_dtypes(include=[np.number]).drop(columns=['target_death', 'target_case', 'ccode', 'ccode_qog', 'jht_ccd', 'jht_ccc', 'wdi_pop'])

# Split data into training and testing sets for single-output models
X_train, X_test, y_train_death, y_test_death = train_test_split(features, qog_comp_cleaned['target_death'], test_size=0.3, random_state=0)
X_train, X_test, y_train_case, y_test_case = train_test_split(features, qog_comp_cleaned['target_case'], test_size=0.3, random_state=0)

# Single-output LASSO model for 'target_death'
lasso_death = Lasso(alpha=0.1).fit(X_train, y_train_death)
predictions_death = lasso_death.predict(X_test)
mse_death = mean_squared_error(y_test_death, predictions_death)
r2_d = r2_score(y_test_death, predictions_death)

# Single-output LASSO model for 'target_case'
lasso_case = Lasso(alpha=0.1).fit(X_train, y_train_case)
predictions_case = lasso_case.predict(X_test)
mse_case = mean_squared_error(y_test_case, predictions_case)
r2_c = r2_score(y_test_case, predictions_case)

# Multi-output LASSO model
Y = qog[['target_death', 'target_case']]
X_train_multi, X_test_multi, Y_train_multi, Y_test_multi = train_test_split(features, Y, test_size=0.3, random_state=0)
lasso_multi = MultiTaskLasso(alpha=0.1).fit(X_train_multi, Y_train_multi)
predictions_multi = lasso_multi.predict(X_test_multi)
mse_multi = mean_squared_error(Y_test_multi, predictions_multi, multioutput='raw_values')
r2_m = r2_score(Y_test_multi, predictions_multi)

# Display the eval for each model
print("MSE for LASSO with 'target_death':", mse_death, "R² Score:", r2_d)
print("MSE for LASSO with 'target_case':", mse_case, "R² Score:", r2_c)
print("MSE for Multi-output LASSO:", mse_multi, "R² Score:", r2_m)



MSE for LASSO with 'target_death': 1.3261363403191183e-07 R² Score: -0.013331570855458219
MSE for LASSO with 'target_case': 0.0003201809542918335 R² Score: 0.1571070629962037
MSE for Multi-output LASSO: [1.12513905e-07 3.20198224e-04] R² Score: 0.14865836351841005


Note, the negative R^2 is a strong indication that the model assumptions are violated, or that the model is overly complex, or poorly constructed for the given data. The random_state thus converted from 0 into 42. 

In [19]:
from sklearn.linear_model import Lasso, MultiTaskLasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Data Preprocessing
# Assuming no missing values as per initial inspection
# Selecting features (excluding target variables and non-numeric columns)
features = qog_comp_cleaned.select_dtypes(include=[np.number]).drop(columns=['target_death', 'target_case', 'ccode', 'ccode_qog', 'jht_ccd', 'jht_ccc', 'wdi_pop'])

# Split data into training and testing sets for single-output models
X_train, X_test, y_train_death, y_test_death = train_test_split(features, qog_comp_cleaned['target_death'], test_size=0.3, random_state=42)
X_train, X_test, y_train_case, y_test_case = train_test_split(features, qog_comp_cleaned['target_case'], test_size=0.3, random_state=42)

# Single-output LASSO model for 'target_death'
lasso_death = Lasso(alpha=0.1).fit(X_train, y_train_death)
predictions_death = lasso_death.predict(X_test)
mse_death = mean_squared_error(y_test_death, predictions_death)
r2_d = r2_score(y_test_death, predictions_death)

# Single-output LASSO model for 'target_case'
lasso_case = Lasso(alpha=0.1).fit(X_train, y_train_case)
predictions_case = lasso_case.predict(X_test)
mse_case = mean_squared_error(y_test_case, predictions_case)
r2_c = r2_score(y_test_case, predictions_case)

# Multi-output LASSO model
Y = qog[['target_death', 'target_case']]
X_train_multi, X_test_multi, Y_train_multi, Y_test_multi = train_test_split(features, Y, test_size=0.3, random_state=42)
lasso_multi = MultiTaskLasso(alpha=0.1).fit(X_train_multi, Y_train_multi)
predictions_multi = lasso_multi.predict(X_test_multi)
mse_multi = mean_squared_error(Y_test_multi, predictions_multi, multioutput='raw_values')
r2_m = r2_score(Y_test_multi, predictions_multi)

# Display the eval for each model
print("MSE for LASSO with 'target_death':", mse_death, "R² Score:", r2_d)
print("MSE for LASSO with 'target_case':", mse_case, "R² Score:", r2_c)
print("MSE for Multi-output LASSO:", mse_multi, "R² Score:", r2_m)

MSE for LASSO with 'target_death': 2.02490323635461e-07 R² Score: 0.04947993113582716
MSE for LASSO with 'target_case': 0.0003229277754894166 R² Score: 0.25651574326094007
MSE for Multi-output LASSO: [1.61436505e-07 3.22904335e-04] R² Score: 0.24938122402547463


In [33]:
import pandas as pd

# Assuming 'features' is a DataFrame containing the feature names

# Extracting non-zero coefficients for 'target_death'
non_zero_death = pd.DataFrame({
    'Feature': features.columns,
    'Coefficient': lasso_death.coef_
})
non_zero_death = non_zero_death[non_zero_death['Coefficient'] != 0]

# Extracting non-zero coefficients for 'target_case'
non_zero_case = pd.DataFrame({
    'Feature': features.columns,
    'Coefficient': lasso_case.coef_
})
non_zero_case = non_zero_case[non_zero_case['Coefficient'] != 0]

# For multi-output, we need to handle it differently as it involves multiple targets
non_zero_multi = pd.DataFrame(
    lasso_multi.coef_.T,  # Transpose to align with feature columns
    columns=['target_death_coeff', 'target_case_coeff'],
    index=features.columns
)
# Filter out rows where all coefficients are zero
non_zero_multi = non_zero_multi[(non_zero_multi != 0).any(axis=1)]

# Shared variables
shared_features = non_zero_death.merge(
    non_zero_case, on='Feature', how='inner').merge(
    non_zero_multi.reset_index()[['index']], left_on='Feature', right_on='index').drop('index', axis=1)

# 'shared_features' now contains the variables that are shared among all three regressions
print("Shared Variables among all three regressions:")
print(shared_features)


Shared Variables among all three regressions:
         Feature  Coefficient_x  Coefficient_y
0       wdi_area  -1.299060e-11  -1.388307e-09
1  wdi_migration   5.280805e-10   8.809501e-09


In [35]:
 non_zero_multi

Unnamed: 0,target_death_coeff,target_case_coeff
gendip_rec,1.557269e-06,7.070865e-05
ipu_l_s,-3.417109e-08,-5.208164e-06
wdi_acel,9.622688e-07,4.772297e-05
wdi_area,-2.226982e-11,-1.388315e-09
wdi_migration,4.063762e-10,8.808269e-09
wdi_mobile,6.413657e-07,5.572255e-05
wdi_popden,-2.615396e-08,-2.810794e-07
wdi_poprul,-2.265529e-06,-0.0001543021
wdi_tele,3.308789e-07,2.016142e-05


In [36]:
non_zero_death

Unnamed: 0,Feature,Coefficient
66,wdi_area,-1.29906e-11
70,wdi_migration,5.280805e-10


In [37]:
non_zero_case

Unnamed: 0,Feature,Coefficient
40,gendip_rec,7.069457e-05
44,ipu_l_s,-5.207249e-06
63,wdi_acel,4.76973e-05
66,wdi_area,-1.388307e-09
70,wdi_migration,8.809501e-09
71,wdi_mobile,5.574048e-05
75,wdi_popden,-2.805858e-07
78,wdi_poprul,-0.0001543145
81,wdi_tele,2.009453e-05


In [38]:
shared_features = non_zero_death.merge(
    non_zero_case, on='Feature', how='inner').merge(
    non_zero_multi.reset_index()[['index']], left_on='Feature', right_on='index').drop('index', axis=1)

In [39]:
shared_features

Unnamed: 0,Feature,Coefficient_x,Coefficient_y
0,wdi_area,-1.29906e-11,-1.388307e-09
1,wdi_migration,5.280805e-10,8.809501e-09


## RF

In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Data Preprocessing

# Setup Random Forest models
rf_death = RandomForestRegressor(n_estimators=100, random_state=0)
rf_case = RandomForestRegressor(n_estimators=100, random_state=0)
# random_state being 0 yields better model performance

# Train models
rf_death.fit(X_train, y_train_death)
rf_case.fit(X_train, y_train_case)

# Model evaluation
def evaluate_model(model, X_test, y_test, title):
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    print(f"{title} Model Performance: R² Score = {r2:.4f}, MSE = {mse:.4f}")

# Evaluate and print the R² Score and MSE for each model
evaluate_model(rf_death, X_test, y_test_death, "Random Forest with 'target_death'")
evaluate_model(rf_case, X_test, y_test_case, "Random Forest with 'target_case'")



Random Forest with 'target_death' Model Performance: R² Score = 0.5629, MSE = 0.0000
Random Forest with 'target_case' Model Performance: R² Score = 0.4706, MSE = 0.0002


In [43]:
# Extract feature importances for the 'target_death' model and store in DataFrame
importances_death = rf_death.feature_importances_
features_death_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances_death
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)

# Extract feature importances for the 'target_case' model and store in DataFrame
importances_case = rf_case.feature_importances_
features_case_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances_case
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)

# If you want to limit to top 20 features, you can do so like this:
top_n = 20
top_features_death = features_death_df.head(top_n)
top_features_case = features_case_df.head(top_n)

In [45]:
top_features_death

Unnamed: 0,Feature,Importance
0,wdi_pop65,0.147748
1,gendip_afr,0.085934
2,une_tdurused,0.048919
3,ht_region,0.046763
4,egov_hci,0.038648
5,wdi_tele,0.037482
6,egov_egov,0.031883
7,wdi_acel,0.031072
8,wdi_pop14,0.0304
9,gendip_mfrp,0.022412


In [46]:
 top_features_case

Unnamed: 0,Feature,Importance
0,egov_egov,0.130359
1,wdi_pop14,0.091729
2,wdi_area,0.058051
3,wdi_acel,0.053964
4,gendip_mar,0.052608
5,wdi_pop65,0.048736
6,ht_region,0.047712
7,egov_tii,0.039231
8,wdi_death,0.032298
9,wdi_popurb,0.023032


In [48]:
shared_features = top_features_death.merge(top_features_case, on='Feature', suffixes=('_death', '_case'))
shared_features

Unnamed: 0,Feature,Importance_death,Importance_case
0,wdi_pop65,0.147748,0.048736
1,ht_region,0.046763,0.047712
2,egov_hci,0.038648,0.01526
3,wdi_tele,0.037482,0.014503
4,egov_egov,0.031883,0.130359
5,wdi_acel,0.031072,0.053964
6,wdi_pop14,0.0304,0.091729
7,wdi_death,0.021862,0.032298
8,wdi_popden,0.012966,0.013972


## Regression 

### Regression with all varaible sorted from LASSO

In [63]:
variables = non_zero_case['Feature'].tolist()
variables

['gendip_rec',
 'ipu_l_s',
 'wdi_acel',
 'wdi_area',
 'wdi_migration',
 'wdi_mobile',
 'wdi_popden',
 'wdi_poprul',
 'wdi_tele']

In [64]:
# Initialize lists to store results
results = []

# Loop through each variable and perform regression for 'target_death'
for var in variables:
    formula = f'target_death ~ {var}'
    model = smf.ols(formula, data=qog).fit()
    
    # Extracting required statistics from the model summary
    for param in model.params.index:
        coef = model.params[param]
        std_err = model.bse[param]
        t_value = model.tvalues[param]
        p_value = model.pvalues[param]
        
        # Append results
        results.append({'Dependent Variable': 'target_death', 
                        'Independent Variable': var, 
                        'Coefficient': coef, 
                        'Std Err': std_err, 
                        't Value': t_value, 
                        'P Value': p_value})

regression_results_la_death = pd.DataFrame(results)
        
# Repeat for 'target_case'
results = []
for var in variables:
    formula = f'target_case ~ {var}'
    model = smf.ols(formula, data=qog).fit()
    
    for param in model.params.index:
        coef = model.params[param]
        std_err = model.bse[param]
        t_value = model.tvalues[param]
        p_value = model.pvalues[param]
        
        results.append({'Dependent Variable': 'target_case', 
                        'Independent Variable': var, 
                        'Coefficient': coef, 
                        'Std Err': std_err, 
                        't Value': t_value, 
                        'P Value': p_value})
regression_results_la_case = pd.DataFrame(results)

In [65]:
regression_results_la_death

Unnamed: 0,Dependent Variable,Independent Variable,Coefficient,Std Err,t Value,P Value
0,target_death,gendip_rec,-0.0001191973,7.414061e-05,-1.607719,0.1097524
1,target_death,gendip_rec,5.727152e-06,8.605581e-07,6.655161,3.732574e-10
2,target_death,ipu_l_s,0.000335787,4.461481e-05,7.526357,2.907069e-12
3,target_death,ipu_l_s,-2.413212e-08,1.300591e-07,-0.185547,0.853021
4,target_death,wdi_acel,-0.0002577616,0.0001058497,-2.435165,0.0159183
5,target_death,wdi_acel,7.009384e-06,1.204913e-06,5.817335,2.900195e-08
6,target_death,wdi_area,0.0003238458,3.670516e-05,8.822896,1.303987e-15
7,target_death,wdi_area,8.892973e-12,1.772617e-11,0.501686,0.6165375
8,target_death,wdi_migration,0.000330015,3.329398e-05,9.912153,1.452403e-18
9,target_death,wdi_migration,5.723005e-10,1.806359e-10,3.168255,0.001818615


In [66]:
regression_results_la_case

Unnamed: 0,Dependent Variable,Independent Variable,Coefficient,Std Err,t Value,P Value
0,target_case,gendip_rec,-0.002412795,0.003456852,-0.697974,0.4861467
1,target_case,gendip_rec,0.0002533327,4.012406e-05,6.313736,2.291627e-09
2,target_case,ipu_l_s,0.01947767,0.002044833,9.525307,1.6687810000000003e-17
3,target_case,ipu_l_s,-9.098862e-06,5.961007e-06,-1.526397,0.1287694
4,target_case,wdi_acel,-0.01383041,0.004720633,-2.929779,0.003858058
5,target_case,wdi_acel,0.0003730659,5.373611e-05,6.942555,7.783106e-11
6,target_case,wdi_area,0.01768122,0.00169433,10.435519,5.145864e-20
7,target_case,wdi_area,-2.719109e-10,8.1825e-10,-0.332308,0.7400662
8,target_case,wdi_migration,0.01746003,0.001549395,11.268929,2.3577880000000003e-22
9,target_case,wdi_migration,2.222711e-08,8.406217e-09,2.644128,0.008956806


In [67]:
# Initialize lists to store results
results = []

# Loop through each variable and perform regression for 'target_death'
for var in variables:
    formula = f'target_death ~ {var}'
    model = smf.ols(formula, data=qog).fit()
    
    # Extracting required statistics for the independent variable only
    coef = model.params[var]
    std_err = model.bse[var]
    t_value = model.tvalues[var]
    p_value = model.pvalues[var]
    
    # Append results
    results.append({'Dependent Variable': 'target_death', 
                    'Independent Variable': var, 
                    'Coefficient': coef, 
                    'Std Err': std_err, 
                    't Value': t_value, 
                    'P Value': p_value})

# Repeat for 'target_case'
for var in variables:
    formula = f'target_case ~ {var}'
    model = smf.ols(formula, data=qog).fit()
    
    # Extracting required statistics for the independent variable only
    coef = model.params[var]
    std_err = model.bse[var]
    t_value = model.tvalues[var]
    p_value = model.pvalues[var]
    
    results.append({'Dependent Variable': 'target_case', 
                    'Independent Variable': var, 
                    'Coefficient': coef, 
                    'Std Err': std_err, 
                    't Value': t_value, 
                    'P Value': p_value})

# Convert to DataFrame
regression_results_la = pd.DataFrame(results)

In [68]:
regression_results_la

Unnamed: 0,Dependent Variable,Independent Variable,Coefficient,Std Err,t Value,P Value
0,target_death,gendip_rec,5.727152e-06,8.605581e-07,6.655161,3.732574e-10
1,target_death,ipu_l_s,-2.413212e-08,1.300591e-07,-0.185547,0.853021
2,target_death,wdi_acel,7.009384e-06,1.204913e-06,5.817335,2.900195e-08
3,target_death,wdi_area,8.892973e-12,1.772617e-11,0.501686,0.6165375
4,target_death,wdi_migration,5.723005e-10,1.806359e-10,3.168255,0.001818615
5,target_death,wdi_mobile,2.872427e-06,9.600244e-07,2.992035,0.00318383
6,target_death,wdi_popden,-1.9484e-08,2.250523e-08,-0.865755,0.387845
7,target_death,wdi_poprul,-6.968331e-06,1.409626e-06,-4.94339,1.829316e-06
8,target_death,wdi_tele,1.06494e-05,1.865193e-06,5.709543,4.948814e-08
9,target_case,gendip_rec,0.0002533327,4.012406e-05,6.313736,2.291627e-09


In [69]:
variables = shared_features['Feature'].tolist()
variables

['wdi_pop65',
 'ht_region',
 'egov_hci',
 'wdi_tele',
 'egov_egov',
 'wdi_acel',
 'wdi_pop14',
 'wdi_death',
 'wdi_popden']

In [70]:
# Initialize lists to store results
results = []

# Loop through each variable and perform regression for 'target_death'
for var in variables:
    formula = f'target_death ~ {var}'
    model = smf.ols(formula, data=qog).fit()
    
    # Extracting required statistics from the model summary
    for param in model.params.index:
        coef = model.params[param]
        std_err = model.bse[param]
        t_value = model.tvalues[param]
        p_value = model.pvalues[param]
        
        # Append results
        results.append({'Dependent Variable': 'target_death', 
                        'Independent Variable': var, 
                        'Coefficient': coef, 
                        'Std Err': std_err, 
                        't Value': t_value, 
                        'P Value': p_value})

regression_results_rf_death = pd.DataFrame(results)
        
# Repeat for 'target_case'
results = []
for var in variables:
    formula = f'target_case ~ {var}'
    model = smf.ols(formula, data=qog).fit()
    
    for param in model.params.index:
        coef = model.params[param]
        std_err = model.bse[param]
        t_value = model.tvalues[param]
        p_value = model.pvalues[param]
        
        results.append({'Dependent Variable': 'target_case', 
                        'Independent Variable': var, 
                        'Coefficient': coef, 
                        'Std Err': std_err, 
                        't Value': t_value, 
                        'P Value': p_value})
regression_results_rf_case = pd.DataFrame(results)

In [71]:
regression_results_rf_death

Unnamed: 0,Dependent Variable,Independent Variable,Coefficient,Std Err,t Value,P Value
0,target_death,wdi_pop65,2.238127e-05,4.956575e-05,0.451547,0.6521709
1,target_death,wdi_pop65,3.299553e-05,4.2679e-06,7.731093,8.909037e-13
2,target_death,ht_region,0.0005517938,6.568487e-05,8.400622,1.694592e-14
3,target_death,ht_region,-5.453477e-05,1.401889e-05,-3.890092,0.0001435871
4,target_death,egov_hci,-0.0003677895,0.0001084262,-3.392072,0.0008628257
5,target_death,egov_hci,0.001013128,0.0001509788,6.710399,2.769139e-10
6,target_death,wdi_tele,0.0001647021,4.275648e-05,3.852095,0.0001656928
7,target_death,wdi_tele,1.06494e-05,1.865193e-06,5.709543,4.948814e-08
8,target_death,egov_egov,-0.0002418997,9.091962e-05,-2.660589,0.008547446
9,target_death,egov_egov,0.000931833,0.0001394442,6.682481,3.220702e-10


In [72]:
regression_results_rf_case

Unnamed: 0,Dependent Variable,Independent Variable,Coefficient,Std Err,t Value,P Value
0,target_case,wdi_pop65,0.002398224,0.002236,1.072489,0.2850206
1,target_case,wdi_pop65,0.001614959,0.000193,8.387488,1.833908e-14
2,target_case,ht_region,0.0281212,0.003019,9.314336,6.246774000000001e-17
3,target_case,ht_region,-0.002622619,0.000644,-4.0701,7.185366e-05
4,target_case,egov_hci,-0.01916639,0.004804,-3.989268,9.833157e-05
5,target_case,egov_hci,0.05316784,0.00669,7.947312,2.516131e-13
6,target_case,wdi_tele,0.007852016,0.001859,4.223395,3.914086e-05
7,target_case,wdi_tele,0.0006183635,8.1e-05,7.624345,1.653565e-12
8,target_case,egov_egov,-0.01383961,0.003965,-3.490383,0.0006144492
9,target_case,egov_egov,0.05098504,0.006081,8.38396,1.873236e-14


In [73]:
# Initialize lists to store results
results = []

# Loop through each variable and perform regression for 'target_death'
for var in variables:
    formula = f'target_death ~ {var}'
    model = smf.ols(formula, data=qog).fit()
    
    # Extracting required statistics for the independent variable only
    coef = model.params[var]
    std_err = model.bse[var]
    t_value = model.tvalues[var]
    p_value = model.pvalues[var]
    
    # Append results
    results.append({'Dependent Variable': 'target_death', 
                    'Independent Variable': var, 
                    'Coefficient': coef, 
                    'Std Err': std_err, 
                    't Value': t_value, 
                    'P Value': p_value})

# Repeat for 'target_case'
for var in variables:
    formula = f'target_case ~ {var}'
    model = smf.ols(formula, data=qog).fit()
    
    # Extracting required statistics for the independent variable only
    coef = model.params[var]
    std_err = model.bse[var]
    t_value = model.tvalues[var]
    p_value = model.pvalues[var]
    
    results.append({'Dependent Variable': 'target_case', 
                    'Independent Variable': var, 
                    'Coefficient': coef, 
                    'Std Err': std_err, 
                    't Value': t_value, 
                    'P Value': p_value})

# Convert to DataFrame
regression_results_rf = pd.DataFrame(results)

In [74]:
regression_results_rf

Unnamed: 0,Dependent Variable,Independent Variable,Coefficient,Std Err,t Value,P Value
0,target_death,wdi_pop65,3.299553e-05,4.2679e-06,7.731093,8.909037e-13
1,target_death,ht_region,-5.453477e-05,1.401889e-05,-3.890092,0.0001435871
2,target_death,egov_hci,0.001013128,0.0001509788,6.710399,2.769139e-10
3,target_death,wdi_tele,1.06494e-05,1.865193e-06,5.709543,4.948814e-08
4,target_death,egov_egov,0.000931833,0.0001394442,6.682481,3.220702e-10
5,target_death,wdi_acel,7.009384e-06,1.204913e-06,5.817335,2.900195e-08
6,target_death,wdi_pop14,-2.035918e-05,2.721619e-06,-7.48054,3.780248e-12
7,target_death,wdi_death,3.652414e-05,1.228034e-05,2.974197,0.003365009
8,target_death,wdi_popden,-1.9484e-08,2.250523e-08,-0.865755,0.387845
9,target_case,wdi_pop65,0.001614959,0.0001925438,8.387488,1.833908e-14


In [76]:
shared_features_larf = non_zero_case.merge(shared_features, on='Feature')

print("Shared Features between non_zero_case and shared_features:")
print(shared_features_larf[['Feature']])

Shared Features between non_zero_case and shared_features:
      Feature
0    wdi_acel
1  wdi_popden
2    wdi_tele


In [77]:
regression_results_rf.to_csv('non_imputed_rf.csv')

In [78]:
regression_results_la.to_csv('non_imputed_la.csv')