In [None]:
# import all relevant libraries --> can't use utils since it is a jupyter notebook
import streamlit as st
import pandas as pd
from pathlib import Path
import plotly.express as px
import numpy as np
from linearmodels.panel import PanelOLS, compare


In [7]:

# --- Use your established path definitions ---
CWD = Path.cwd()
ROOT = CWD.parent

IN_RAW = ROOT / "app" / "data" / "output" / "question1" # Corrected to point to question1
OUT_CLEAN = ROOT / "app" / "data" / "output"
OUT_TABLES = ROOT / "app" / "data" / "tables"
OUT_QUESTION2 = ROOT / "app" / "data" / "output" / "question2p_paneldata"

# Create directories
IN_RAW.mkdir(parents=True, exist_ok=True)
OUT_CLEAN.mkdir(parents=True, exist_ok=True)
OUT_TABLES.mkdir(parents=True, exist_ok=True)
OUT_QUESTION2.mkdir(parents=True, exist_ok=True) # For saving the new results

print(f"Project Root: {ROOT}")
print(f"Data Input Path: {IN_RAW}")
print(f"Panel Data Output Path: {OUT_QUESTION2}")


Project Root: c:\Users\dirai\Documents\GitHub\kldr-project
Data Input Path: c:\Users\dirai\Documents\GitHub\kldr-project\app\data\output\question1
Panel Data Output Path: c:\Users\dirai\Documents\GitHub\kldr-project\app\data\output\question2p_paneldata


In [9]:
df_panel = pd.read_csv(IN_RAW/"final_panel_for_regression.csv")

df_panel = df_panel.set_index(["country_code", "year"])


# Define dependent and independent variables
Y = np.log(df_panel['Exports_Digital_Service'] + 1)
X = df_panel[['internet_usage_pct', 'gdp_per_capita', 'population']]
X['log_gdp_per_capita'] = np.log(X['gdp_per_capita'] + 1)
X['log_population'] = np.log(X['population'] + 1)
X = X[['internet_usage_pct', 'log_gdp_per_capita', 'log_population']]

# Add a constant for the Pooled OLS model
from statsmodels.api import add_constant
X_pooled = add_constant(X)

print("Data prepared for Panel Data Analysis.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['log_gdp_per_capita'] = np.log(X['gdp_per_capita'] + 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['log_population'] = np.log(X['population'] + 1)


Data prepared for Panel Data Analysis.


In [11]:
# --- 1. Pooled OLS (the baseline, flawed model) ---
mod_pooled = PanelOLS(Y, X_pooled)
res_pooled = mod_pooled.fit()

# --- 2. Country Fixed Effects Model ---
# We add `entity_effects=True` to control for each country
mod_fe = PanelOLS(Y, X, entity_effects=True)
res_fe = mod_fe.fit()

# --- 3. Two-way Fixed Effects Model (Country + Time) ---
# We add `time_effects=True` to also control for global year-shocks
mod_twfe = PanelOLS(Y, X, entity_effects=True, time_effects=True)
res_twfe = mod_twfe.fit()

print("All three models have been estimated.")


All three models have been estimated.


In [15]:
# --- Create and save the comparison table ---
from linearmodels.panel import compare
results_dict = {
    "Pooled OLS": res_pooled,
    "Country FE": res_fe,
    "Two-way FE": res_twfe
}
comparison_table = compare(results_dict)

# 1. Convert the comparison table object to a string
#    The str() function will render it as a formatted text table.
table_as_string = str(comparison_table)

# 2. Wrap the string in HTML <pre> tags.
#    The <pre> (preformatted text) tag is essential. It tells the browser
#    to respect all the spaces and line breaks, keeping the table's columns aligned.
html_output = f"<pre>{table_as_string}</pre>"

# 3. Define the output path
comparison_html_path = OUT_QUESTION2 / "panel_models_comparison.html"

# 4. Write the new HTML string to the file.
with open(comparison_html_path, "w") as f:
    f.write(html_output)

print(f"\nComparison table saved to: {comparison_html_path}")

# Display the original table object in the notebook to check it
comparison_table


Comparison table saved to: c:\Users\dirai\Documents\GitHub\kldr-project\app\data\output\question2p_paneldata\panel_models_comparison.html


0,1,2,3
,Pooled OLS,Country FE,Two-way FE
Dep. Variable,Exports_Digital_Service,Exports_Digital_Service,Exports_Digital_Service
Estimator,PanelOLS,PanelOLS,PanelOLS
No. Observations,2367,2367,2367
Cov. Est.,Unadjusted,Unadjusted,Unadjusted
R-squared,0.6306,0.0236,0.0061
R-Squared (Within),-0.0006,0.0236,-0.0104
R-Squared (Between),0.6756,-0.1336,-1.3808
R-Squared (Overall),0.6306,-0.1351,-1.3840
F-statistic,1344.7,17.507,4.4337
