<a href="https://colab.research.google.com/github/kumpaten/masters-thesis-code/blob/main/DataPreprocessing_DescriptiveStatistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount your Google Drive
# Colab-specific: `google.colab` is provided by the Colab runtime. The later
# cells read and write CSV files under /content/drive/MyDrive, so this cell
# has to run before them.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# STEP 1: Data Overview & Descriptive Statistics
#
# Loads the panel from Drive, builds a pooled descriptive-statistics table
# for the continuous variables, relabels the rows for print, then displays
# the table and writes it to CSV.
# Names defined here and reused by later cells: df, numeric_df, label_map.

import pandas as pd
from IPython.display import display

# ✅ Load dataset (semicolon-separated; `sep` matches the later read_csv calls)
file_path = '/content/drive/MyDrive/Data_to_analyze/Final_Dataset_no_NAs.csv'  # Update path if needed
df = pd.read_csv(file_path, sep=';')

# ✅ Set panel index
df['year'] = df['year'].astype(int)
df.set_index(['company', 'year'], inplace=True)

# ✅ Convert dummy variable
df['is_imputed'] = df['is_imputed'].astype('category')

# ✅ Identify numeric variables, exclude dummies.
# 'Pre_IPO' is listed as well so this cell excludes the same dummies as the
# within-firm cells (STEP 2 / STEP 4); errors='ignore' makes it a no-op when
# the column is absent or already filtered out by dtype.
exclude_vars = ['is_imputed', 'Pre_IPO']
numeric_df = (
    df.select_dtypes(include=['float64', 'int64'])
      .drop(columns=exclude_vars, errors='ignore')
)

# ✅ Multiply growth rates by 100 to express as %
# (only on numeric_df; the values stored in df stay in their original scale)
growth_vars = ['delta_ln_S5INFT-1', 'delta_ln_GDPWorld-1']
for var in growth_vars:
    if var in numeric_df.columns:
        numeric_df[var] = numeric_df[var] * 100

# ✅ Descriptive stats table: one row per variable, columns renamed by name
# rather than by position so the mapping cannot drift out of order.
stat_names = {
    'mean': 'Mean',
    'std': 'Std. Dev.',
    'min': 'Min',
    '25%': '25%',
    '50%': 'Median',
    '75%': '75%',
    'max': 'Max',
}
desc_stats = (
    numeric_df.describe().T[list(stat_names)]
    .rename(columns=stat_names)
    .round(2)
)

# ✅ Rename variables to LaTeX/print-friendly labels
# Keys are raw column names; columns without an entry keep their raw name.
label_map = {
    'brand_value': 'Brand Value (USD m)',
    'patent_claims': 'Patent Claims',
    'TSR': 'TSR (%)',
    'Tobins_Q': 'Tobin’s Q',
    'PE': 'P/E Ratio',
    'ROA-1': 'ROA (%)',
    'ln_totalAssets-1': 'ln(Total Assets)',
    'ln_totalSales-1': 'ln(Total Sales)',
    'SG&A_Intensity': 'SG&A Intensity (%)',
    'financialLeverage-1': 'Financial Leverage',
    'delta_ln_S5INFT-1': r'$\Delta$ ln(S5INFT) (%)',
    'delta_ln_GDPWorld-1': r'$\Delta$ ln(GDP World) (%)',
    'R&D_Intensity': 'R&D Intensity (%)',
    'employee_smoothed_rating': 'Employee Rating',
    'Employee Satisfaction Sentiment': 'Employee Sentiment',
    'Customer Satisfaction Sentiment': 'Customer Sentiment',
    'Innovation Sentiment': 'Innovation Sentiment',
    'Intellectual Property Sentiment': 'IP Sentiment',
    'Brand Strength Sentiment': 'Brand Sentiment'
}
desc_stats.index = [label_map.get(var, var) for var in desc_stats.index]

# ✅ Show & export (CSV is written to the current working directory)
display(desc_stats)
desc_stats.to_csv("descriptive_statistics_with_labels.csv")

In [None]:
# STEP 1.5: Data Overview & Descriptive Statistics
#
# Pooled histogram per numeric variable with a normal density overlay.
# Each figure is written as a PDF into `outdir` and shown inline.
# Requires `numeric_df` and `label_map` from STEP 1.

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm  # for the normal pdf
import os

# ——— LaTeX‐style Matplotlib setup ———
plt.rcParams.update({
    'text.usetex': False,           # mathtext instead of full TeX
    'mathtext.fontset': 'stix',
    'font.family': 'serif',
    'figure.figsize': (6.0, 4.0),   # adjust to your \textwidth
    'axes.labelsize': 10,
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 9,
    'figure.autolayout': True,
})

# Make an output folder
outdir = "histograms_for_latex"
os.makedirs(outdir, exist_ok=True)

for var in numeric_df.columns:
    vals = numeric_df[var].dropna().values

    # An all-missing column has nothing to plot, and min()/max() would raise
    # on the empty array.
    if vals.size == 0:
        print(f"Skipping {var}: no non-missing observations")
        continue

    # NumPy's std() uses ddof=0, so sigma is the population value and can
    # differ slightly from the sample std reported by DataFrame.describe().
    mu, sigma = vals.mean(), vals.std()
    label = label_map.get(var, var)

    # create figure & axes
    fig, ax = plt.subplots()
    ax.hist(vals, bins='auto', density=True,
            edgecolor='black', alpha=0.6,
            label='Empirical')

    # A constant column has sigma == 0, for which the normal pdf is undefined;
    # draw the overlay only when there is dispersion.
    if sigma > 0:
        x = np.linspace(vals.min(), vals.max(), 200)
        ax.plot(x, norm.pdf(x, loc=mu, scale=sigma), 'k--', linewidth=1.5,
                label=rf'Normal($\mu={mu:.2f},\,\sigma={sigma:.2f}$)')

    ax.set_xlabel(label)
    ax.set_ylabel('Density')
    ax.set_title(rf'Histogram of {label} (pooled)')
    ax.legend(loc='upper right')
    ax.grid(axis='y', alpha=0.3)

    # 1) Save vector graphics first, so the file never depends on what the
    #    active backend does to the figure when it is shown.
    # NOTE(review): only spaces and '/' are sanitised; a name such as
    # 'SG&A_Intensity' keeps its '&' in the file name — confirm the LaTeX
    # \includegraphics paths cope with that.
    safe_var = var.replace(" ", "_").replace("/", "")
    pdf_path = os.path.join(outdir, f"hist_{safe_var}.pdf")
    fig.savefig(pdf_path, format='pdf', dpi=300, bbox_inches='tight')

    # 2) Display inline, then release the figure to keep memory flat
    plt.show()
    plt.close(fig)


In [None]:
# STEP 2: Data Overview & Descriptive Statistics within firm
#
# For every numeric variable: skewness and kurtosis are computed separately
# per company, then averaged across companies.
# Requires `df`, `label_map` and `display` from STEP 1.

from scipy.stats import skew, kurtosis

# ✅ Numeric columns only, minus the dummy variables
numeric_df = (
    df.select_dtypes(include=['float64', 'int64'])
      .drop(columns=['is_imputed', 'Pre_IPO'], errors='ignore')
)

# ✅ Per-company moments; after the transpose rows are variables and
#    columns are companies
by_company = numeric_df.groupby('company')
within_skew = by_company.apply(lambda firm: firm.skew()).T
within_kurt = by_company.apply(lambda firm: firm.kurt()).T

# ✅ Average each variable's per-company statistic across companies
mean_skew = within_skew.mean(axis=1).round(2)
mean_kurt = within_kurt.mean(axis=1).round(2)
within_dist_stats = pd.concat(
    {
        "Mean Skewness (Within Firm)": mean_skew,
        "Mean Kurtosis (Within Firm)": mean_kurt,
    },
    axis=1,
)

# ✅ Swap raw column names for print labels (unmapped names are kept)
within_dist_stats.index = within_dist_stats.index.map(
    lambda name: label_map.get(name, name)
)

# ✅ Most right-skewed variables first, then display
within_dist_stats = within_dist_stats.sort_values(
    by="Mean Skewness (Within Firm)",
    ascending=False,
)
display(within_dist_stats)

# ✅ Export
within_dist_stats.to_csv("within_firm_skew_kurtosis_labeled.csv")


In [None]:
# STEP 3: Transformation of variables
#
# 1) Collapses the five sentiment scores into their first principal component
#    and applies a Yeo–Johnson transform to it.
# 2) Log-transforms PE, TSR, Tobins_Q and financialLeverage-1 in place.
# 3) Writes the result to a separate CSV that STEP 4 reads back.

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import PowerTransformer

# --- Reload original panel data (or continue from your existing df) ---
file_path = '/content/drive/MyDrive/Data_to_analyze/Final_Dataset_no_NAs.csv'
df = pd.read_csv(file_path, sep=';')
df['year'] = df['year'].astype(int)
df.set_index(['company','year'], inplace=True)


# --- 1) First principal component of the sentiment variables ---
# (the output column keeps the historical name 'Sentiment_PCR')
sent_cols = [
    'Employee Satisfaction Sentiment',
    'Customer Satisfaction Sentiment',
    'Innovation Sentiment',
    'Intellectual Property Sentiment',
    'Brand Strength Sentiment'
]

# Extract complete rows and standardize so every score enters the PCA with
# unit variance
sent = df[sent_cols].dropna()
scaler = StandardScaler()
sent_std = scaler.fit_transform(sent)

# Fit PCA and get first component
pca = PCA(n_components=1)
sent_pc1 = pca.fit_transform(sent_std).ravel()

# Report how much the single component captures and how it loads.
# NOTE(review): the sign of a principal component is arbitrary — check the
# loadings below to confirm that a higher index means more positive sentiment.
print(f"Sentiment PC1 explains {pca.explained_variance_ratio_[0]:.1%} of the variance")
print("PC1 loadings:", dict(zip(sent_cols, pca.components_[0].round(3))))

# Create a new column and drop the originals; rows that had a missing
# sentiment score are not in sent.index and therefore stay NaN
df.loc[sent.index, 'Sentiment_PCR'] = sent_pc1
df.drop(columns=sent_cols, inplace=True)

# ——— Yeo–Johnson transform for the sentiment index ———
pt = PowerTransformer(method='yeo-johnson')

# Fit & apply to the existing Sentiment_PCR column
df['YJ(Sentiment_PCR)'] = pt.fit_transform(df[['Sentiment_PCR']])

# (Optional) drop the raw component if you no longer need it
df.drop(columns=['Sentiment_PCR'], inplace=True)

# --- 2) Log-transform P/E, TSR, Tobin’s Q etc. in place ---
log_cols = ['PE', 'TSR', 'Tobins_Q', 'financialLeverage-1']
for col in log_cols:
    # The log is undefined for zero AND for negative values. Mask both
    # explicitly (result is NaN, as before) and say how many observations are
    # lost, instead of relying on np.log's RuntimeWarning.
    positive = df[col] > 0
    n_lost = int((df[col].notna() & ~positive).sum())
    if n_lost:
        print(f"{col}: {n_lost} non-positive value(s) set to NaN before taking logs")
    df[col] = np.log(df[col].where(positive))

# Rename the logged columns so their names reflect the transformation
df.rename(columns={
    'PE': 'ln(PE)',
    'TSR': 'ln(TSR)',
    'Tobins_Q': 'ln(Tobin’s Q)',
    'financialLeverage-1': 'ln(Financial Leverage-1)'
}, inplace=True)

# --- 3) Save transformed dataset to its own file ---
# (out_path differs from file_path, so the untransformed input is untouched)
out_path = '/content/drive/MyDrive/Data_to_analyze/Final_Dataset_transformed.csv'
df.to_csv(out_path, sep=';')

print(f"Saved transformed panel to {out_path}")


In [None]:
# STEP 4: Data Overview & Descriptive Statistics within firm on transformed dataset
#
# Same within-firm skewness / kurtosis summary as STEP 2, computed on the
# CSV written by STEP 3. Relies on `display` being available in the kernel.

import pandas as pd
from scipy.stats import skew, kurtosis

# 1) Load the transformed dataset and 2) rebuild the (company, year) index
file_path = '/content/drive/MyDrive/Data_to_analyze/Final_Dataset_transformed.csv'
df = (
    pd.read_csv(file_path, sep=';')
      .astype({'year': int})
      .set_index(['company', 'year'])
)

# 3) Keep numeric variables only; dummies are removed by name if present
exclude = ['is_imputed', 'Pre_IPO']
numeric_df = df.select_dtypes(include=['float64', 'int64'])
numeric_df = numeric_df.drop(columns=exclude, errors='ignore')

# 4) Per-company skewness and kurtosis (variables in rows after transposing).
#    pandas .kurt() reports Fisher (excess) kurtosis, hence the column title
#    used below.
per_firm = numeric_df.groupby('company')
within_skew = per_firm.apply(lambda grp: grp.skew()).T
within_kurt = per_firm.apply(lambda grp: grp.kurt()).T

# 5) Average across companies, one row per variable
within_dist_stats = (
    within_skew.mean(axis=1).round(2)
    .to_frame("Mean Skewness (Within Firm)")
    .assign(**{
        "Mean Excess Kurtosis (Within Firm)": within_kurt.mean(axis=1).round(2)
    })
)

# 6) Row labels are left as raw column names here; the STEP 1 label_map could
#    be applied to the index if print-friendly labels are wanted.

# 7) Sort by mean skewness, largest first, and display
within_dist_stats = within_dist_stats.sort_values(
    by="Mean Skewness (Within Firm)",
    ascending=False,
)
display(within_dist_stats)

# 8) Export to CSV
within_dist_stats.to_csv("within_firm_skew_kurtosis_transformed.csv")
