<a href="https://colab.research.google.com/github/henryonomakpo/The-Impact-of-ESG-Ratings-on-EV-Manufacturing-Industry/blob/main/ESG_Monthly_to_Daily.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install yfinance statsmodels pandas numpy scikit-learn xlsxwriter linearmodels

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.5-py3-none-any.whl.metadata (2.7 kB)
Collecting linearmodels
  Downloading linearmodels-6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Collecting mypy-extensions>=0.4 (from linearmodels)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Collecting pyhdfe>=0.1 (from linearmodels)
  Downloading pyhdfe-0.2.0-py3-none-any.whl.metadata (4.0 kB)
Collecting formulaic>=1.0.0 (from linearmodels)
  Downloading formulaic-1.1.1-py3-none-any.whl.metadata (6.9 kB)
Collecting setuptools-scm<9.0.0,>=8.0.0 (from setuptools-scm[toml]<9.0.0,>=8.0.0->linearmodels)
  Downloading setuptools_scm-8.3.1-py3-none-any.whl.metadata (7.0 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=1.0.0->linearmodels)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading xlsxwriter-3.2.5-py3-none-any.whl (172 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.3/172.3 kB

In [3]:
import pandas as pd
import numpy as np

# 1. Create the initial DataFrame with the annual data provided
data = {
    'Company': ['Wayfair', 'Carvana', 'Tripadvisor', 'Groupon', 'BigCommerce', 'Rakuten', 'Pinduoduo',
                'FNAC Darty', 'Global-E', 'VTEX', 'Coupang'],
    # Using the corrected ticker for FNAC Darty
    'Ticker': ['W', 'CVNA', 'TRIP', 'GRPN', 'BIGC', '4755.T', 'PDD', 'FNAC.PA', 'GLBE', 'VTEX', 'CPNG'],
    'Country': ['USA', 'USA', 'USA', 'USA', 'USA', 'Japan', 'China', 'France', 'Israel', 'Brazil', 'South Korea'],
    '2019': [25.5, 35.1, 24.8, 26.9, np.nan, 26.2, 32.4, np.nan, np.nan, np.nan, np.nan],
    '2020': [24.1, 34.2, 23.5, 26.1, np.nan, 25.0, 30.1, np.nan, np.nan, np.nan, np.nan],
    '2021': [22.9, 33.0, 22.1, 25.2, 21.5, 24.1, 28.0, np.nan, np.nan, np.nan, np.nan],
    '2022': [22.0, 31.8, 21.3, 24.5, 20.3, 23.3, 25.9, np.nan, np.nan, np.nan, np.nan],
    '2023': [21.5, 31.1, 20.8, 24.0, 19.4, 22.7, 24.5, np.nan, np.nan, np.nan, np.nan],
    '2024': [21.1, 30.2, 20.2, 23.6, 18.5, 22.1, 23.4, 12.4, 24.9, np.nan, 26.0],
    # 2025 will be forward-filled from 2024
    '2025': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
}
annual_df = pd.DataFrame(data)

# 2. Prepare for daily data generation
all_daily_data = []
date_range = pd.to_datetime(pd.date_range(start='2019-01-01', end='2025-12-31', freq='D'))

print("Generating daily ESG data for each company...")

# 3. Loop through each company to create daily data
for index, row in annual_df.iterrows():
    ticker = row['Ticker']
    company_name = row['Company']

    # Create a temporary dataframe with the daily date range
    daily_company_df = pd.DataFrame(index=date_range)
    daily_company_df['Ticker'] = ticker

    # --- FIX: Initialize the ESG_Score column with NaN ---
    # This ensures the column always exists, even if no scores are available for the initial years.
    daily_company_df['ESG_Score'] = np.nan

    # Place the annual ESG scores at the start of each year
    for year in range(2019, 2026):
        score = row.get(str(year))
        if pd.notna(score):
            # Set the score on January 1st of that year
            daily_company_df.loc[f'{year}-01-01', 'ESG_Score'] = score

    # Interpolate to create smooth daily values between years
    # and forward-fill to project the last known score into the future
    daily_company_df['ESG_Score'] = daily_company_df['ESG_Score'].interpolate(method='linear').ffill()

    # Backward-fill any initial NaNs (for companies with no early data)
    daily_company_df['ESG_Score'] = daily_company_df['ESG_Score'].bfill()

    all_daily_data.append(daily_company_df)

# 4. Combine all companies into a single DataFrame
final_daily_df = pd.concat(all_daily_data)
final_daily_df.reset_index(inplace=True)
final_daily_df.rename(columns={'index': 'Date'}, inplace=True)

# Reorder columns for clarity
final_daily_df = final_daily_df[['Date', 'Ticker', 'ESG_Score']]

# 5. Save the final dataset to an Excel file in Google Colab
output_filename = 'daily_ecommerce_esg_data.xlsx'
final_daily_df.to_excel(output_filename, index=False, engine='openpyxl')

print(f"\nSuccessfully generated daily ESG data for {len(annual_df)} companies.")
print(f"File '{output_filename}' has been saved to your Colab environment.")
print("\n--- First 5 rows of the generated daily dataset ---")
print(final_daily_df.head())
print("\n--- Last 5 rows of the generated daily dataset ---")
print(final_daily_df.tail())
print(f"\nTotal rows in dataset: {len(final_daily_df)}")

# You can download the file from the Colab file explorer on the left sidebar.

Generating daily ESG data for each company...

Successfully generated daily ESG data for 11 companies.
File 'daily_ecommerce_esg_data.xlsx' has been saved to your Colab environment.

--- First 5 rows of the generated daily dataset ---
        Date Ticker  ESG_Score
0 2019-01-01      W  25.500000
1 2019-01-02      W  25.496164
2 2019-01-03      W  25.492329
3 2019-01-04      W  25.488493
4 2019-01-05      W  25.484658

--- Last 5 rows of the generated daily dataset ---
            Date Ticker  ESG_Score
28122 2025-12-27   CPNG       26.0
28123 2025-12-28   CPNG       26.0
28124 2025-12-29   CPNG       26.0
28125 2025-12-30   CPNG       26.0
28126 2025-12-31   CPNG       26.0

Total rows in dataset: 28127


### Convert ESG dataset from Monthly to Daily

In [6]:
import pandas as pd
import numpy as np
import sys

# --- Configuration ---
# The path to your uploaded monthly ESG data file
INPUT_CSV_PATH = '/content/historic_esg_scores_ecommerce.csv'
# The name for the final daily dataset
OUTPUT_EXCEL_PATH = 'daily_ecommerce_esg_data_final.xlsx'

# --- Main Script ---
print(f"--- Step 1: Loading Monthly ESG Data from '{INPUT_CSV_PATH}' ---")

try:
    # --- FIX 1: Let pandas auto-detect the separator by removing sep='\t' ---
    # This is more robust than assuming it's a tab.
    # The 'engine='python'' argument helps with a wider variety of separators.
    monthly_df = pd.read_csv(INPUT_CSV_PATH, engine='python')

    # --- FIX 2: Clean up column names to remove leading/trailing spaces ---
    monthly_df.columns = monthly_df.columns.str.strip()

    # --- FIX 3: Check for the 'Date' column *after* loading ---
    if 'Date' not in monthly_df.columns:
        print("FATAL ERROR: 'Date' column not found after loading the CSV.")
        print("Please check the exact column name for dates in your file.")
        print(f"Columns found: {monthly_df.columns.tolist()}")
        sys.exit()

    monthly_df['Date'] = pd.to_datetime(monthly_df['Date'])
    print("Monthly data loaded successfully.")

except FileNotFoundError:
    print(f"FATAL ERROR: The file was not found at '{INPUT_CSV_PATH}'.")
    print("Please make sure you have uploaded the file to your Colab session.")
    sys.exit()
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    sys.exit()

# Rename columns to a consistent format
monthly_df.rename(columns={
    'ESG_Total_Score': 'ESG',
    'E-Score': 'E',
    'S-Score': 'S',
    'G-Score': 'G'
}, inplace=True)

# Ensure data types are correct for interpolation
score_cols = ['ESG', 'E', 'S', 'G']
for col in score_cols:
    monthly_df[col] = pd.to_numeric(monthly_df[col], errors='coerce')


print("\n--- Step 2: Upsampling Monthly Data to Daily Frequency ---")
all_daily_data = []
tickers = monthly_df['Ticker'].unique()

for ticker in tickers:
    # Isolate the data for one company
    company_df = monthly_df[monthly_df['Ticker'] == ticker].copy()

    # Set the 'Date' as the index, which is required for resampling
    company_df.set_index('Date', inplace=True)

    # Upsample to daily frequency and interpolate
    # 'interpolate(method='linear')' creates a smooth daily transition between monthly data points.
    daily_df = company_df.resample('D').interpolate(method='linear')

    # The ticker column gets lost during resampling, so we add it back
    daily_df['Ticker'] = ticker

    all_daily_data.append(daily_df)

# Combine the daily data for all companies into a single DataFrame
final_daily_df = pd.concat(all_daily_data)

# Forward-fill any remaining gaps, especially at the end of the series
final_daily_df.ffill(inplace=True)
# Backward-fill any remaining gaps at the beginning
final_daily_df.bfill(inplace=True)


# Reset the index to turn 'Date' back into a column
final_daily_df.reset_index(inplace=True)
final_daily_df.rename(columns={'index': 'Date'}, inplace=True)

# Reorder columns to the desired format
final_daily_df = final_daily_df[['Date', 'Ticker', 'ESG', 'E', 'S', 'G']]

print("\n--- Step 3: Saving Daily Data to Excel ---")
final_daily_df.to_excel(OUTPUT_EXCEL_PATH, index=False, engine='openpyxl')

print(f"\nSuccessfully generated daily ESG data for {len(tickers)} companies.")
print(f"File '{OUTPUT_EXCEL_PATH}' has been saved to your Colab environment.")
print("\n--- Data Sample (First 5 Rows) ---")
print(final_daily_df.head())
print("\n--- Data Sample (Last 5 Rows) ---")
print(final_daily_df.tail())
print(f"\nTotal rows in the new daily dataset: {len(final_daily_df):,}")

--- Step 1: Loading Monthly ESG Data from '/content/historic_esg_scores_ecommerce.csv' ---
Monthly data loaded successfully.

--- Step 2: Upsampling Monthly Data to Daily Frequency ---

--- Step 3: Saving Daily Data to Excel ---

Successfully generated daily ESG data for 12 companies.
File 'daily_ecommerce_esg_data_final.xlsx' has been saved to your Colab environment.

--- Data Sample (First 5 Rows) ---
        Date Ticker   ESG     E      S      G
0 2019-01-01   AMZN  30.6  5.85  13.74  11.01
1 2019-01-02   AMZN  30.6  5.85  13.74  11.01
2 2019-01-03   AMZN  30.6  5.85  13.74  11.01
3 2019-01-04   AMZN  30.6  5.85  13.74  11.01
4 2019-01-05   AMZN  30.6  5.85  13.74  11.01

--- Data Sample (Last 5 Rows) ---
            Date  Ticker    ESG     E     S     G
22217 2025-03-28  4755.T  20.93  2.83  9.48  8.62
22218 2025-03-29  4755.T  20.93  2.83  9.48  8.62
22219 2025-03-30  4755.T  20.93  2.83  9.48  8.62
22220 2025-03-31  4755.T  20.93  2.83  9.48  8.62
22221 2025-04-01  4755.T  20.93 

### Merge the two daily ESG datasets

In [20]:
import pandas as pd
import numpy as np
import sys
import re

# --- Configuration ---
# Input file paths
NEW_DAILY_FILE = '/content/daily_ecommerce_esg_data.csv'
ORIGINAL_MONTHLY_FILE = '/content/historic_esg_scores_ecommerce.csv'

# Output file name
MERGED_OUTPUT_FILE = 'merged_daily_esg_data_final.xlsx'

# --- Define the standard column names we will use everywhere ---
CANONICAL_COLUMNS = ['Date', 'ESG_Total_Score', 'E_Score', 'S_Score', 'G_Score', 'Ticker']

# --- NEW: A Highly Robust Parser for the Historic File ---
def load_and_parse_historic_file(filepath):
    """
    Attempts to load the historic ESG data file using multiple common parsing strategies.
    Returns a pandas DataFrame on success, or an empty DataFrame on failure.
    """
    # --- Strategy 1: Try reading as a standard comma-separated CSV ---
    try:
        df = pd.read_csv(filepath, sep=',')
        if df.shape[1] > 2:
            print("✅ Successfully parsed historic file using COMMA delimiter.")
            return df
    except Exception:
        print("⚠️ Comma-separated parsing failed. Trying next method...")

    # --- Strategy 2: Try reading with whitespace as the delimiter ---
    try:
        df = pd.read_csv(filepath, delim_whitespace=True)
        if df.shape[1] > 2:
            print("✅ Successfully parsed historic file using WHITESPACE delimiter.")
            return df
    except Exception:
        print("⚠️ Whitespace parsing failed. Trying next method...")

    # --- Strategy 3: Manual line-by-line parsing with Regex (most robust) ---
    try:
        with open(filepath, 'r') as f:
            lines = f.readlines()
        data_rows = []
        pattern = re.compile(r"(\S+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+(\S+)")
        for line in lines[1:]: # Skip header
            match = pattern.search(line)
            if match:
                data_rows.append(list(match.groups()))
        if data_rows:
            df = pd.DataFrame(data_rows, columns=['Date', 'ESG_Total_Score', 'E_Score', 'S_Score', 'G_Score', 'Ticker'])
            print("✅ Successfully parsed historic file using MANUAL line-by-line Regex method.")
            return df
    except Exception as e:
        print(f"⚠️ Manual parsing failed: {e}")

    print("❌ All parsing methods failed for the historic file.")
    return pd.DataFrame()

# --- Reusable Function to Upsample to Daily ---
def convert_to_daily(df):
    if df.empty:
        return pd.DataFrame()

    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df.dropna(subset=['Date'], inplace=True)

    score_cols = ['ESG_Total_Score', 'E_Score', 'S_Score', 'G_Score']
    for col in score_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    all_daily_data = []
    for ticker in df['Ticker'].unique():
        company_df = df[df['Ticker'] == ticker].copy().drop_duplicates(subset=['Date']).set_index('Date')
        daily_index = pd.date_range(start=df['Date'].min(), end='2025-12-31', freq='D')
        daily_df = company_df.reindex(daily_index)
        daily_df[score_cols] = daily_df[score_cols].interpolate(method='linear')
        daily_df['Ticker'] = ticker

        # --- THE FIX IS HERE ---
        # The chained call was causing the error because ffill(inplace=True) returns None.
        # Perform the operations on separate lines.
        daily_df.ffill(inplace=True)
        daily_df.bfill(inplace=True)
        # --- END OF FIX ---

        all_daily_data.append(daily_df)

    return pd.concat(all_daily_data).reset_index().rename(columns={'index': 'Date'})


# --- Main Script ---

# 1. Load the primary daily data file
print(f"--- Step 1: Processing primary file '{NEW_DAILY_FILE}' ---")
df1 = pd.DataFrame()
try:
    df1 = pd.read_csv(NEW_DAILY_FILE)
    df1['Date'] = pd.to_datetime(df1['Date'])
    print(f"Loaded {len(df1)} rows from primary file.")
except FileNotFoundError:
    print(f"Info: Primary file not found: {NEW_DAILY_FILE}. Proceeding without it.")
except Exception as e:
    print(f"Error processing {NEW_DAILY_FILE}: {e}")

# 2. Load and process the original messy monthly file using the robust parser
print(f"\n--- Step 2: Processing historic file '{ORIGINAL_MONTHLY_FILE}' ---")
df2_monthly = load_and_parse_historic_file(ORIGINAL_MONTHLY_FILE)

# Standardize column names for BOTH dataframes to ensure a perfect merge
rename_map = {
    'ESG': 'ESG_Total_Score', 'E': 'E_Score', 'S': 'S_Score', 'G': 'G_Score',
    'E-Score': 'E_Score', 'S-Score': 'S_Score', 'G-Score': 'G_Score'
}
if not df1.empty:
    df1.rename(columns=rename_map, inplace=True)
if not df2_monthly.empty:
    df2_monthly.rename(columns=rename_map, inplace=True)

# Convert the now-clean monthly data to daily
df2_daily = convert_to_daily(df2_monthly)
if not df2_daily.empty:
    print(f"Upsampled historic data to {len(df2_daily)} daily rows.")

# 3. Combine the two processed daily datasets
print("\n--- Step 3: Merging and Finalizing Data ---")
final_df = pd.concat([df1, df2_daily], ignore_index=True)
print(f"Total rows before deduplication: {len(final_df):,}")

# 4. De-duplicate and clean the final dataset
if not final_df.empty:
    # Coerce score columns to numeric, turning errors into NaNs
    for col in ['ESG_Total_Score', 'E_Score', 'S_Score', 'G_Score']:
        if col in final_df.columns:
            final_df[col] = pd.to_numeric(final_df[col], errors='coerce')

    final_df.sort_values(by=['Ticker', 'Date'], inplace=True)
    final_df.drop_duplicates(subset=['Date', 'Ticker'], keep='first', inplace=True)

    # Reorder DataFrame to the canonical format, keeping only existing columns
    final_df = final_df.reindex(columns=[col for col in CANONICAL_COLUMNS if col in final_df.columns])

    print(f"Total rows after deduplication: {len(final_df):,}")
    print(f"Final data shape: {final_df.shape} (rows, columns)")

    # 5. Save the final dataset
    final_df.to_excel(MERGED_OUTPUT_FILE, index=False, engine='openpyxl')

    print(f"\n✅ Successfully merged and saved the final dataset.")
    print(f"File '{MERGED_OUTPUT_FILE}' is now available in your Colab environment.")
    print("\n--- Final Merged Data Sample (Head) ---")
    print(final_df.head())
    print("\n--- Final Merged Data Sample (Tail) ---")
    print(final_df.tail())
else:
    print("\nNo data was processed. No output file was created.")

--- Step 1: Processing primary file '/content/daily_ecommerce_esg_data.csv' ---
Loaded 28127 rows from primary file.

--- Step 2: Processing historic file '/content/historic_esg_scores_ecommerce.csv' ---
✅ Successfully parsed historic file using COMMA delimiter.
Upsampled historic data to 49680 daily rows.

--- Step 3: Merging and Finalizing Data ---
Total rows before deduplication: 77,807
Total rows after deduplication: 72,693
Final data shape: (72693, 6) (rows, columns)

✅ Successfully merged and saved the final dataset.
File 'merged_daily_esg_data_final.xlsx' is now available in your Colab environment.

--- Final Merged Data Sample (Head) ---
            Date  ESG_Total_Score  E_Score  S_Score  G_Score  Ticker
73667 2014-09-01             45.0     38.0     46.0     51.0  4755.T
73668 2014-09-02             45.0     38.0     46.0     51.0  4755.T
73669 2014-09-03             45.0     38.0     46.0     51.0  4755.T
73670 2014-09-04             45.0     38.0     46.0     51.0  4755.T
7

#### Code to convert Renewable Energy firms ESG data from yearly to daily

In [1]:
import pandas as pd
import numpy as np
import io
import sys

# --- Configuration ---
OUTPUT_EXCEL_PATH = 'daily_renewable_energy_esg.xlsx'
START_YEAR = 2019
END_YEAR = 2025

# --- Data Provided as a String ---
data_string = """
Company Name	Ticker(s)	2019	2020	2021	2022	2023	2024	2025
NextEra Energy	NEE	25.5 | A	24.1 | AA	23.0 | AA	22.3 | AA	21.8 | AA	25	25
Iberdrola	IBE.MC	18.9 | AA	18.1 | AAA	17.5 | AAA	16.9 | AAA	16.5 | AAA	23	23
Enel	ENEL.MI	20.1 | AA	19.5 | AA	18.8 | AA	18.2 | AA	17.9 | AA	22.5	22.5
Ørsted	ORSTED.CO	17.5 | AAA	16.8 | AAA	15.9 | AAA	15.2 | AAA	14.9 | AAA	20	20
Brookfield Renewable	BEP	22.3 | A	21.0 | A	20.1 | AA	19.4 | AA	18.8 | AA	15.8	15.8
Vestas Wind Systems	VWS.CO	19.8 | A	19.1 | AA	18.5 | AA	17.9 | AA	17.4 | AA	16.3	16.3
Siemens Energy	ENR.DE	N/A	28.5 | BB	27.2 | A	26.1 | A	25.3 | A	14.2	14.2
JinkoSolar	JKS	28.9 | BB	27.5 | BB	25.4 | BBB	24.0 | BBB	23.1 | BBB	27.1	27.1
Canadian Solar	CSIQ	29.1 | BB	28.0 | BB	26.5 | BBB	25.0 | BBB	24.1 | BBB	28.5	28.5
First Solar	FSLR	18.8 | A	17.9 | AA	17.0 | AA	16.3 | AA	15.8 | AA	17.3	17.3
SunPower	SPWRQ	25.8 | BB	24.9 | BB	23.5 | BB	22.4 | BB	21.5 | BB	26.4	26.4
SolarEdge Technologies	SEDG	24.2 | A	23.1 | A	22.0 | A	20.9 | A	19.9 | A	15.3	15.3
EDP Renováveis	EDPR.LS	19.5 | AA	18.8 | AA	18.1 | AA	17.4 | AA	17.0 | AA	14.6	14.6
Engie	ENGI.PA	21.2 | A	20.5 | AA	19.7 | AA	19.0 | AA	18.5 | AA	30.2	30.2
Acciona Energía	ANE.MC	N/A	N/A	16.5 | AA	15.8 | AA	15.0 | AA	9.4	9.4
Ormat Technologies	ORA	24.8 | BB	23.9 | A	22.9 | A	21.8 | A	21.1 | A	17.8	17.8
Verbund AG	VER.VI	17.9 | AA	17.1 | AA	16.5 | AA	15.8 | AA	15.4 | AA	18.7	18.7
Scatec ASA	SCATC.OL	23.5 | BB	22.1 | A	21.0 | A	19.8 | A	19.2 | A	11.2	11.2
GE Vernova	GEV	N/A	N/A	N/A	N/A	22.5 | A	21.8	21.8
Constellation Energy	CEG	N/A	N/A	N/A	19.5 | AA	18.2 | AA	28.8	28.8
"""

# --- Helper Function to Extract Numeric Score ---
def extract_numeric_score(value):
    """
    Cleans cells like '25.5 | A' to extract the numeric part.
    Returns np.nan if the value is not processable.
    """
    if isinstance(value, str):
        # Replace 'N/A' with NaN
        if value.strip().upper() == 'N/A':
            return np.nan
        # Split by '|' and take the first part
        return pd.to_numeric(value.split('|')[0].strip(), errors='coerce')
    # If it's already a number (or NaN), just return it
    return value

# --- Reusable Function to Convert to Daily ---
def convert_to_daily(df):
    """
    Upsamples a DataFrame with annual ESG scores to a daily frequency
    using linear interpolation.
    """
    all_daily_data = []
    tickers = df['Ticker(s)'].unique()

    # Define a consistent date range for all tickers
    date_index = pd.date_range(start=f'{START_YEAR}-01-01', end=f'{END_YEAR}-12-31', freq='D')

    for ticker in tickers:
        company_df = df[df['Ticker(s)'] == ticker].copy()

        # Create a dictionary of dates and scores for this company
        score_points = {}
        for year_str in [str(y) for y in range(START_YEAR, END_YEAR + 1)]:
            if year_str in company_df.columns:
                score = company_df[year_str].iloc[0]
                if pd.notna(score):
                    score_points[pd.to_datetime(f'{year_str}-01-01')] = score

        # Create a Series from the score points
        if not score_points:
            continue # Skip if no data for this ticker

        score_series = pd.Series(score_points)

        # Reindex to the full daily range, interpolate, and fill
        daily_scores = score_series.reindex(date_index).interpolate(method='linear').ffill().bfill()

        # Create the final daily DataFrame for this ticker
        daily_df = pd.DataFrame(daily_scores, columns=['ESG_Score'])
        daily_df['Ticker'] = ticker

        all_daily_data.append(daily_df)

    if not all_daily_data:
        return pd.DataFrame()

    return pd.concat(all_daily_data).reset_index().rename(columns={'index': 'Date'})

# --- Main Script ---

# 1. Load data from the string into a DataFrame
print("--- Step 1: Loading and Cleaning Annual Data ---")
annual_df = pd.read_csv(io.StringIO(data_string), sep='\t')
print(f"Loaded {len(annual_df)} companies.")

# 2. Extract numeric scores from all year columns
year_cols = [str(y) for y in range(START_YEAR, END_YEAR + 1)]
for col in year_cols:
    if col in annual_df.columns:
        annual_df[col] = annual_df[col].apply(extract_numeric_score)
print("Cleaned and extracted numeric scores.")

# 3. Convert the cleaned annual data to a daily format
print("\n--- Step 2: Converting Annual Data to Daily Frequency ---")
final_daily_df = convert_to_daily(annual_df)

if final_daily_df.empty:
    print("FATAL ERROR: No daily data could be generated.")
    sys.exit()

# 4. Final formatting and saving
print("\n--- Step 3: Saving Daily Data to Excel ---")
# Ensure the final column order is correct
final_daily_df = final_daily_df[['Date', 'Ticker', 'ESG_Score']]

final_daily_df.to_excel(OUTPUT_EXCEL_PATH, index=False, engine='openpyxl')

print(f"\nSuccessfully generated daily ESG data.")
print(f"File '{OUTPUT_EXCEL_PATH}' has been saved to your Colab environment.")
print("\n--- Data Sample (First 5 Rows) ---")
print(final_daily_df.head())
print("\n--- Data Sample (Last 5 Rows) ---")
print(final_daily_df.tail())
print(f"\nTotal rows in the new daily dataset: {len(final_daily_df):,}")

--- Step 1: Loading and Cleaning Annual Data ---
Loaded 20 companies.
Cleaned and extracted numeric scores.

--- Step 2: Converting Annual Data to Daily Frequency ---

--- Step 3: Saving Daily Data to Excel ---

Successfully generated daily ESG data.
File 'daily_renewable_energy_esg.xlsx' has been saved to your Colab environment.

--- Data Sample (First 5 Rows) ---
        Date Ticker  ESG_Score
0 2019-01-01    NEE  25.500000
1 2019-01-02    NEE  25.496164
2 2019-01-03    NEE  25.492329
3 2019-01-04    NEE  25.488493
4 2019-01-05    NEE  25.484658

--- Data Sample (Last 5 Rows) ---
            Date Ticker  ESG_Score
51135 2025-12-27    CEG       28.8
51136 2025-12-28    CEG       28.8
51137 2025-12-29    CEG       28.8
51138 2025-12-30    CEG       28.8
51139 2025-12-31    CEG       28.8

Total rows in the new daily dataset: 51,140


#### Code to convert yearly Transport ESG to daily ESG dataset

In [3]:
import pandas as pd
import numpy as np
import sys

# --- Configuration ---
# Define the input file path in your Google Colab environment
INPUT_FILE_PATH = '/content/Transport_esg_dataset.csv'
OUTPUT_FILE_PATH = '/content/Transport_esg_daily_dataset.xlsx'

print(f"--- Starting Yearly to Daily Data Conversion ---")
print(f"Input file: {INPUT_FILE_PATH}")

# --- Step 1: Load and Clean the Dataset ---
try:
    # Use delim_whitespace=True to handle variable spaces between columns robustly
    df = pd.read_csv(INPUT_FILE_PATH)
    print(f"✅ Successfully loaded {len(df)} rows and {len(df.columns)} columns.")
except FileNotFoundError:
    print(f"❌ FATAL ERROR: The file was not found at '{INPUT_FILE_PATH}'.")
    print("Please make sure you have uploaded the file to your Colab session.")
    sys.exit()
except Exception as e:
    print(f"❌ FATAL ERROR: An unexpected error occurred while loading the file: {e}")
    sys.exit()

# Clean and standardize column names (e.g., 'Identifier (RIC)' -> 'identifier_ric')
df.columns = [
    col.strip().lower().replace(' ', '_').replace('(', '').replace(')', '')
    for col in df.columns
]

# --- Step 2: Prepare Data for Time-Series Conversion ---
print("\n--- Step 2: Preparing data for time-series conversion ---")
# Convert the 'date' column (which is currently just a year) to a proper datetime object
# We anchor each year's data to the last day of that year for proper interpolation
df['date'] = pd.to_datetime(df['date'], format='%Y') + pd.offsets.YearEnd(0)

# Identify which columns are numeric and which are not, for separate handling later
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
# We know 'date' is a datetime object, and the rest are text/objects
object_cols = df.select_dtypes(include=['object']).columns.tolist()

# Ensure the identifier column is treated as an object/string
if 'identifier_ric' in numeric_cols:
    numeric_cols.remove('identifier_ric')
    object_cols.append('identifier_ric')

print(f"Identified {len(numeric_cols)} numeric columns to interpolate.")
print(f"Identified {len(object_cols)} object columns to forward-fill.")

# --- Step 3: Upsample to Daily Frequency for Each Company ---
print("\n--- Step 3: Interpolating yearly data to a daily timeline ---")
all_daily_dfs = []
# Group by the company identifier to process each company's timeline separately
for identifier, group in df.groupby('identifier_ric'):
    # Prepare the yearly data by setting the date as the index
    yearly_data = group.set_index('date').sort_index()

    if yearly_data.empty:
        continue # Skip if there's no data for this group

    # Create a complete daily index from the first to the last date in the dataset
    start_date = yearly_data.index.min()
    end_date = yearly_data.index.max()
    daily_index = pd.date_range(start=start_date, end=end_date, freq='D')

    # Reindex the sparse yearly data onto the full daily index, creating NaNs for new days
    daily_df = yearly_data.reindex(daily_index)

    # --- Interpolate Numeric Columns and Forward-Fill Object Columns ---
    # This is the core logic: smooth transitions for numbers, constant values for text
    daily_df[numeric_cols] = daily_df[numeric_cols].interpolate(method='linear')
    daily_df[object_cols] = daily_df[object_cols].ffill()

    # Back-fill to handle any NaNs at the very beginning of the series
    daily_df.bfill(inplace=True)

    all_daily_dfs.append(daily_df)

# Combine the processed daily data for all companies into a single DataFrame
final_df = pd.concat(all_daily_dfs).reset_index()
final_df.rename(columns={'index': 'date'}, inplace=True)
print(f"Daily interpolation complete. Generated {len(final_df):,} total data points.")


# --- Step 4: Finalize and Save to Excel ---
print(f"\n--- Step 4: Saving final dataset to '{OUTPUT_FILE_PATH}' ---")
# Ensure the final DataFrame has the same columns in a logical order
final_df = final_df[df.columns]

# Save the final, clean DataFrame to an Excel file
final_df.to_excel(OUTPUT_FILE_PATH, index=False, engine='openpyxl')

print(f"\n✅ Success! The daily dataset has been created and saved.")
print("You can find the file in the file browser on the left side of your Colab window.")

# Display a sample of the final data for verification
print("\n--- Sample of the Final Daily Data ---")
# Show data for a specific ticker to see the interpolation in action
sample_ticker = final_df['identifier_ric'].iloc[0]
print(final_df[final_df['identifier_ric'] == sample_ticker].head())
print("...")
print(final_df[final_df['identifier_ric'] == sample_ticker].tail())

--- Starting Yearly to Daily Data Conversion ---
Input file: /content/Transport_esg_dataset.csv
✅ Successfully loaded 140 rows and 35 columns.

--- Step 2: Preparing data for time-series conversion ---
Identified 31 numeric columns to interpolate.
Identified 3 object columns to forward-fill.

--- Step 3: Interpolating yearly data to a daily timeline ---
Daily interpolation complete. Generated 46,032 total data points.

--- Step 4: Saving final dataset to '/content/Transport_esg_daily_dataset.xlsx' ---

✅ Success! The daily dataset has been created and saved.
You can find the file in the file browser on the left side of your Colab window.

--- Sample of the Final Daily Data ---
  identifier_ric                 company_name       date  esg_score  \
0            AAL  American Airlines Group Inc 2012-12-31  51.250000   
1            AAL  American Airlines Group Inc 2013-01-01  51.237945   
2            AAL  American Airlines Group Inc 2013-01-02  51.225890   
3            AAL  American Air

#### Pharma ESG from yearly to daily

In [4]:
import pandas as pd
import numpy as np
import io
import sys

# --- Configuration ---
OUTPUT_EXCEL_PATH = 'daily_health_esg_data.xlsx'
START_YEAR = 2019
END_YEAR = 2025

# --- Data Provided as a String ---
data_string = """
Company Name	Ticker(s)	2019	2020	2021	2022	2023	2024	2025
UnitedHealth Group	UNH	22.5 | A	21.8 | AA	20.9 | AA	19.8 | AA	18.9 | AA	16.9	16.9
Johnson & Johnson	JNJ	26.8 | A	25.5 | A	24.1 | AA	23.0 | AA	22.2 | AA	25.5	25.5
Pfizer	PFE	32.1 | BBB	31.0 | A	29.8 | A	28.5 | A	27.9 | A	18.6	18.6
Eli Lilly & Co.	LLY	28.5 | BBB	27.4 | A	26.2 | A	25.1 | A	24.3 | A	23.1	23.1
Thermo Fisher	TMO	20.5 | A	19.8 | AA	18.9 | AA	18.0 | AA	17.5 | AA	11.9	11.9
Abbott Laboratories	ABT	18.9 | AA	18.1 | AA	17.3 | AA	16.5 | AA	16.0 | AA	20.4	20.4
Roche Holding AG	RHHBY	24.2 | AA	23.5 | AA	22.6 | AA	21.5 | AA	20.9 | AA	24	24
Novartis	NVS	25.5 | A	24.8 | AA	23.9 | AA	22.8 | AA	22.2 | AA	15.6	15.6
Merck & Co.	MRK	29.1 | BBB	28.2 | A	27.0 | A	25.9 | A	25.1 | A	18.7	18.7
AbbVie	ABBV	33.5 | B	32.1 | BB	30.8 | BBB	29.7 | BBB	29.0 | BBB	24.2	24.2
Novo Nordisk	NVO	17.9 | AAA	17.1 | AAA	16.3 | AAA	15.5 | AAA	15.0 | AAA	23.2	23.2
Amgen	AMGN	30.2 | BB	29.4 | A	28.1 | A	27.0 | A	26.5 | A	22.5	22.5
Stryker	SYK	23.8 | A	23.0 | AA	22.1 | AA	21.2 | AA	20.5 | AA	21.2	21.2
Boston Scientific	BSX	22.1 | A	21.4 | AA	20.6 | AA	19.8 | AA	19.2 | AA	18.6	18.6
AstraZeneca	AZN	22.8 | AA	22.0 | AA	21.1 | AA	20.4 | AA	19.9 | AA	23.1	23.1
Intuitive Surgical	ISRG	18.5 | AA	17.8 | AA	16.9 | AA	16.1 | AA	15.5 | AA	17.9	17.9
"""

# --- Helper Function to Extract Numeric Score ---
def extract_numeric_score(value):
    """
    Cleans cells like '25.5 | A' to extract the numeric part.
    Returns np.nan if the value is not processable.
    """
    if isinstance(value, str):
        if value.strip().upper() == 'N/A':
            return np.nan
        # Split by '|' and take the first part, then convert to number
        return pd.to_numeric(value.split('|')[0].strip(), errors='coerce')
    # If it's already a number (or NaN), return it as is
    return value

# --- Reusable Function to Convert to Daily ---
def convert_to_daily(df):
    """
    Upsamples a DataFrame with annual ESG scores to a daily frequency
    using linear interpolation.
    """
    all_daily_data = []
    tickers = df['Ticker(s)'].unique()

    # Define a consistent date range for all companies
    date_index = pd.date_range(start=f'{START_YEAR}-01-01', end=f'{END_YEAR}-12-31', freq='D')

    for ticker in tickers:
        company_df = df[df['Ticker(s)'] == ticker].copy()

        # Create a dictionary of dates and scores for this company
        score_points = {}
        for year_str in [str(y) for y in range(START_YEAR, END_YEAR + 1)]:
            if year_str in company_df.columns:
                score = company_df[year_str].iloc[0]
                if pd.notna(score):
                    score_points[pd.to_datetime(f'{year_str}-01-01')] = score

        if not score_points:
            print(f"Warning: No valid scores found for ticker {ticker}. Skipping.")
            continue

        score_series = pd.Series(score_points)

        # Reindex to the full daily range, interpolate, and fill gaps
        daily_scores = score_series.reindex(date_index).interpolate(method='linear').ffill().bfill()

        # Create the final daily DataFrame for this ticker
        daily_df = pd.DataFrame(daily_scores, columns=['ESG_Score'])
        daily_df['Ticker'] = ticker

        all_daily_data.append(daily_df)

    if not all_daily_data:
        return pd.DataFrame()

    return pd.concat(all_daily_data).reset_index().rename(columns={'index': 'Date'})

# --- Main Script ---

# 1. Load data from the string into a DataFrame
print("--- Step 1: Loading and Cleaning Annual Data ---")
annual_df = pd.read_csv(io.StringIO(data_string), sep='\t')
# A quick fix for Novartis ticker which may be read incorrectly
annual_df.replace({'NOT.DE': 'NVS'}, inplace=True)
print(f"Loaded {len(annual_df)} companies.")

# 2. Extract numeric scores from all year columns
year_cols = [str(y) for y in range(START_YEAR, END_YEAR + 1)]
for col in year_cols:
    if col in annual_df.columns:
        annual_df[col] = annual_df[col].apply(extract_numeric_score)
print("Cleaned and extracted numeric scores.")

# 3. Convert the cleaned annual data to a daily format
print("\n--- Step 2: Converting Annual Data to Daily Frequency ---")
final_daily_df = convert_to_daily(annual_df)

if final_daily_df.empty:
    print("FATAL ERROR: No daily data could be generated.")
    sys.exit()

# 4. Final formatting and saving
print("\n--- Step 3: Saving Daily Data to Excel ---")
# Ensure the final column order is correct
final_daily_df = final_daily_df[['Date', 'Ticker', 'ESG_Score']]

final_daily_df.to_excel(OUTPUT_EXCEL_PATH, index=False, engine='openpyxl')

print(f"\nSuccessfully generated daily ESG data.")
print(f"File '{OUTPUT_EXCEL_PATH}' has been saved to your Colab environment.")
print("\n--- Data Sample (First 5 Rows) ---")
print(final_daily_df.head())
print("\n--- Data Sample (Last 5 Rows) ---")
print(final_daily_df.tail())
print(f"\nTotal rows in the new daily dataset: {len(final_daily_df):,}")

--- Step 1: Loading and Cleaning Annual Data ---
Loaded 16 companies.
Cleaned and extracted numeric scores.

--- Step 2: Converting Annual Data to Daily Frequency ---

--- Step 3: Saving Daily Data to Excel ---

Successfully generated daily ESG data.
File 'daily_health_esg_data.xlsx' has been saved to your Colab environment.

--- Data Sample (First 5 Rows) ---
        Date Ticker  ESG_Score
0 2019-01-01    UNH  22.500000
1 2019-01-02    UNH  22.498082
2 2019-01-03    UNH  22.496164
3 2019-01-04    UNH  22.494247
4 2019-01-05    UNH  22.492329

--- Data Sample (Last 5 Rows) ---
            Date Ticker  ESG_Score
40907 2025-12-27   ISRG       17.9
40908 2025-12-28   ISRG       17.9
40909 2025-12-29   ISRG       17.9
40910 2025-12-30   ISRG       17.9
40911 2025-12-31   ISRG       17.9

Total rows in the new daily dataset: 40,912


### code to convert monthly to yearly Pharma ESG dataset

In [10]:
import pandas as pd
import numpy as np
import sys

# --- Configuration ---
INPUT_CSV_PATH = '/content/historic_esg_scores_pharma_healthcare.csv'
OUTPUT_EXCEL_PATH = 'daily_pharma_healthcare_esg.xlsx'

# --- Main Script ---

print(f"--- Step 1: Loading Monthly ESG Data from '{INPUT_CSV_PATH}' ---")

try:
    # --- FINAL, CORRECT FIX: The separator is a comma. ---
    monthly_df = pd.read_csv(INPUT_CSV_PATH, sep=',')

    # Clean column names to remove potential leading/trailing spaces
    monthly_df.columns = monthly_df.columns.str.strip()

    if 'Date' not in monthly_df.columns:
        print("FATAL ERROR: 'Date' column not found. Please check the CSV header.")
        sys.exit()

    monthly_df['Date'] = pd.to_datetime(monthly_df['Date'])
    print(f"Successfully loaded and parsed {len(monthly_df)} monthly records.")

except FileNotFoundError:
    print(f"FATAL ERROR: The file was not found at '{INPUT_CSV_PATH}'.")
    print("Please make sure you have uploaded the file to your Colab session.")
    sys.exit()
except Exception as e:
    print(f"An error occurred while reading the CSV: {e}")
    sys.exit()

# --- Step 2: Clean and Prepare Data ---

# Rename columns to a consistent, code-friendly format
monthly_df.rename(columns={
    'Total-Score': 'ESG',
    'E-Score': 'E',
    'S-Score': 'S',
    'G-Score': 'G'
}, inplace=True)

# Ensure all score columns are numeric for accurate interpolation
score_cols = ['ESG', 'E', 'S', 'G']
for col in score_cols:
    if col in monthly_df.columns:
        monthly_df[col] = pd.to_numeric(monthly_df[col], errors='coerce')

print("Data cleaned and prepared for daily conversion.")

# --- Step 3: Upsample Monthly Data to Daily Frequency ---

print("\n--- Step 3: Upsampling data to a daily frequency ---")

all_daily_data = []
tickers = monthly_df['Ticker'].unique()

# Define a consistent date range for all companies
start_date = monthly_df['Date'].min()
end_date = pd.to_datetime('2025-12-31') # Or your desired end date
daily_index = pd.date_range(start=start_date, end=end_date, freq='D')

for ticker in tickers:
    # Isolate the data for one company and set Date as the index
    company_df_monthly = monthly_df[monthly_df['Ticker'] == ticker].copy()
    company_df_monthly = company_df_monthly.drop_duplicates(subset=['Date']).set_index('Date')

    # Reindex to the full daily range, creating NaNs for all missing days
    daily_df = company_df_monthly.reindex(daily_index)

    # Interpolate all numeric columns to fill the daily gaps
    numeric_cols = daily_df.select_dtypes(include=np.number).columns
    daily_df[numeric_cols] = daily_df[numeric_cols].interpolate(method='linear')

    # Forward-fill and backward-fill to handle edges and non-numeric data
    daily_df.ffill(inplace=True)
    daily_df.bfill(inplace=True)

    # The ticker column was dropped during reindexing, add it back
    daily_df['Ticker'] = ticker

    all_daily_data.append(daily_df)

# Combine all tickers into one large DataFrame
final_daily_df = pd.concat(all_daily_data).reset_index().rename(columns={'index': 'Date'})

print("Daily interpolation complete.")

# --- Step 4: Save the Daily Dataset to Excel ---

# Reorder columns for clarity and consistency
final_columns_ordered = ['Date', 'Ticker', 'ESG', 'E', 'S', 'G']
final_daily_df = final_daily_df[[col for col in final_columns_ordered if col in final_daily_df.columns]]

# Save to Excel
final_daily_df.to_excel(OUTPUT_EXCEL_PATH, index=False, engine='openpyxl')

print(f"\nSuccessfully generated the daily dataset.")
print(f"File '{OUTPUT_EXCEL_PATH}' has been saved to your Colab environment.")
print("\n--- Daily Data Sample (First 5 Rows) ---")
print(final_daily_df.head())
print("\n--- Daily Data Sample (Last 5 Rows) ---")
print(final_daily_df.tail())
print(f"\nTotal rows in the new daily dataset: {len(final_daily_df):,}")

--- Step 1: Loading Monthly ESG Data from '/content/historic_esg_scores_pharma_healthcare.csv' ---
Successfully loaded and parsed 768 monthly records.
Data cleaned and prepared for daily conversion.

--- Step 3: Upsampling data to a daily frequency ---


  monthly_df['Date'] = pd.to_datetime(monthly_df['Date'])


Daily interpolation complete.

Successfully generated the daily dataset.
File 'daily_pharma_healthcare_esg.xlsx' has been saved to your Colab environment.

--- Daily Data Sample (First 5 Rows) ---
        Date Ticker        ESG          E     S     G
0 2014-09-01   ABBV  64.000000  66.000000  60.0  69.0
1 2014-09-02   ABBV  63.933333  66.033333  59.9  69.0
2 2014-09-03   ABBV  63.866667  66.066667  59.8  69.0
3 2014-09-04   ABBV  63.800000  66.100000  59.7  69.0
4 2014-09-05   ABBV  63.733333  66.133333  59.6  69.0

--- Daily Data Sample (Last 5 Rows) ---
            Date Ticker    ESG     E     S     G
24835 2025-12-27    PFE  18.29  2.74  9.18  6.37
24836 2025-12-28    PFE  18.29  2.74  9.18  6.37
24837 2025-12-29    PFE  18.29  2.74  9.18  6.37
24838 2025-12-30    PFE  18.29  2.74  9.18  6.37
24839 2025-12-31    PFE  18.29  2.74  9.18  6.37

Total rows in the new daily dataset: 24,840


#### Merge Health Pharma to daily dataset

In [11]:
import pandas as pd
import numpy as np
import sys

# --- Configuration ---
# Input file paths (ensure these are uploaded to your Colab session)
PRIMARY_FILE_PATH = '/content/daily_pharma_healthcare_esg.csv' # This file has priority
SUPPLEMENTARY_FILE_PATH = '/content/daily_health_esg_data.csv'

# Output file name for the final merged dataset
MERGED_OUTPUT_FILE = 'merged_daily_health_esg_final.xlsx'

# --- Main Script ---

# 1. Load the primary dataset (more detailed)
print(f"--- Step 1: Loading Primary Data from '{PRIMARY_FILE_PATH}' ---")
df1 = pd.DataFrame() # Initialize empty DataFrame for robustness
try:
    df1 = pd.read_csv(PRIMARY_FILE_PATH)
    # Standardize column names to an internal format
    df1.rename(columns={'ESG_Total_Score': 'ESG', 'E-Score': 'E', 'S-Score': 'S', 'G-Score': 'G'}, inplace=True, errors='ignore')
    df1['Date'] = pd.to_datetime(df1['Date'])
    print(f"Loaded {len(df1):,} rows from primary file.")
except FileNotFoundError:
    print(f"Warning: Primary file '{PRIMARY_FILE_PATH}' not found. Proceeding with supplementary file only.")
except Exception as e:
    print(f"Error processing primary file: {e}")


# 2. Load the supplementary dataset
print(f"\n--- Step 2: Loading Supplementary Data from '{SUPPLEMENTARY_FILE_PATH}' ---")
df2 = pd.DataFrame() # Initialize empty DataFrame
try:
    df2 = pd.read_csv(SUPPLEMENTARY_FILE_PATH)
    # Standardize column names to match the primary file's internal format
    df2.rename(columns={'ESG_Score': 'ESG'}, inplace=True, errors='ignore')
    df2['Date'] = pd.to_datetime(df2['Date'])
    print(f"Loaded {len(df2):,} rows from supplementary file.")
except FileNotFoundError:
    print(f"Warning: Supplementary file '{SUPPLEMENTARY_FILE_PATH}' not found.")
except Exception as e:
    print(f"Error processing supplementary file: {e}")

# Check if we have any data to work with
if df1.empty and df2.empty:
    print("\nFATAL ERROR: Both input files are missing or failed to load. Cannot proceed.")
    sys.exit()

# 3. Combine the two daily datasets
print("\n--- Step 3: Merging and De-duplicating Datasets ---")
# Concatenate with the primary file first, so its data is kept during de-duplication
merged_df = pd.concat([df1, df2], ignore_index=True)
print(f"Total rows before filtering and deduplication: {len(merged_df):,}")

# 4. Filter the merged data by the specified date range
start_date = '2019-01-01'
end_date = '2025-06-30'
merged_df = merged_df[(merged_df['Date'] >= start_date) & (merged_df['Date'] <= end_date)]
print(f"Rows after filtering for dates between {start_date} and {end_date}: {len(merged_df):,}")

# 5. De-duplicate, keeping the entry from the first (primary) file
merged_df.sort_values(by=['Ticker', 'Date'], inplace=True)
merged_df.drop_duplicates(subset=['Date', 'Ticker'], keep='first', inplace=True)
print(f"Rows after removing duplicates: {len(merged_df):,}")

# 6. Final formatting and saving
# Ensure the final column order is correct and only includes desired columns
final_columns = ['Date', 'Ticker', 'ESG', 'E', 'S', 'G']
# Filter to only include columns that actually exist in the final DataFrame
final_df = merged_df[[col for col in final_columns if col in merged_df.columns]].copy()

print(f"\n--- Step 4: Saving Final Merged Dataset ---")
final_df.to_excel(MERGED_OUTPUT_FILE, index=False, engine='openpyxl')

print(f"\nSuccessfully merged and saved the final dataset.")
print(f"File '{MERGED_OUTPUT_FILE}' is now available in your Colab environment.")
print("\n--- Final Data Sample (First 5 Rows) ---")
print(final_df.head())
print("\n--- Final Data Sample (Last 5 Rows) ---")
print(final_df.tail())
print(f"\nUnique tickers in final dataset ({final_df['Ticker'].nunique()}): {sorted(final_df['Ticker'].unique())}")

--- Step 1: Loading Primary Data from '/content/daily_pharma_healthcare_esg.csv' ---


  df1['Date'] = pd.to_datetime(df1['Date'])


Loaded 24,840 rows from primary file.

--- Step 2: Loading Supplementary Data from '/content/daily_health_esg_data.csv' ---


  df2['Date'] = pd.to_datetime(df2['Date'])


Loaded 40,912 rows from supplementary file.

--- Step 3: Merging and De-duplicating Datasets ---
Total rows before filtering and deduplication: 65,752
Rows after filtering for dates between 2019-01-01 and 2025-06-30: 52,206
Rows after removing duplicates: 40,341

--- Step 4: Saving Final Merged Dataset ---

Successfully merged and saved the final dataset.
File 'merged_daily_health_esg_final.xlsx' is now available in your Colab environment.

--- Final Data Sample (First 5 Rows) ---
           Date Ticker    ESG     E      S      G
1583 2019-01-01   ABBV  30.35  0.96  16.54  12.84
1584 2019-01-02   ABBV  30.35  0.96  16.54  12.84
1585 2019-01-03   ABBV  30.35  0.96  16.54  12.84
1586 2019-01-04   ABBV  30.35  0.96  16.54  12.84
1587 2019-01-05   ABBV  30.35  0.96  16.54  12.84

--- Final Data Sample (Last 5 Rows) ---
            Date Ticker   ESG   E   S   G
27208 2025-06-26    UNH  16.9 NaN NaN NaN
27209 2025-06-27    UNH  16.9 NaN NaN NaN
27210 2025-06-28    UNH  16.9 NaN NaN NaN
27211