In [6]:
import os
import pandas as pd

In [12]:
# Directory that is scanned for the raw Excel workbooks.
input_folder = "/content/drive/MyDrive/Company_Raw_Data/Energy"
# Directory that receives the cleaned CSV files.
output_folder = "/content/drive/MyDrive/Cleaned Data/Energy"
# Create the output directory if it is missing; exist_ok=True makes
# re-running this cell a no-op instead of an error.
os.makedirs(output_folder, exist_ok=True)

In [13]:
# Worksheets pulled from each workbook (matched by exact sheet name)
sheet_names = [
    "Balance Sheet",
    "Cash Flow",
    "Profit & Loss",
    "Quarters",
]

In [14]:
def clean_df(df):
    """Return a cleaned copy of one raw financial-statement sheet.

    The input frame is left untouched; all work happens on a copy.

    Steps, in order:
      1. Column names: cast to str, collapse whitespace runs to one space,
         remove every character that is neither a word character nor
         whitespace, trim (so ``Unnamed: 0`` becomes ``Unnamed 0``).
      2. Index labels: cast to str, collapse whitespace runs, trim.
      3. First column (only when the frame has more than one column): cast
         to str, remove ``+`` and ``%``, collapse whitespace runs, trim.
      4. Remove ``%`` from every string cell of the frame.
      5. Coerce every column after the first to numeric; values that cannot
         be parsed become NaN.
      6. Rename the column ``Unnamed 0`` to ``Financial_Metric``.
      7. Drop every column in which fewer than half of the rows are non-null.

    Parameters
    ----------
    df : pandas.DataFrame
        Sheet to clean.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame. Its index holds the stringified row labels of ``df``
        and carries the name ``reset_index`` gave that column.
    """
    # Work on a copy so the caller's frame (column names, index, first
    # column) is never modified as a side effect.
    cleaned = df.copy()

    # Clean column names
    cleaned.columns = (
        cleaned.columns
        .astype(str)
        .str.replace(r'\s+', ' ', regex=True)
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.strip()
    )

    # Clean index labels
    cleaned.index = (
        cleaned.index.astype(str)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # Clean first column (typically labels)
    if cleaned.shape[1] > 1:
        cleaned.iloc[:, 0] = (
            cleaned.iloc[:, 0].astype(str)
            .str.replace(r'\+', '', regex=True)
            .str.replace(r'%', '', regex=True)
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
        )

    # Remove all % signs from the entire DataFrame
    cleaned = cleaned.replace(r'%', '', regex=True)

    # Convert columns (except first) to numeric; unparseable values -> NaN
    for col in cleaned.columns[1:]:
        if not pd.api.types.is_numeric_dtype(cleaned[col]):
            cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')

    # Move the row labels into the first column for the steps below; the
    # same column is moved back into the index at the end.
    cleaned = cleaned.reset_index()

    # Rename the label column ('Unnamed: 0' before the header clean-up)
    # to 'Financial_Metric'
    cleaned = cleaned.rename(columns={'Unnamed 0': 'Financial_Metric'})

    # Drop columns with more than 50% missing values. dropna documents
    # `thresh` as an int; ceil(len / 2) keeps exactly the columns that
    # `count >= 0.5 * len` would keep, because counts are whole numbers.
    min_non_null = (len(cleaned) + 1) // 2
    cleaned = cleaned.dropna(thresh=min_non_null, axis=1)

    # Restore the index: the former index column is still first because it
    # never contains nulls and so is never dropped above.
    cleaned = cleaned.set_index(cleaned.columns[0])

    return cleaned


In [15]:
# Process and save one Excel file
def process_excel_file(file_path):
    """Clean the configured sheets of one workbook and write each to CSV.

    For every name in the module-level ``sheet_names`` that is present in
    the workbook, the sheet is read, passed through ``clean_df``, written
    into ``output_folder`` as ``<file stem>_<sheet name, spaces as _>.csv``
    (index not written), and its first 10 rows are printed. Sheets that the
    workbook does not contain are skipped without a message.

    Any ``Exception`` raised while opening, reading, cleaning or saving is
    caught and reported with a printed message; it is not re-raised.

    Parameters
    ----------
    file_path : str
        Path to the Excel workbook.

    Returns
    -------
    None
    """
    company_name = os.path.splitext(os.path.basename(file_path))[0]
    print(f"\n📂 Processing: {company_name}")

    try:
        # The context manager closes the workbook handle even when reading
        # or cleaning one of the sheets raises.
        with pd.ExcelFile(file_path) as xls:
            for sheet in sheet_names:
                if sheet not in xls.sheet_names:
                    continue

                df = pd.read_excel(xls, sheet_name=sheet)
                df = clean_df(df)

                # Save to CSV
                sheet_clean_name = sheet.replace(' ', '_')
                output_file = os.path.join(output_folder, f"{company_name}_{sheet_clean_name}.csv")
                df.to_csv(output_file, index=False)
                print(f"✅ Saved: {output_file}")

                # Display first 10 rows
                print(f"\n📄 Cleaned Sheet: {company_name} - {sheet}")
                print(df.head(10).to_string(index=False))  # Pretty print

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")


In [16]:
# Loop through the Excel files in input_folder.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the processing order (and the printed log) the same on every run.
for filename in sorted(os.listdir(input_folder)):
    # Compare the extension case-insensitively so names such as "X.XLSX" are
    # not skipped; names starting with "~$" are ignored (the prefix Office
    # uses for the temporary lock files it creates next to an open workbook).
    if filename.lower().endswith(('.xlsx', '.xls')) and not filename.startswith('~$'):
        file_path = os.path.join(input_folder, filename)
        process_excel_file(file_path)


📂 Processing: RELIANCE
✅ Saved: /content/drive/MyDrive/Cleaned Data/Energy/RELIANCE_Balance_Sheet.csv

📄 Cleaned Sheet: RELIANCE - Balance Sheet
 Financial_Metric  Mar 2014  Mar 2015  Mar 2016  Mar 2017  Mar 2018  Mar 2019  Mar 2020  Mar 2021  Mar 2022  Mar 2023  Mar 2024  Mar 2025
   Equity Capital      2940      2943      2948      2959      5922      5926      6339      6445      6765      6766      6766     13532
         Reserves    195747    215556    228608    260750    287584    381186    442827    693727    772720    709106    786715    829668
       Borrowings    138761    168251    194714    217475    239843    307714    355133    278962    319158    451664    458991    369575
Other Liabilities     91395    117736    172727    225618    277924    302804    358716    340931    399979    438346    502576    737346
Total Liabilities    428843    504486    598997    706802    811273    997630   1163015   1320065   1498622   1605882   1755048   1950121
     Fixed Assets    14141