In [22]:
import os
import pandas as pd

In [63]:
# Folder the raw per-company Excel workbooks are read from
# (path looks like a Colab Google Drive mount — TODO confirm for other environments).
input_folder = "/content/drive/MyDrive/Company_Raw_Data/Textiles"
# Folder the cleaned CSV files are written to.
output_folder = "/content/drive/MyDrive/Cleaned Data/Textiles"
# Create the output folder (and any missing parents); no error if it already exists.
os.makedirs(output_folder, exist_ok=True)

In [64]:
# Names of the worksheets to extract from each workbook.
# Only sheets listed here are processed; a listed sheet that is absent
# from a given workbook is skipped.
sheet_names = ["Balance Sheet", "Cash Flow", "Profit & Loss", "Quarters"]

In [65]:
def clean_df(df):
    """Return a cleaned copy of one financial-statement sheet.

    The input frame is never modified; all work happens on a copy.

    Cleaning steps, in order:

    1. Column names: cast to ``str``, collapse whitespace runs to one space,
       remove every character that is neither a word character nor
       whitespace, trim (e.g. ``'Unnamed: 0'`` becomes ``'Unnamed 0'``).
    2. Index labels: cast to ``str``, collapse whitespace runs, trim.
    3. First column (only when the frame has more than one column): remove
       ``+`` and ``%``, collapse whitespace runs, trim. Missing labels stay
       missing instead of becoming the literal text ``'nan'``/``'None'``.
    4. Remove ``%`` from every string cell in the frame.
    5. Convert every column except the first to numeric; values that cannot
       be parsed become ``NaN``.
    6. Rename the column ``'Unnamed 0'`` (if present) to ``'Financial_Metric'``.
    7. Drop columns in which fewer than 50% of the rows are non-null. The
       first (label) column is exempt whenever step 3 applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet contents.

    Returns
    -------
    pandas.DataFrame
        New, cleaned frame. Its index holds the cleaned string labels of the
        input index; an unnamed index is returned with the name ``'index'``.
    """
    # Work on a copy so the caller's frame keeps its original columns,
    # index and label values.
    out = df.copy()

    # 1. Normalise column names.
    out.columns = (
        out.columns
        .astype(str)
        .str.replace(r'\s+', ' ', regex=True)
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.strip()
    )

    # 2. Normalise index labels.
    out.index = (
        out.index.astype(str)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
    # An unnamed index is named 'index': this is what the earlier
    # reset_index()/set_index() round trip produced, kept for backward
    # compatibility with code that consumes the returned frame.
    if out.index.name is None:
        out.index.name = 'index'

    # 3. Clean the label column (first column) when there are value columns.
    has_label_col = out.shape[1] > 1
    if has_label_col:
        raw_labels = out.iloc[:, 0]
        tidy_labels = (
            raw_labels.astype(str)
            .str.replace(r'\+', '', regex=True)
            .str.replace(r'%', '', regex=True)
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
        )
        # astype(str) turns NaN/None into text; restore those cells to
        # missing. Assigning the whole column (rather than .iloc[:, 0] = ...)
        # lets the dtype change without an in-place upcast.
        out[out.columns[0]] = tidy_labels.where(raw_labels.notna())

    # 4. Remove all % signs from string cells in the entire frame.
    out = out.replace(r'%', '', regex=True)

    # 5. Convert value columns (everything except the first) to numeric.
    for col in out.columns[1:]:
        if not pd.api.types.is_numeric_dtype(out[col]):
            out[col] = pd.to_numeric(out[col], errors='coerce')

    # 6. Give the label column its final name.
    out = out.rename(columns={'Unnamed 0': 'Financial_Metric'})

    # 7. Drop columns with more than 50% missing values.
    keep = out.count() >= 0.5 * len(out)
    if has_label_col:
        # Labels may now be NaN, so protect the label column explicitly;
        # before, it could never be dropped because it held only strings.
        keep.iloc[0] = True
    out = out.loc[:, keep.to_numpy()]

    return out


In [66]:
# Process and save one Excel file
def process_excel_file(file_path):
    """Clean the configured sheets of one Excel workbook and save each as CSV.

    The company name is taken from the file name without its extension.
    For every name in the module-level ``sheet_names`` that exists in the
    workbook, the sheet is read, passed through ``clean_df`` and written to
    ``<output_folder>/<company>_<sheet with spaces as underscores>.csv``
    without the index. The first 10 cleaned rows are printed as a preview.

    Parameters
    ----------
    file_path : str
        Path to the ``.xlsx``/``.xls`` workbook.

    Returns
    -------
    None
        Results are written to disk and progress is printed. Any exception
        raised while processing the workbook is caught and reported with a
        printed message, so one bad file does not stop a batch run; sheets
        of that workbook after the failing one are not processed.
    """
    company_name = os.path.splitext(os.path.basename(file_path))[0]
    print(f"\n📂 Processing: {company_name}")

    try:
        # The context manager closes the workbook handle even when a sheet
        # fails, so file handles are not leaked across a batch of files.
        with pd.ExcelFile(file_path) as xls:
            for sheet in sheet_names:
                if sheet not in xls.sheet_names:
                    continue

                df = clean_df(pd.read_excel(xls, sheet_name=sheet))

                # Save to CSV
                sheet_clean_name = sheet.replace(' ', '_')
                output_file = os.path.join(output_folder, f"{company_name}_{sheet_clean_name}.csv")
                df.to_csv(output_file, index=False)
                print(f"✅ Saved: {output_file}")

                # Display first 10 rows
                print(f"\n📄 Cleaned Sheet: {company_name} - {sheet}")
                print(df.head(10).to_string(index=False))  # Pretty print

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")


In [67]:
# Loop through the Excel files in the input folder.
# - sorted(): os.listdir returns entries in arbitrary order; sorting makes the
#   processing order (and the printed log) reproducible between runs.
# - lower(): match the extension case-insensitively so files such as
#   'REPORT.XLSX' are not silently skipped.
# - Names starting with '~$' are skipped (presumably Office lock/temp files).
for filename in sorted(os.listdir(input_folder)):
    if filename.lower().endswith(('.xlsx', '.xls')) and not filename.startswith('~$'):
        file_path = os.path.join(input_folder, filename)
        process_excel_file(file_path)


📂 Processing: ARVIND
✅ Saved: /content/drive/MyDrive/Cleaned Data/Textiles/ARVIND_Balance_Sheet.csv

📄 Cleaned Sheet: ARVIND - Balance Sheet
 Financial_Metric  Mar 2013  Mar 2014  Mar 2015  Mar 2016  Mar 2017  Mar 2018  Mar 2019  Mar 2020  Mar 2021  Mar 2022  Mar 2023  Mar 2024  Sep 2024
   Equity Capital       258       258       258       258       258       259       259       259       259       261       262       262       262
         Reserves      1996      2325      2466      2388      3309      3524      2492      2450      2460      2690      3084      3281      3249
       Borrowings      2461      2992      3397      3819      2926      3323      2700      2640      2121      1865      1517      1448      1560
Other Liabilities      1519      1788      1855      1659      2032      3082      1872      1790      1872      2881      2035      2254      2392
Total Liabilities      6233      7363      7976      8125      8525     10188      7322      7138      6713      7697 