In [42]:
import os
import pandas as pd

In [58]:
# Folder scanned for source Excel workbooks, and folder that receives the
# cleaned CSV exports for the Pharmaceuticals run.
# NOTE(review): these look like Colab Google Drive mount paths — confirm the
# drive is mounted before this cell runs.
input_folder = "/content/drive/MyDrive/Kennedys/Pharmaceuticals"
output_folder = "/content/drive/MyDrive/Cleaned Data/Pharmaceuticals"
# Create the output folder (and any missing parents); no error if it exists.
os.makedirs(output_folder, exist_ok=True)

In [59]:
# Worksheets pulled from each workbook, in processing order; a sheet that is
# absent from a given workbook is simply skipped by process_excel_file.
sheet_names = [
    "Balance Sheet",
    "Cash Flow",
    "Profit & Loss",
    "Quarters",
]

In [60]:
def clean_df(df):
    """Clean one raw worksheet and return it transposed.

    Steps, in order:
      1. Column names: cast to str, collapse whitespace runs to one space,
         strip every character that is neither a word character nor
         whitespace, trim.
      2. Index labels: cast to str, collapse whitespace, trim.
      3. First column (row labels): cast to str, remove '+' and '%',
         collapse whitespace, trim. Skipped when the frame has a single
         column. Missing labels become the literal string "nan" because of
         the str cast.
      4. Remove '%' from every string value in the frame.
      5. Coerce every column except the first to numeric; values that cannot
         be parsed become NaN.
      6. Transpose, then drop every column of the transposed frame that has
         non-missing values in fewer than half of its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet as returned by ``pd.read_excel``. It is NOT modified; all
        work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Transposed frame. Its index (named "index") holds the cleaned
        original column names, so the first row carries the row labels from
        the original first column. Its columns are the stringified original
        index labels.
    """
    # Work on a copy so the caller's DataFrame is never mutated.
    df = df.copy()

    # Clean column names
    df.columns = (
        df.columns
        .astype(str)
        .str.replace(r'\s+', ' ', regex=True)
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.strip()
    )

    # Clean index
    df.index = (
        df.index.astype(str)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # Clean first column (typically labels)
    if df.shape[1] > 1:
        labels = (
            df.iloc[:, 0].astype(str)
            .str.replace(r'\+', '', regex=True)
            .str.replace(r'%', '', regex=True)
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
        )
        # isetitem replaces the column by position instead of writing into
        # the existing array, so it also works when the first column is not
        # object dtype (an in-place ``iloc`` set of strings into a numeric
        # column is an incompatible-dtype assignment).
        df.isetitem(0, labels)

    # Remove all % signs from entire DataFrame
    df = df.replace(r'%', '', regex=True)

    # Convert columns (except first) to numeric
    for col in df.columns[1:]:
        if not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Transpose, then expose the row labels as a regular column so that
    # set_index below can restore them after the sparse-column filter.
    df = df.transpose().reset_index()

    # Drop columns with more than 50% missing values. ``thresh`` is the
    # minimum number of non-missing values; (n + 1) // 2 is the integer
    # equivalent of ``count >= 0.5 * n``.
    min_non_missing = (len(df) + 1) // 2
    df = df.dropna(thresh=min_non_missing, axis=1)

    # Restore the original index (first column becomes new index)
    df = df.set_index(df.columns[0])

    return df


In [61]:
# Process and save one Excel file
def process_excel_file(file_path):
    """Clean the configured sheets of one workbook and save each as a CSV.

    For every name in the module-level ``sheet_names`` that exists in the
    workbook, the sheet is read, passed through ``clean_df``, written to
    ``<output_folder>/<company>_<sheet with spaces as underscores>.csv`` and
    its first 10 rows are printed. Sheets missing from the workbook are
    skipped without a message.

    Parameters
    ----------
    file_path : str
        Path to the Excel workbook. The file name without its extension is
        used as the company name in messages and output file names.

    Returns
    -------
    None
        Any exception raised while reading, cleaning or writing is caught
        and reported with ``print``; processing of this workbook then stops.
    """
    company_name = os.path.splitext(os.path.basename(file_path))[0]
    print(f"\n📂 Processing: {company_name}")

    try:
        # Open the workbook in a context manager so its file handle is
        # closed even when a sheet fails part-way through.
        with pd.ExcelFile(file_path) as xls:
            for sheet in sheet_names:
                if sheet not in xls.sheet_names:
                    continue

                df = pd.read_excel(xls, sheet_name=sheet)
                df = clean_df(df)

                # Save to CSV
                sheet_clean_name = sheet.replace(' ', '_')
                output_file = os.path.join(output_folder, f"{company_name}_{sheet_clean_name}.csv")
                df.to_csv(output_file, index=True)
                print(f"✅ Saved: {output_file}")

                # Display first 10 rows
                print(f"\n📄 Cleaned Sheet: {company_name} - {sheet}")
                print(df.head(10).to_string(index=True))  # Pretty print

    except Exception as e:
        # Deliberate best-effort: one bad workbook must not stop the batch.
        print(f"❌ Error processing {file_path}: {e}")


In [62]:
# Run the cleaning pipeline on every .xlsx/.xls entry of the input folder.
# Names starting with "~$" are skipped (presumably Excel's temporary lock
# files — confirm).
for filename in os.listdir(input_folder):
    if filename.startswith('~$') or not filename.endswith(('.xlsx', '.xls')):
        continue
    file_path = os.path.join(input_folder, filename)
    process_excel_file(file_path)


📂 Processing: SUNPHARMA
✅ Saved: /content/drive/MyDrive/Cleaned Data/Pharmaceuticals/SUNPHARMA_Balance_Sheet.csv

📄 Cleaned Sheet: SUNPHARMA - Balance Sheet
                        0         1           2                  3                  4             5     6            7             8             9
index                                                                                                                                             
Unnamed 0  Equity Capital  Reserves  Borrowings  Other Liabilities  Total Liabilities  Fixed Assets  CWIP  Investments  Other Assets  Total Assets
Mar 2013              104     14886         260               5128              20377          5647   563         2412         11756         20377
Mar 2014              207     18318        2561               8009              29095          6817   842         2786         18650         29095
Mar 2015              207     25431        8996              14089              48723         12682  2039  

In [None]:
import os
import pandas as pd

# Input/output folders for the Chemicals run. This rebinds the
# input_folder / output_folder names that an earlier cell of this notebook
# set for Pharmaceuticals, so cells run afterwards see the Chemicals paths.
# NOTE(review): these look like Colab Google Drive mount paths — confirm the
# drive is mounted before this cell runs.
input_folder = "/content/drive/MyDrive/Kennedys/Chemicals"
output_folder = "/content/drive/MyDrive/Cleaned Data/Chemicals"
# Create the output folder (and any missing parents); no error if it exists.
os.makedirs(output_folder, exist_ok=True)

# Name of the only worksheet this cell extracts from each workbook.
expected_quarter_sheet = "Quarters"  # Sheet to process


def clean_df(df):
    """Clean one raw worksheet and return it transposed.

    NOTE: an earlier cell of this notebook also defines ``clean_df``;
    running this cell rebinds the name. This version does not drop sparse
    columns and keeps the transposed frame's index as-is.

    Steps, in order:
      1. Column names: cast to str, collapse whitespace runs to one space,
         strip every character that is neither a word character nor
         whitespace, trim.
      2. Index labels: cast to str, collapse whitespace, trim.
      3. First column (row labels such as 'Sales +'): cast to str, remove
         '+' and '%', collapse whitespace, trim. Skipped when the frame has
         a single column. Missing labels become the literal string "nan".
      4. Remove '%' from every string value in the frame.
      5. Coerce every column except the first to numeric; values that cannot
         be parsed become NaN.
      6. Transpose.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet as returned by ``pd.read_excel``. It is NOT modified; all
        work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Transposed frame: the index holds the cleaned original column names
        (so the first row carries the row labels) and the columns are the
        stringified original index labels.
    """
    # Work on a copy so the caller's DataFrame is never mutated.
    df = df.copy()

    # Clean column names
    df.columns = (
        df.columns.astype(str)
        .str.replace(r'\s+', ' ', regex=True)
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.strip()
    )

    # Clean index
    df.index = (
        df.index.astype(str)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # Clean first column if labels like 'Sales +'
    if df.shape[1] > 1:
        labels = (
            df.iloc[:, 0].astype(str)
            .str.replace(r'\+', '', regex=True)
            .str.replace(r'%', '', regex=True)
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
        )
        # isetitem replaces the column by position instead of writing into
        # the existing array, so it also works when the first column is not
        # object dtype (an in-place ``iloc`` set of strings into a numeric
        # column is an incompatible-dtype assignment).
        df.isetitem(0, labels)

    # Remove % from all values
    df = df.replace(r'%', '', regex=True)

    # Convert all other columns to numeric
    for col in df.columns[1:]:
        if not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Transpose the DataFrame
    return df.transpose()


# Process only "Quarters" sheet if it exists
def process_quarters_sheet(file_path):
    """Clean the quarterly sheet of one workbook and save it as a CSV.

    If the workbook contains the sheet named by the module-level
    ``expected_quarter_sheet``, it is read, passed through ``clean_df``,
    written to ``<output_folder>/<company>_Quarters.csv`` and its first 10
    rows are printed. Otherwise a "not found" message is printed.

    Parameters
    ----------
    file_path : str
        Path to the Excel workbook. The file name without its extension is
        used as the company name in messages and in the output file name.

    Returns
    -------
    None
        Any exception raised while reading, cleaning or writing is caught
        and reported with ``print``.
    """
    company_name = os.path.splitext(os.path.basename(file_path))[0]
    print(f"\n📂 Checking: {company_name}")

    try:
        # Open the workbook in a context manager so its file handle is
        # closed even when reading or cleaning fails.
        with pd.ExcelFile(file_path) as xls:
            if expected_quarter_sheet in xls.sheet_names:
                print(f"✅ Found 'Quarters' in: {company_name}")
                df = pd.read_excel(xls, sheet_name=expected_quarter_sheet)
                df = clean_df(df)

                output_file = os.path.join(output_folder, f"{company_name}_Quarters.csv")
                df.to_csv(output_file, index=True)
                print(f"💾 Saved: {output_file}")
                print(df.head(10).to_string(index=True))  # Display cleaned data
            else:
                print(f"❌ 'Quarters' sheet not found in {company_name}")

    except Exception as e:
        # Deliberate best-effort: one bad workbook must not stop the batch.
        print(f"❌ Error processing {file_path}: {e}")


# Extract the "Quarters" sheet from every .xlsx/.xls entry of the input
# folder. Names starting with "~$" are skipped (presumably Excel's temporary
# lock files — confirm).
for filename in os.listdir(input_folder):
    if filename.startswith('~$') or not filename.endswith(('.xlsx', '.xls')):
        continue
    file_path = os.path.join(input_folder, filename)
    process_quarters_sheet(file_path)



📂 Checking: UPL
✅ Found 'Quarters' in: UPL
💾 Saved: /content/drive/MyDrive/Cleaned Data/Chemicals/UPL_Quarters.csv
                 0         1                 2     3             4         5             6                  7      8           9         10       11
Unnamed 0    Sales  Expenses  Operating Profit   OPM  Other Income  Interest  Depreciation  Profit before tax    Tax  Net Profit  EPS in Rs  Raw PDF
Mar 2022   15861.0   12481.0            3380.0  21.0          62.0     800.0         642.0             2000.0   13.0      1735.0      16.04      NaN
Jun 2022   10821.0    8675.0            2146.0  20.0          25.0     519.0         588.0             1064.0    6.0      1005.0      10.39      NaN
Sep 2022   12507.0   10090.0            2417.0  19.0          35.0     644.0         608.0             1200.0   19.0       969.0       9.64      NaN
Dec 2022   13679.0   10795.0            2884.0  21.0         129.0     894.0         624.0             1495.0    9.0      1360.0      12.87