In [1]:
import os
import pandas as pd

def process_balance_sheet(file_path):
    return process_yearly_file(file_path, "balance_sheet")

def process_cash_flows(file_path):
    return process_yearly_file(file_path, "cash_flows")

def process_profit_and_loss(file_path):
    return process_yearly_file(file_path, "profit_and_loss")

def process_quarterly_results(file_path):
    return process_quarterly_file(file_path)

def process_yearly_file(file_path, data_type):
    df = pd.read_csv(file_path, index_col=0)
    max_col_index = len(df.columns) - 1
    processed_data = {}
    
    for row_name in df.index:
        for col_index, col_name in enumerate(reversed(df.columns)):
            year_label = f"{row_name}_{data_type}_year_{col_index+1}"
            processed_data[year_label] = df.at[row_name, col_name] if col_index <= max_col_index else None
    
    return processed_data

def process_quarterly_file(file_path):
    df = pd.read_csv(file_path, index_col=0)
    max_col_index = len(df.columns) - 1
    processed_data = {}
    
    for row_name in df.index:
        for col_index, col_name in enumerate(reversed(df.columns)):
            quarter_label = f"{row_name}_quarter_{col_index+1}"
            processed_data[quarter_label] = df.at[row_name, col_name] if col_index <= max_col_index else None
    
    return processed_data

def main(input_folder, output_csv):
    all_data = []
    
    for company_folder in os.listdir(input_folder):
        company_path = os.path.join(input_folder, company_folder)
        if not os.path.isdir(company_path):
            continue
        
        company_data = {'CompanyName': company_folder}
        
        for file_name in os.listdir(company_path):
            file_path = os.path.join(company_path, file_name)
            if "balance_sheet" in file_name:
                company_data.update(process_balance_sheet(file_path))
            elif "cash_flows" in file_name:
                company_data.update(process_cash_flows(file_path))
            elif "profit_&_loss" in file_name:
                company_data.update(process_profit_and_loss(file_path))
            elif "quarterly_results" in file_name:
                company_data.update(process_quarterly_results(file_path))
        
        all_data.append(company_data)
    
    result_df = pd.DataFrame(all_data)
    result_df.to_csv(output_csv, index=False)

if __name__ == "__main__":
    input_folder = "/Users/hemantg/Downloads/cos. try"
    output_csv = "output-cos-try.csv"
    main(input_folder, output_csv)


In [3]:
import pandas as pd
import re

def clean_columns(input_file_path, output_file_path):
    # Read the CSV file
    df = pd.read_csv(input_file_path, sep=',')

    # Columns to clean
    columns_to_clean = ['GMP', 'IPO Price', 'IPO Size']

    # Function to clean data
    def strip_to_digit(value):
        if isinstance(value, str):
            match = re.search(r'\d+', value.replace(',', ''))
            return match.group(0) if match else value
        return value

    # Apply cleaning
    for col in columns_to_clean:
        df[col] = df[col].apply(strip_to_digit)

    # Save the cleaned DataFrame to a new file
    df.to_csv(output_file_path, index=False, sep=',')
    print(f"Cleaned file saved at: {output_file_path}")

# Define file paths
input_file_path = '/Users/hemantg/Desktop/Finance Sem5/mainboard_ipo_final.csv'
output_file_path = '/Users/hemantg/Desktop/Finance Sem5/mainboard_ipo_final_cleaned.csv'

# Clean the CSV file
clean_columns(input_file_path, output_file_path)


Cleaned file saved at: /Users/hemantg/Desktop/Finance Sem5/mainboard_ipo_final_cleaned.csv


In [4]:
import os
import csv

def list_subfolders_to_csv(folder_path, output_csv):
    # Get all subfolders in the specified folder
    subfolders = [f.name for f in os.scandir(folder_path) if f.is_dir()]
    
    # Write the subfolders to a CSV file
    with open(output_csv, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Subfolder Name"])
        for subfolder in subfolders:
            writer.writerow([subfolder])

    print(f"Subfolder names have been saved to {output_csv}")

# Example usage
folder_path = "/Users/hemantg/Documents/final-companies-dataset"  # Replace with your folder path
output_csv = "subfolders.csv"
list_subfolders_to_csv(folder_path, output_csv)


Subfolder names have been saved to subfolders.csv


In [5]:
import os

def rename_csv_files(parent_folder):
    for subfolder in os.listdir(parent_folder):
        subfolder_path = os.path.join(parent_folder, subfolder)
        
        if os.path.isdir(subfolder_path):
            csv_files = os.listdir(subfolder_path)
            
            for csv_file in csv_files:
                file_path = os.path.join(subfolder_path, csv_file)
                
                if csv_file.startswith("balance_sheet"):
                    new_file_name = f"{subfolder}_balance_sheet.csv"
                elif csv_file.startswith("cash_flows"):
                    new_file_name = f"{subfolder}_cash_flows.csv"
                elif csv_file.startswith("profit_&_loss"):
                    new_file_name = f"{subfolder}_profit_&_loss.csv"
                elif csv_file.startswith("quarterly_results") or csv_file.endswith("quarterly_results.csv"):
                    new_file_name = f"{subfolder}_quarterly_results.csv"
                else:
                    continue
                
                new_file_path = os.path.join(subfolder_path, new_file_name)
                os.rename(file_path, new_file_path)
                print(f"Renamed: {file_path} -> {new_file_path}")

# Example usage
parent_folder = "/Users/hemantg/Documents/final-companies-dataset"
rename_csv_files(parent_folder)


Renamed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_quarterly_results.csv -> /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_quarterly_results.csv
Renamed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_quarterly_results.csv -> /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_quarterly_results.csv
Renamed: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/quarterly_results.csv -> /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_quarterly_results.csv
Renamed: /Users/hemantg/Documents/final-companies-dataset/CYIENTDLM/CYIENTDLM_quarterly_results.csv -> /Users/hemantg/Documents/final-companies-dataset/CYIENTDLM/CYIENTDLM_quarterly_results.csv
Renamed: /Users/hemantg/Documents/final-companies-dataset/IREDA/IREDA_quarterly_results.csv -> /Users/hemantg/Documents/final-companies-dataset/IREDA/IREDA_quarterly_results.csv
Renamed: /Users/hemantg/Documents/final-companies-dataset

In [6]:
import os
import pandas as pd
import re

# Define the path to the main folder
main_folder = "/Users/hemantg/Documents/final-companies-dataset"

# Function to clean row names
def clean_row_name(name):
    return re.sub(r'[^a-zA-Z]', '', name)

# Loop through each subfolder
for subfolder in os.listdir(main_folder):
    subfolder_path = os.path.join(main_folder, subfolder)
    
    if os.path.isdir(subfolder_path):
        # Process each CSV file in the subfolder
        for csv_file in os.listdir(subfolder_path):
            if csv_file.endswith('.csv'):
                file_path = os.path.join(subfolder_path, csv_file)
                
                # Read the CSV file
                df = pd.read_csv(file_path)
                
                # Clean row names
                df.index = [clean_row_name(str(index)) for index in df.index]
                
                # Save back to the same file
                df.to_csv(file_path, index=True)
                print(f"Processed: {file_path}")

print("Processing complete.")


Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_quarterly_results.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_balance_sheet.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_cash_flows.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_quarterly_results.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_balance_sheet.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_cash_flows.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_balance_sheet.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_quarterly_results.csv
Processed: /Users/

In [7]:
import os
import pandas as pd
import re

# Define the path to the main folder
main_folder = "/Users/hemantg/Documents/final-companies-dataset"

# Function to clean row names
def clean_row_name(name):
    return re.sub(r'[^a-zA-Z\s]', '', name).strip()

# Loop through each subfolder
for subfolder in os.listdir(main_folder):
    subfolder_path = os.path.join(main_folder, subfolder)
    
    if os.path.isdir(subfolder_path):
        # Process each CSV file in the subfolder
        for csv_file in os.listdir(subfolder_path):
            if csv_file.endswith('.csv'):
                file_path = os.path.join(subfolder_path, csv_file)
                
                # Read the CSV file
                df = pd.read_csv(file_path, index_col=0)
                
                # Clean row names
                df.index = [clean_row_name(str(index)) for index in df.index]
                
                # Save back to the same file
                df.to_csv(file_path, index=True)
                print(f"Processed: {file_path}")

print("Processing complete.")


Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_quarterly_results.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_balance_sheet.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_cash_flows.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_quarterly_results.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_balance_sheet.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_cash_flows.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_balance_sheet.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_quarterly_results.csv
Processed: /Users/

In [9]:
import os
import pandas as pd

# Define the root directory where the main folder is located
root_dir = '/Users/hemantg/Documents/final-companies-dataset'

# Loop through each subfolder in the root directory
for subfolder in os.listdir(root_dir):
    subfolder_path = os.path.join(root_dir, subfolder)
    
    # Check if it's a directory
    if os.path.isdir(subfolder_path):
        # Define file names based on the subfolder name
        target_files = [f"{subfolder}_profit_&_loss.csv", f"{subfolder}_quarterly_results.csv"]
        
        for file_name in target_files:
            file_path = os.path.join(subfolder_path, file_name)
            
            # Check if file exists
            if os.path.isfile(file_path):
                # Read the CSV file into a DataFrame
                df = pd.read_csv(file_path, index_col=0)
                
                # Check if 'Sales' exists as a row name
                if 'Sales' in df.index:
                    # Rename row 'Sales' to 'Revenue'
                    df.rename(index={'Sales': 'Revenue'}, inplace=True)
                    # Save the modified DataFrame back to CSV
                    df.to_csv(file_path)

print("Completed renaming process.")


Completed renaming process.


In [14]:
import os
import pandas as pd

# Define the root folder containing subfolders
root_folder = "/Users/hemantg/Documents/final-companies-dataset"

# Iterate through each subfolder
for subfolder in os.listdir(root_folder):
    subfolder_path = os.path.join(root_folder, subfolder)
    
    # Check if it's a directory
    if os.path.isdir(subfolder_path):
        # Construct the expected CSV file name
        csv_file = os.path.join(subfolder_path, f"{subfolder}_profit_&_loss.csv")
        
        # Check if the file exists
        if os.path.isfile(csv_file):
            # Read the CSV file
            df = pd.read_csv(csv_file)
            
            # Check if 'TTM' column exists and remove it
            if 'TTM' in df.columns:
                df.drop('TTM', axis=1, inplace=True)
                
                # Ensure the top-left cell is blank
                df.iloc[0, 0] = ''
                
                # Save the updated DataFrame back to CSV
                df.to_csv(csv_file, index=False)
                print(f"Updated: {csv_file}")
            else:
                print(f"'TTM' column not found in {csv_file}")
        else:
            print(f"File not found: {csv_file}")


'TTM' column not found in /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_profit_&_loss.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_profit_&_loss.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_profit_&_loss.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/CYIENTDLM/CYIENTDLM_profit_&_loss.csv
'TTM' column not found in /Users/hemantg/Documents/final-companies-dataset/IREDA/IREDA_profit_&_loss.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/UNIPARTS/UNIPARTS_profit_&_loss.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/JNKINDIA/JNKINDIA_profit_&_loss.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/AGSTRA/AGSTRA_profit_&_loss.csv
'TTM' column not found in /Users/hemantg/Documents/final-companies-dataset/OLAELEC/OLAELEC_profit_&_loss.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/KALYANKJIL/KALYANKJIL_profit_&_loss.csv
Updated: 

In [13]:
import os
import pandas as pd

# Define rows to keep for each file type
rows_to_keep = {
    "balance_sheet": ["Equity Capital", "Reserves", "Borrowings", "Total Liabilities", "Fixed Assets", "Total Assets"],
    "cash_flows": ["Cash from Operating Activity", "Cash from Investing Activity", "Cash from Financing Activity", "Net Cash Flow"],
    "profit_&_loss": ["Revenue", "Expenses", "Net Profit", "EPS in Rs"],
    "quarterly_results": ["Revenue", "Expenses", "Net Profit", "EPS in Rs"]
}

# Path to the main folder
main_folder = "/Users/hemantg/Documents/final-companies-dataset"

# Loop through each subfolder
for subfolder in os.listdir(main_folder):
    subfolder_path = os.path.join(main_folder, subfolder)
    
    if os.path.isdir(subfolder_path):
        # Process each CSV file in the subfolder
        for file_name in os.listdir(subfolder_path):
            file_path = os.path.join(subfolder_path, file_name)
            
            # Determine file type from the name
            for file_type, keep_rows in rows_to_keep.items():
                if file_type in file_name.lower():
                    # Load the CSV file
                    df = pd.read_csv(file_path)
                    
                    # Filter rows and reset the top-left corner to blank
                    df_filtered = df[df.iloc[:, 0].isin(keep_rows)]
                    
                    # Ensure correct header handling
                    df_filtered.columns.values[0] = ""
                    
                    # Save the filtered CSV back
                    df_filtered.to_csv(file_path, index=False)
                    print(f"Processed {file_path}")


Processed /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_quarterly_results.csv
Processed /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_profit_&_loss.csv
Processed /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_balance_sheet.csv
Processed /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_cash_flows.csv
Processed /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_quarterly_results.csv
Processed /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_profit_&_loss.csv
Processed /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_balance_sheet.csv
Processed /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_cash_flows.csv
Processed /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_balance_sheet.csv
Processed /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_quarterly_results.csv
Processed /Users/hemantg/Doc

In [15]:
import os
import pandas as pd

def remove_ttm_column(base_folder):
    # Walk through all subfolders in the base folder
    for subfolder in os.listdir(base_folder):
        subfolder_path = os.path.join(base_folder, subfolder)
        
        # Check if it's a directory
        if os.path.isdir(subfolder_path):
            # Look for CSV files with 'profit_&_loss' in the name
            for filename in os.listdir(subfolder_path):
                if '_profit_&_loss.csv' in filename:
                    file_path = os.path.join(subfolder_path, filename)
                    
                    try:
                        # Read the CSV file
                        df = pd.read_csv(file_path)
                        
                        # Remove the 'TTM' column if it exists
                        if 'TTM' in df.columns:
                            df = df.drop(columns=['TTM'])
                            
                            # Save the modified CSV, overwriting the original
                            df.to_csv(file_path, index=False)
                            print(f"Processed: {file_path}")
                        else:
                            print(f"No 'TTM' column found in: {file_path}")
                    
                    except Exception as e:
                        print(f"Error processing {file_path}: {e}")

# Example usage
# Replace 'your_base_folder_path' with the actual path to your base folder
base_folder_path = '/Users/hemantg/Documents/final-companies-dataset'
remove_ttm_column(base_folder_path)

No 'TTM' column found in: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/CYIENTDLM/CYIENTDLM_profit_&_loss.csv
No 'TTM' column found in: /Users/hemantg/Documents/final-companies-dataset/IREDA/IREDA_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/UNIPARTS/UNIPARTS_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/JNKINDIA/JNKINDIA_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/AGSTRA/AGSTRA_profit_&_loss.csv
No 'TTM' column found in: /Users/hemantg/Documents/final-companies-dataset/OLAELEC/OLAELEC_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/KALYANKJIL/KALYANKJIL_profit_&_loss

In [18]:
import os
import pandas as pd

def process_balance_sheet(df):
    """Process balance sheet CSV"""
    keep_rows = ['Equity Capital', 'Reserves', 'Borrowings', 'Total Liabilities', 'Fixed Assets', 'Total Assets']
    return df[df['Unnamed: 0'].isin(keep_rows)].set_index('Unnamed: 0')

def process_cash_flows(df):
    """Process cash flows CSV"""
    keep_rows = ['Cash from Operating Activity', 'Cash from Investing Activity', 
                 'Cash from Financing Activity', 'Net Cash Flow']
    return df[df['Unnamed: 0'].isin(keep_rows)].set_index('Unnamed: 0')

def process_profit_loss(df):
    """Process profit and loss CSV"""
    keep_rows = ['Revenue', 'Expenses', 'Net Profit', 'EPS in Rs']
    return df[df['Unnamed: 0'].isin(keep_rows)].set_index('Unnamed: 0')

def process_quarterly_results(df):
    """Process quarterly results CSV"""
    keep_rows = ['Revenue', 'Expenses', 'Net Profit', 'EPS in Rs']
    return df[df['Unnamed: 0'].isin(keep_rows)].set_index('Unnamed: 0')

def process_company_folder(folder_path):
    """Process CSV files in a company folder"""
    # Define file mapping
    file_processors = {
        'balance_sheet': process_balance_sheet,
        'cash_flows': process_cash_flows,
        'profit_&_loss': process_profit_loss,
        'quarterly_results': process_quarterly_results
    }

    # Iterate through CSV files
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            # Determine which processor to use
            processor = None
            for key, proc in file_processors.items():
                if key in filename.lower():
                    processor = proc
                    break
            
            if not processor:
                print(f"No processor found for {filename}")
                continue

            # Full file path
            file_path = os.path.join(folder_path, filename)
            
            # Read CSV
            try:
                df = pd.read_csv(file_path, index_col=False)
                
                # Rename columns if needed (remove multi-index if present)
                if isinstance(df.columns, pd.MultiIndex):
                    df.columns = df.columns.get_level_values(-1)
                
                # Process the dataframe
                processed_df = processor(df)
                
                # Save processed CSV
                processed_df.to_csv(file_path)
                print(f"Processed {filename}")
            
            except Exception as e:
                print(f"Error processing {filename}: {e}")

def main():
    """Main function to process all company folders"""
    # MODIFY THIS LINE TO SPECIFY YOUR FOLDER PATH
    base_dir = r"/Users/hemantg/Documents/final-companies-dataset"
    
    # Iterate through subfolders
    for folder_name in os.listdir(base_dir):
        folder_path = os.path.join(base_dir, folder_name)
        
        # Check if it's a directory
        if os.path.isdir(folder_path):
            print(f"\nProcessing folder: {folder_name}")
            process_company_folder(folder_path)

if __name__ == "__main__":
    main()


Processing folder: CEIGALL
Processed CEIGALL_quarterly_results.csv
Processed CEIGALL_profit_&_loss.csv
Processed CEIGALL_balance_sheet.csv
Processed CEIGALL_cash_flows.csv

Processing folder: MAPMYINDIA
Processed MAPMYINDIA_quarterly_results.csv
Processed MAPMYINDIA_profit_&_loss.csv
Processed MAPMYINDIA_balance_sheet.csv
Processed MAPMYINDIA_cash_flows.csv

Processing folder: SHRIRAMPPS
Processed SHRIRAMPPS_balance_sheet.csv
Processed SHRIRAMPPS_quarterly_results.csv
Processed SHRIRAMPPS_cash_flows.csv
Processed SHRIRAMPPS_profit_&_loss.csv

Processing folder: CYIENTDLM
Processed CYIENTDLM_cash_flows.csv
Processed CYIENTDLM_profit_&_loss.csv
Processed CYIENTDLM_quarterly_results.csv
Processed CYIENTDLM_balance_sheet.csv

Processing folder: IREDA
Processed IREDA_cash_flows.csv
Processed IREDA_balance_sheet.csv
Processed IREDA_quarterly_results.csv
No processor found for IREDA_ratios.csv
Processed IREDA_profit_&_loss.csv

Processing folder: UNIPARTS
Processed UNIPARTS_quarterly_results

In [17]:
import os
import pandas as pd

# Define the root directory where the main folder is located
root_dir = '/Users/hemantg/Documents/final-companies-dataset'

# Loop through each subfolder in the root directory
for subfolder in os.listdir(root_dir):
    subfolder_path = os.path.join(root_dir, subfolder)
    
    # Check if it's a directory
    if os.path.isdir(subfolder_path):
        # Define file names based on the subfolder name
        target_files = [f"{subfolder}_profit_&_loss.csv", f"{subfolder}_quarterly_results.csv"]
        
        for file_name in target_files:
            file_path = os.path.join(subfolder_path, file_name)
            
            # Check if file exists
            if os.path.isfile(file_path):
                # Read the CSV file into a DataFrame
                df = pd.read_csv(file_path, index_col=0)
                
                # Check if 'Sales' exists as a row name
                if 'Sales' in df.index:
                    # Rename row 'Sales' to 'Revenue'
                    df.rename(index={'Sales': 'Revenue'}, inplace=True)
                    # Save the modified DataFrame back to CSV
                    df.to_csv(file_path)

print("Completed renaming process.")


Completed renaming process.


In [19]:
import os
import pandas as pd

def remove_empty_columns(input_path):
    """
    Remove empty columns from the right side of CSV files in a given directory.
    
    Parameters:
    input_path (str): Path to the directory containing CSV files
    """
    # Iterate through all subfolders in the input path
    for company_folder in os.listdir(input_path):
        company_path = os.path.join(input_path, company_folder)
        
        # Skip if not a directory
        if not os.path.isdir(company_path):
            continue
        
        # Find all CSV files in the folder
        csv_files = [f for f in os.listdir(company_path) if f.endswith('.csv')]
        
        for csv_file in csv_files:
            # Full path to the CSV file
            file_path = os.path.join(company_path, csv_file)
            
            # Read the CSV file
            df = pd.read_csv(file_path)
            
            # Remove empty columns from the right
            while df.columns.size > 0:
                # Check the rightmost column
                rightmost_col = df.columns[-1]
                
                # If the column is empty (only NaN or empty values)
                if df[rightmost_col].isna().all() or (df[rightmost_col] == '').all():
                    df = df.drop(columns=[rightmost_col])
                else:
                    # Stop if we find a non-empty column
                    break
            
            # Save the modified CSV
            df.to_csv(file_path, index=False)
            print(f"Processed: {file_path}")

# Example usage
# Replace 'YOUR_ROOT_FOLDER_PATH' with the actual path to your root folder
input_path = '/Users/hemantg/Documents/final-companies-dataset'
remove_empty_columns(input_path)

Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_quarterly_results.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_balance_sheet.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_cash_flows.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_quarterly_results.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_profit_&_loss.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_balance_sheet.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_cash_flows.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_balance_sheet.csv
Processed: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_quarterly_results.csv
Processed: /Users/

In [21]:
import os
import pandas as pd

def filter_march_columns(input_folder):
    # List of file types to process
    file_types = [
        'balance_sheet',
        'cash_flows', 
        'profit_&_loss'
    ]
    
    # Iterate through all subfolders in the input folder
    for company_folder in os.listdir(input_folder):
        company_path = os.path.join(input_folder, company_folder)
        
        # Skip if not a directory
        if not os.path.isdir(company_path):
            continue
        
        # Process each file type
        for file_type in file_types:
            # Construct full file path
            filename = f"{company_folder}_{file_type}.csv"
            file_path = os.path.join(company_path, filename)
            
            # Check if file exists
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue
            
            # Read the CSV
            df = pd.read_csv(file_path)
            
            # Filter columns to keep only those with 'Mar' in the name
            march_columns = ['Unnamed: 0'] + [col for col in df.columns if 'Mar' in str(col)]
            filtered_df = df[march_columns]
            
            # Overwrite the existing CSV
            filtered_df.to_csv(file_path, index=False)
            
            print(f"Updated: {file_path}")

# Example usage
input_folder = '/Users/hemantg/Documents/final-companies-dataset'
filter_march_columns(input_folder)

Updated: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_balance_sheet.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_cash_flows.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/CEIGALL/CEIGALL_profit_&_loss.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_balance_sheet.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_cash_flows.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/MAPMYINDIA/MAPMYINDIA_profit_&_loss.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_balance_sheet.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_cash_flows.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/SHRIRAMPPS/SHRIRAMPPS_profit_&_loss.csv
Updated: /Users/hemantg/Documents/final-companies-dataset/CYIENTDLM/CYIENTDLM_balance_sheet.csv
Updated: /Users/hemantg/Documents/final-companies

In [26]:
import os
import pandas as pd

def process_balance_sheet(file_path):
    return process_yearly_file(file_path, "balance_sheet")

def process_cash_flows(file_path):
    return process_yearly_file(file_path, "cash_flows")

def process_profit_and_loss(file_path):
    return process_yearly_file(file_path, "profit_and_loss")

def process_quarterly_results(file_path):
    return process_quarterly_file(file_path)

def process_yearly_file(file_path, data_type):
    df = pd.read_csv(file_path, index_col=0)
    max_col_index = len(df.columns) - 1
    processed_data = {}
    
    for row_name in df.index:
        for col_index, col_name in enumerate(reversed(df.columns)):
            year_label = f"{row_name}{data_type}_year{col_index+1}"
            processed_data[year_label] = df.at[row_name, col_name] if col_index <= max_col_index else None
    
    return processed_data

def process_quarterly_file(file_path):
    df = pd.read_csv(file_path, index_col=0)
    max_col_index = len(df.columns) - 1
    processed_data = {}
    
    for row_name in df.index:
        for col_index, col_name in enumerate(reversed(df.columns)):
            quarter_label = f"{row_name}quarter{col_index+1}"
            processed_data[quarter_label] = df.at[row_name, col_name] if col_index <= max_col_index else None
    
    return processed_data

def main(input_folder, output_csv):
    all_data = []
    
    for company_folder in os.listdir(input_folder):
        company_path = os.path.join(input_folder, company_folder)
        if not os.path.isdir(company_path):
            continue
        
        company_data = {'CompanyName': company_folder}
        
        for file_name in os.listdir(company_path):
            file_path = os.path.join(company_path, file_name)
            if "balance_sheet" in file_name:
                company_data.update(process_balance_sheet(file_path))
            elif "cash_flows" in file_name:
                company_data.update(process_cash_flows(file_path))
            elif "profit_&loss" in file_name:
                company_data.update(process_profit_and_loss(file_path))
            elif "quarterly_results" in file_name:
                company_data.update(process_quarterly_results(file_path))
        
        all_data.append(company_data)
    
    result_df = pd.DataFrame(all_data)
    result_df.to_csv(output_csv, index=False)

if __name__ == "__main__":
    input_folder = "/Users/hemantg/Documents/final-companies-dataset"
    output_csv = "/Users/hemantg/Documents/output-final-data.csv"
    main(input_folder, output_csv)

In [27]:
import pandas as pd

# Load both CSV files
csv1_path = '/Users/hemantg/Documents/output-final-data.csv'  # Replace with actual file path
csv2_path = '/Users/hemantg/Documents/sentiment_output_final.csv'  # Replace with actual file path

df_original = pd.read_csv(csv1_path, sep=',')  # Assuming tab-separated

df_sentiment = pd.read_csv(csv2_path, sep=',')

# Standardize company name columns
original_company_col = 'CompanyName'
sentiment_company_col = 'Company Name'

df_original[original_company_col] = df_original[original_company_col].str.strip().str.upper()
df_sentiment[sentiment_company_col] = df_sentiment[sentiment_company_col].str.strip().str.upper()

# Merge dataframes on the company name columns
merged_df = pd.merge(df_original, df_sentiment, 
                     left_on=original_company_col, 
                     right_on=sentiment_company_col, 
                     how='left')

# Drop duplicate company name column from the second CSV
merged_df.drop(columns=[sentiment_company_col], inplace=True)

# Save the updated CSV
output_path = '/Users/hemantg/Documents/merged_output.csv'  # Replace with desired file path
merged_df.to_csv(output_path, index=False, sep=',')

print(f'Merged file saved at {output_path}')


Merged file saved at /Users/hemantg/Documents/merged_output.csv


In [29]:
import pandas as pd

# Load the CSV files
csv1 = pd.read_csv('/Users/hemantg/Documents/merged_output.csv')
csv2 = pd.read_csv('/Users/hemantg/Downloads/cleaned_data.csv')

# Standardize column names to simplify matching
# Standardize column names to simplify matching
csv1['CompanyName'] = csv1['CompanyName'].str.strip().str.upper()
csv2['NSE Ticker'.strip()] = csv2['NSE Ticker'.strip()].str.strip().str.upper()

# Merge the dataframes based on the matching columns
merged_df = pd.merge(csv1, csv2[['NSE Ticker', 'IPO Size', 'Subscription', 'GMP', 'IPO Price', 'Listing Price Value', 'Closing Price Value', 'Outstanding Shares', 'Market Cap']], 
                     left_on='CompanyName', right_on='NSE Ticker', how='left')

# Drop the redundant column from the merge
merged_df.drop('NSE Ticker', axis=1, inplace=True)

# Save the merged dataframe to a new CSV file
merged_df.to_csv('/Users/hemantg/Documents/merged_final-to-be-used.csv', index=False)

print("CSV files merged successfully and saved as 'merged_csv.csv'")


CSV files merged successfully and saved as 'merged_csv.csv'
