In [1]:
import os
import pandas as pd
import chardet


In [2]:
def detect_encoding(file_path):
    """
    Guess the character encoding of a file using chardet.

    The whole file is read into memory as bytes and handed to
    ``chardet.detect``; only the "encoding" entry of the result is returned.

    Args:
        file_path (str): The path to the file to inspect.

    Returns:
        The value stored under the "encoding" key of chardet's result.
    """
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()

    detection = chardet.detect(raw_bytes)
    return detection["encoding"]


def _encodings_for(filepath):
    """
    Return the encodings to try for ``filepath``, most specific first.

    The order matters: "latin1" maps every possible byte, so it can never fail
    and must come last. "cp1252" is tried before it because it rejects a few
    undefined bytes and maps 0x80-0x9F to printable characters instead of
    control codes. "utf-16" is only offered when the file starts with a UTF-16
    byte-order mark.
    """
    candidates = ["utf-8", "cp1252", "latin1"]
    try:
        with open(filepath, "rb") as handle:
            head = handle.read(2)
    except OSError:
        # Let pd.read_csv report the problem through the caller's handler.
        return candidates
    if head in (b"\xff\xfe", b"\xfe\xff"):
        candidates.insert(0, "utf-16")
    return candidates


def remove_invalid_codec_rows(directory, output_directory="cleaned_data"):
    """
    Re-encodes every CSV file in a directory as UTF-8, attempting multiple encodings.

    Each ``*.csv`` file in ``directory`` is read with the first encoding that
    decodes it cleanly and written under the same name to ``output_directory``.
    Despite the function name, no rows are dropped: a file either decodes with
    one of the candidate encodings or is skipped entirely.

    Cells are read as text (``dtype=str``, ``keep_default_na=False``) so that
    re-encoding does not reformat values such as ``007`` or ``NA``.

    Args:
        directory (str): The path to the directory containing the CSV files.
        output_directory (str): The path to the directory where the UTF-8 copies
            will be saved. It is created if missing. Defaults to 'cleaned_data'.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        filepath = os.path.join(directory, filename)
        df = None
        for encoding in _encodings_for(filepath):
            try:
                df = pd.read_csv(
                    filepath, encoding=encoding, dtype=str, keep_default_na=False
                )
                print(f"Successfully read {filename} with encoding: {encoding}")
                break  # First clean decode wins
            except UnicodeError:
                # Covers UnicodeDecodeError and other codec failures; try the next one
                print(f"Failed to decode {filename} with {encoding}.")
            except Exception as e:
                # Not an encoding problem (e.g. malformed CSV); other encodings won't help
                print(f"An unexpected error occurred: {e}")
                break

        if df is None:
            print(f"Could not decode {filename} with any of the tried encodings.")
            continue

        output_filepath = os.path.join(output_directory, filename)
        try:
            df.to_csv(output_filepath, index=False, encoding="utf-8")  # Always save as UTF-8
            print(
                f"Removed invalid codec rows from: {filename} and saved to {output_filepath}"
            )
        except Exception as e:
            print(f"Error saving {filename}: {e}")


# Read every CSV in "renamed_data" and write UTF-8 copies to the default
# output directory ("cleaned_data")
remove_invalid_codec_rows("renamed_data")


Successfully read Aaditya Raj.csv with encoding: utf-8
Removed invalid codec rows from: Aaditya Raj.csv and saved to cleaned_data\Aaditya Raj.csv
Successfully read Abhishek Singh.csv with encoding: utf-8
Removed invalid codec rows from: Abhishek Singh.csv and saved to cleaned_data\Abhishek Singh.csv
Successfully read Aditya Singh.csv with encoding: utf-8
Removed invalid codec rows from: Aditya Singh.csv and saved to cleaned_data\Aditya Singh.csv
Successfully read Afzal Raza.csv with encoding: utf-8
Removed invalid codec rows from: Afzal Raza.csv and saved to cleaned_data\Afzal Raza.csv
Successfully read Ajay Jatav.csv with encoding: utf-8
Removed invalid codec rows from: Ajay Jatav.csv and saved to cleaned_data\Ajay Jatav.csv
Successfully read Ajit Yadav.csv with encoding: utf-8
Removed invalid codec rows from: Ajit Yadav.csv and saved to cleaned_data\Ajit Yadav.csv
Successfully read Akanksha Kushwaha.csv with encoding: utf-8
Removed invalid codec rows from: Akanksha Kushwaha.csv and s

In [3]:
def remove_blank_rows(directory):
    """
    Removes blank rows from all CSV files in a directory, modifying the files in place.

    A row is blank when every field in it is empty. Files are read as text
    (``dtype=str``) and only empty fields count as missing, so the rewrite
    does not alter the remaining cells: values such as ``007`` or ``NA`` are
    written back verbatim instead of being turned into numbers or blanks.

    Args:
        directory (str): The path to the directory containing the CSV files.
    """
    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        filepath = os.path.join(directory, filename)
        try:
            df = pd.read_csv(
                filepath,
                dtype=str,  # keep every cell as the text that is in the file
                keep_default_na=False,  # "NA", "null", ... are data, not blanks
                na_values=[""],  # only truly empty fields are missing
            )
            df = df.dropna(how="all")  # Remove rows where all values are missing

            # Save the cleaned file, overwriting the original
            df.to_csv(filepath, index=False)
            print(f"Removed blank rows from: {filename}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")


# Operate on the files in "cleaned_data"; each CSV there is overwritten in place
remove_blank_rows("cleaned_data")


Removed blank rows from: Aaditya Raj.csv
Removed blank rows from: Abhishek Singh.csv
Removed blank rows from: Aditya Singh.csv
Removed blank rows from: Afzal Raza.csv
Removed blank rows from: Ajay Jatav.csv
Removed blank rows from: Ajit Yadav.csv
Removed blank rows from: Akanksha Kushwaha.csv
Removed blank rows from: Alok Raj.csv
Removed blank rows from: Aman Adarsh.csv
Removed blank rows from: Aman Singh.csv
Removed blank rows from: Aman Verma.csv
Removed blank rows from: Amit Kumar.csv
Removed blank rows from: Anamika Kumari.csv
Removed blank rows from: Anand Pandey.csv
Removed blank rows from: Anand Singh.csv
Removed blank rows from: Anoop Kumar.csv
Removed blank rows from: Anshu Kumar.csv
Removed blank rows from: Anuradha Tiwari.csv
Removed blank rows from: Anushri Mishra.csv
Removed blank rows from: Aradhya Patel.csv
Removed blank rows from: Arjun Kadam.csv
Removed blank rows from: Arpita Tripathi.csv
Removed blank rows from: Arun Singh.csv
Removed blank rows from: Aryan Saini.csv

In [4]:
def remove_rows_before_header(directory, header_name="First Name"):
    """
    Drops everything above the real header row in each CSV file of a directory.

    Every file is loaded without a header. The first row that contains a cell
    equal to ``header_name`` becomes the column header, the rows above it are
    discarded, and the file is overwritten. Files with no such row are left
    untouched and reported.

    Args:
        directory (str): The path to the directory containing the CSV files.
        header_name (str): A cell value that identifies the header row.  Defaults to 'First Name'.
    """
    csv_names = [name for name in os.listdir(directory) if name.endswith(".csv")]

    for filename in csv_names:
        filepath = os.path.join(directory, filename)
        try:
            # No header yet: every line, including the real header, is data
            raw = pd.read_csv(filepath, header=None)

            # Label of the first row holding the header cell, or None
            header_pos = next(
                (
                    label
                    for label, cells in raw.iterrows()
                    if header_name in cells.values
                ),
                None,
            )

            if header_pos is None:
                print(f"Header '{header_name}' not found in: {filename}")
                continue

            # Promote the matching row to column labels, keep only what follows it
            raw.columns = raw.iloc[header_pos]
            trimmed = raw[header_pos + 1 :]
            trimmed.columns.name = None  # Drop the leftover row label on the columns
            trimmed.reset_index(drop=True, inplace=True)

            trimmed.to_csv(filepath, index=False)  # Overwrite the source file
            print(f"Removed rows before header from: {filename}")

        except Exception as e:
            print(f"Error processing {filename}: {e}")


# Run on the files in "cleaned_data" with the default header_name ("First Name");
# each CSV is overwritten in place
remove_rows_before_header("cleaned_data")


Removed rows before header from: Aaditya Raj.csv
Removed rows before header from: Abhishek Singh.csv
Removed rows before header from: Aditya Singh.csv
Removed rows before header from: Afzal Raza.csv
Removed rows before header from: Ajay Jatav.csv
Removed rows before header from: Ajit Yadav.csv
Removed rows before header from: Akanksha Kushwaha.csv
Removed rows before header from: Alok Raj.csv
Removed rows before header from: Aman Adarsh.csv
Removed rows before header from: Aman Singh.csv
Removed rows before header from: Aman Verma.csv
Removed rows before header from: Amit Kumar.csv
Removed rows before header from: Anamika Kumari.csv
Removed rows before header from: Anand Pandey.csv
Removed rows before header from: Anand Singh.csv
Removed rows before header from: Anoop Kumar.csv
Removed rows before header from: Anshu Kumar.csv
Removed rows before header from: Anuradha Tiwari.csv
Removed rows before header from: Anushri Mishra.csv
Removed rows before header from: Aradhya Patel.csv
Remove

In [5]:
def remove_extra_columns(
    directory, columns_to_keep=("First Name", "Last Name", "Company")
):
    """
    Removes all columns except the wanted ones from all CSV files in a directory.

    Each file is overwritten with only the columns from ``columns_to_keep``
    that it actually contains, in the order given. A file that contains none
    of them is left unchanged (and reported) rather than being overwritten
    with an empty table. Cells are read as text so the rewrite does not
    reformat the values that are kept.

    Args:
        directory (str): The path to the directory containing the CSV files.
        columns_to_keep (iterable of str): Column names to retain. Defaults to
            'First Name', 'Last Name', and 'Company'.
    """
    wanted = list(columns_to_keep)

    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        filepath = os.path.join(directory, filename)
        try:
            df = pd.read_csv(filepath, dtype=str, keep_default_na=False)

            # Select only the desired columns, handling potential missing columns
            existing_columns = [col for col in wanted if col in df.columns]
            if not existing_columns:
                # Writing df[[]] back would wipe the file's contents
                print(f"None of {wanted} found in: {filename}; file left unchanged")
                continue

            df[existing_columns].to_csv(filepath, index=False)  # Save back to the same file
            print(f"Removed extra columns from: {filename}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")


# Run on the files in "cleaned_data"; each CSV is overwritten in place
remove_extra_columns("cleaned_data")


Removed extra columns from: Aaditya Raj.csv
Removed extra columns from: Abhishek Singh.csv
Removed extra columns from: Aditya Singh.csv
Removed extra columns from: Afzal Raza.csv
Removed extra columns from: Ajay Jatav.csv
Removed extra columns from: Ajit Yadav.csv
Removed extra columns from: Akanksha Kushwaha.csv
Removed extra columns from: Alok Raj.csv
Removed extra columns from: Aman Adarsh.csv
Removed extra columns from: Aman Singh.csv
Removed extra columns from: Aman Verma.csv
Removed extra columns from: Amit Kumar.csv
Removed extra columns from: Anamika Kumari.csv
Removed extra columns from: Anand Pandey.csv
Removed extra columns from: Anand Singh.csv
Removed extra columns from: Anoop Kumar.csv
Removed extra columns from: Anshu Kumar.csv
Removed extra columns from: Anuradha Tiwari.csv
Removed extra columns from: Anushri Mishra.csv
Removed extra columns from: Aradhya Patel.csv
Removed extra columns from: Arjun Kadam.csv
Removed extra columns from: Arpita Tripathi.csv
Removed extra 

In [6]:
def sort_by_first_name(directory):
    """
    Reorders the rows of every CSV file in a directory by its 'First Name' column.

    Files are rewritten in place. A file that has no 'First Name' column is
    reported and left as it is.

    Args:
        directory (str): The path to the directory containing the CSV files.
    """
    sort_key = "First Name"

    for filename in (name for name in os.listdir(directory) if name.endswith(".csv")):
        filepath = os.path.join(directory, filename)
        try:
            table = pd.read_csv(filepath)

            # Nothing to sort on: report and move to the next file
            if sort_key not in table.columns:
                print(f"'First Name' column not found in {filename}")
                continue

            ordered = table.sort_values(by=sort_key)
            ordered.to_csv(filepath, index=False)  # Overwrite the source file
            print(f"Sorted {filename} by 'First Name'")
        except Exception as e:
            print(f"Error processing {filename}: {e}")


# Sort the files in "cleaned_data"; each CSV with a 'First Name' column is overwritten in place
sort_by_first_name("cleaned_data")


Sorted Aaditya Raj.csv by 'First Name'
Sorted Abhishek Singh.csv by 'First Name'
Sorted Aditya Singh.csv by 'First Name'
Sorted Afzal Raza.csv by 'First Name'
Sorted Ajay Jatav.csv by 'First Name'
Sorted Ajit Yadav.csv by 'First Name'
Sorted Akanksha Kushwaha.csv by 'First Name'
Sorted Alok Raj.csv by 'First Name'
Sorted Aman Adarsh.csv by 'First Name'
Sorted Aman Singh.csv by 'First Name'
Sorted Aman Verma.csv by 'First Name'
Sorted Amit Kumar.csv by 'First Name'
Sorted Anamika Kumari.csv by 'First Name'
Sorted Anand Pandey.csv by 'First Name'
Sorted Anand Singh.csv by 'First Name'
Sorted Anoop Kumar.csv by 'First Name'
Sorted Anshu Kumar.csv by 'First Name'
Sorted Anuradha Tiwari.csv by 'First Name'
Sorted Anushri Mishra.csv by 'First Name'
Sorted Aradhya Patel.csv by 'First Name'
Sorted Arjun Kadam.csv by 'First Name'
Sorted Arpita Tripathi.csv by 'First Name'
Sorted Arun Singh.csv by 'First Name'
Sorted Aryan Saini.csv by 'First Name'
Sorted Ashwin Yadav.csv by 'First Name'
Sorted 