# Script to split a CSV file by record count

### Update the file path below, then run the cell to get the row count of a CSV

In [4]:
import csv

def count_rows(file_path):
    """
    Counts the number of data rows in a CSV file.

    The first row is treated as the header and is not counted. Rows are
    counted as parsed by ``csv.reader``, so a quoted field that spans several
    physical lines still counts as a single row.

    :param file_path: Path to the CSV file (read as UTF-8).
    :return: Total number of rows in the CSV (excluding the header); 0 when
        the file is empty or contains only a header.
    """
    with open(file_path, 'r', newline='', encoding='utf-8') as csv_file:
        reader = csv.reader(csv_file)
        # Skip the header row. Passing a default keeps an empty file from
        # raising StopIteration; the reader is then exhausted and the sum is 0.
        next(reader, None)
        return sum(1 for _ in reader)

# Example usage: point `input_csv` at the file to inspect, then run this cell.
input_csv = '/Users/jlcavazos/Learn Python/job_related/python_test_big_file.csv'  # Replace with your CSV file path
total_rows = count_rows(input_csv)
print(
    f"The CSV file '{input_csv}' contains {total_rows} rows (excluding the header)."
)


The CSV file '/Users/jlcavazos/Learn Python/job_related/python_test_big_file.csv' contains 49555 rows (excluding the header).


### Update the path to the large CSV file
### Set the number of records per split file
### Set the path of the folder where the split files will be written

In [3]:
import csv
import os

def split_csv(file_path, row_limit, output_dir):
    """
    Splits a CSV file into smaller files with a specified number of rows.

    The input's first row is treated as the header and is repeated at the top
    of every split file. Output files are named ``split_1.csv``,
    ``split_2.csv``, ... inside ``output_dir``; the last file holds whatever
    rows remain. At most ``row_limit`` rows are buffered in memory at a time.

    :param file_path: Path to the input CSV file (read as UTF-8).
    :param row_limit: Number of rows per split file (excluding the header).
        Must be at least 1.
    :param output_dir: Directory where the split files will be saved. It is
        created if it does not exist.
    :raises ValueError: If ``row_limit`` is less than 1.
    """
    # A limit of zero or less could never be reached, which would silently
    # dump every row into a single file instead of splitting.
    if row_limit < 1:
        raise ValueError(f"row_limit must be at least 1, got {row_limit!r}")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Open the input CSV file
    with open(file_path, 'r', newline='', encoding='utf-8') as csv_file:
        reader = csv.reader(csv_file)
        header = next(reader, None)  # None when the file has no rows at all
        if header is None:
            print(f"'{file_path}' is empty; nothing to split.")
            return

        file_count = 1
        rows = []

        for row in reader:
            rows.append(row)
            if len(rows) >= row_limit:
                # Flush a full chunk and start collecting the next one
                _write_split_file(output_dir, file_count, header, rows)
                file_count += 1
                rows = []

        # Write any remaining rows to a final split file
        if rows:
            _write_split_file(output_dir, file_count, header, rows)


def _write_split_file(output_dir, file_count, header, rows):
    """Writes the header plus ``rows`` to ``split_<file_count>.csv`` in ``output_dir``."""
    output_file = os.path.join(output_dir, f'split_{file_count}.csv')
    with open(output_file, 'w', newline='', encoding='utf-8') as split_file:
        writer = csv.writer(split_file)
        writer.writerow(header)  # Every split file carries the header
        writer.writerows(rows)
    print(f"Created {output_file} with {len(rows)} rows.")

# Example usage: set the three values below, then run this cell.
input_csv = '/Users/jlcavazos/Learn Python/job_related/python_test_big_file.csv'  # Replace with your large CSV file
row_limit = 50000  # Number of rows per split file
output_directory = '/Users/jlcavazos/Learn Python/job_related/split_folder'  # Directory to save the split files

# Arguments are passed by keyword so each value is bound to the intended parameter.
split_csv(file_path=input_csv, row_limit=row_limit, output_dir=output_directory)


Created /Users/jlcavazos/Learn Python/job_related/split_folder/split_1.csv with 49555 rows.


# Draft 2: row-by-row streaming version (redefines `split_csv`; note the different argument order)

In [7]:
import csv
import os

def split_csv(input_file, output_dir, rows_per_file):
    """
    Splits a large CSV file into smaller CSVs, repeating the input's header row at the top of every split file.

    Rows are streamed from the input one at a time and written straight to the
    current output file, so only a single row is held in memory. Output files
    are named ``split_part1.csv``, ``split_part2.csv``, ... inside
    ``output_dir``. Errors raised while reading or writing are caught and
    reported with a printed message instead of propagating to the caller.

    :param input_file: Path to the input CSV file (read as UTF-8).
    :param output_dir: Directory to save the split CSV files. It is created if it does not exist.
    :param rows_per_file: Number of rows in each split file (excluding the header row). Must be at least 1.
    """
    # A limit of zero or less could never be reached, which would silently
    # write every row into a single file instead of splitting.
    if rows_per_file < 1:
        print(f"Error: rows_per_file must be at least 1 (got {rows_per_file}).")
        return

    os.makedirs(output_dir, exist_ok=True)

    try:
        # newline='' lets the csv module handle line endings itself, so
        # newlines embedded in quoted fields are preserved exactly.
        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            header = next(reader, None)  # None when the file has no rows at all
            if header is None:
                print(f"Error: The file {input_file} is empty.")
                return

            file_count = 0
            rows_written = 0

            outfile = None
            writer = None

            try:
                for row in reader:
                    # Create a new output file when necessary
                    if rows_written == 0:
                        if outfile:
                            outfile.close()
                        file_count += 1
                        outfile_path = os.path.join(output_dir, f"split_part{file_count}.csv")
                        outfile = open(outfile_path, 'w', newline='', encoding='utf-8')
                        writer = csv.writer(outfile)
                        writer.writerow(header)  # Write the header row

                    writer.writerow(row)
                    rows_written += 1

                    # Reset the counter so the next row starts a new file
                    if rows_written >= rows_per_file:
                        rows_written = 0
            finally:
                # Close the last file even if reading or writing failed part-way
                if outfile:
                    outfile.close()

        print(f"CSV file split into {file_count} parts successfully. Files saved in: {output_dir}")

    except FileNotFoundError:
        print(f"Error: The file {input_file} was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage: set the three values below, then run this cell.
input_csv_file = "/Users/jlcavazos/Learn Python/job_related/python_test_big_file.csv"  # Replace with the path to your input file
output_directory = "/Users/jlcavazos/Learn Python/job_related/split_folder"  # Replace with your desired output directory
rows_per_split = 5000  # Adjust the number of rows per split (excluding header)

# Arguments are passed by keyword so each value is bound to the intended parameter.
split_csv(input_file=input_csv_file, output_dir=output_directory, rows_per_file=rows_per_split)


CSV file split into 10 parts successfully. Files saved in: /Users/jlcavazos/Learn Python/job_related/split_folder
