In [1]:
import pandas as pd
import os
import math

def sample_csv_to_size(input_path, output_path, target_size_mb=20, seed=42):
    """
    Sample rows from a CSV file to create a new CSV of approximately target_size_mb.

    The input is loaded fully into memory with pandas. A first sample is drawn
    using the ratio of target size to input size; if the written file misses the
    target by more than 10%, the fraction is corrected once using the observed
    output size and the sample is redrawn (same seed). The correction is capped
    at 1.0, so the output never contains more rows than the input even if that
    leaves it smaller than the target.

    Parameters:
    input_path (str): Path to input CSV file
    output_path (str): Path where the sampled CSV will be saved
    target_size_mb (float): Desired size of output file in megabytes (must be > 0)
    seed (int): Random seed for reproducibility

    Returns:
    tuple: (actual_size_mb, sample_fraction) where actual_size_mb is the size on
    disk of the file written to output_path and sample_fraction is the fraction
    of rows that was kept (1.0 when nothing was dropped).

    Raises:
    ValueError: If target_size_mb is not a positive number.
    """
    # `not x > 0` (rather than `x <= 0`) also rejects NaN.
    if not target_size_mb > 0:
        raise ValueError(
            f"target_size_mb must be a positive number, got {target_size_mb!r}"
        )

    bytes_per_mb = 1024 * 1024
    original_size_mb = os.path.getsize(input_path) / bytes_per_mb

    # Input already fits: keep every row. The file is still round-tripped
    # through pandas, which can change its byte size, so report the size of
    # what was actually written rather than the size of the input.
    if original_size_mb <= target_size_mb:
        pd.read_csv(input_path).to_csv(output_path, index=False)
        return os.path.getsize(output_path) / bytes_per_mb, 1.0

    # First guess: assume output bytes scale linearly with the row fraction.
    sample_fraction = target_size_mb / original_size_mb

    df = pd.read_csv(input_path)
    sampled_df = df.sample(frac=sample_fraction, random_state=seed)
    sampled_df.to_csv(output_path, index=False)
    actual_size_mb = os.path.getsize(output_path) / bytes_per_mb

    # Adjust sampling if we're off by more than 10%
    if abs(actual_size_mb - target_size_mb) / target_size_mb > 0.1:
        # Ratio of bytes actually written to bytes the first guess expected.
        compression_ratio = actual_size_mb / (original_size_mb * sample_fraction)

        # A zero ratio would mean an empty output file; nothing to correct from.
        if compression_ratio > 0:
            new_sample_fraction = (target_size_mb / original_size_mb) / compression_ratio
            # Sampling without replacement cannot return more rows than exist;
            # an uncapped fraction > 1 makes DataFrame.sample raise ValueError.
            new_sample_fraction = min(1.0, new_sample_fraction)

            sampled_df = df.sample(frac=new_sample_fraction, random_state=seed)
            sampled_df.to_csv(output_path, index=False)
            actual_size_mb = os.path.getsize(output_path) / bytes_per_mb
            return actual_size_mb, new_sample_fraction

    return actual_size_mb, sample_fraction

In [2]:
# Load the full citations CSV into memory, then show its first five rows.
csv = pd.read_csv(
    '/Users/kat.chua/Downloads/inspections-citations.csv'
)
csv.head(n=5)

Unnamed: 0,hash_id,code,kind,repeat,desc,narrative
0,c14125ba5346e5c4,3.125(a),,False,"FACILITIES, GENERAL.","At the south farm, a goat was observed to jump..."
1,090414fb43ad755a,2.33(b)(2),,False,ATTENDING VETERINARIAN AND ADEQUATE VETERINARY...,"Two bottles of Heparin (one partially used, on..."
2,553396bb9bd960ea,2.33(b)(2),,False,ATTENDING VETERINARIAN AND ADEQUATE VETERINARY...,One expired bottle of Isoflourane (expiration ...
3,6f536be1f760dfb6,3.125(a),Critical,False,"Facilities, general.",According to facility observational and/or hea...
4,d8fb5331fdf6ef92,2.32(a),,False,PERSONNEL QUALIFICATIONS.,"In August 2015, a cynomolgus macaque placed un..."


In [3]:
# Write a ~20 MB random subset of the citations CSV alongside the original;
# the cell displays the returned (actual_size_mb, sample_fraction) tuple.
sample_csv_to_size(
    input_path='/Users/kat.chua/Downloads/inspections-citations.csv',
    output_path='/Users/kat.chua/Downloads/inspections-citations_smaller.csv',
    target_size_mb=20,
    seed=42,
)

(20.023454666137695, 0.5590170904754896)