In [None]:
import pandas as pd
import gc

# Number of rows read per chunk when streaming each Stata file; lower it if memory is tight
chunk_size = 100000

# Folder holding the raw yearly natality files; the filtered CSVs are written alongside them
data_dir = '/content/drive/MyDrive/Current Research/Water/birth-raw'

# Value of 'stateres' to keep (compared as a string)
state_code = "16"

# Process each yearly file, 1970 through 1988 (two-digit suffixes 70..88)
for year in range(70, 89):
    file_path = f'{data_dir}/natalityus19{year}.dta'

    # Collect filtered chunks in a list and concatenate once per year:
    # calling pd.concat inside the chunk loop re-copies all accumulated rows each time.
    filtered_chunks = []

    # The context manager closes the underlying Stata file even if a chunk fails to parse
    with pd.read_stata(file_path, chunksize=chunk_size) as reader:
        for chunk in reader:
            # Keep only rows for the selected state and tag them with the four-digit
            # year in 'yr'. .assign() returns a new frame, so nothing is written into
            # a slice of `chunk` (which would raise SettingWithCopyWarning).
            filtered_chunk = chunk.loc[chunk['stateres'] == state_code].assign(yr=1900 + year)
            filtered_chunks.append(filtered_chunk)

    # pd.concat raises on an empty list, so fall back to an empty frame if the file had no chunks
    if filtered_chunks:
        filtered_df = pd.concat(filtered_chunks, ignore_index=True)
    else:
        filtered_df = pd.DataFrame()

    # Save this year's filtered rows next to the raw file
    filtered_df.to_csv(f'{data_dir}/natalityus19{year}_filtered.csv', index=False)

    # Drop this year's data before loading the next file
    del filtered_df, filtered_chunks
    gc.collect()


In [None]:
# Combine the per-year filtered CSVs (1970-1988) into a single dataframe.
data_dir = '/content/drive/MyDrive/Current Research/Water/birth-raw'

# Read every yearly filtered file. 'stateres' is loaded as a category to reduce memory use.
# Each frame stays referenced by df_list until the concat below, so deleting the loop
# variable or calling gc.collect() per file would not free anything.
df_list = [
    pd.read_csv(
        f'{data_dir}/natalityus19{year}_filtered.csv',
        dtype={'stateres': 'category'},
    )
    for year in range(70, 89)
]

# Stack the yearly frames into one dataframe with a fresh 0..n-1 index
df_7088 = pd.concat(df_list, ignore_index=True)

# Save the combined dataframe
df_7088.to_csv(f'{data_dir}/natalityus1970-1988.csv', index=False)

# Report the size of the combined dataframe
print(f"Shape of df_7088: {df_7088.shape}")
