### Data: Population 1970-1980

- Convert txt to CSV

In [None]:
# Disabled cell: the whole conversion script below is wrapped in a string
# literal, so none of it executes when the cell runs.
# As written, the script would read the whitespace-delimited 1970-1980
# population text file, rename the first header field to 'State', rewrite the
# header fields from index 2 onward as '19' plus the third '/'-separated part
# of each field, and save header and rows as a CSV file.
"""
import csv

txt_file = '../datasets/original/1970-1980_population.txt'
csv_file = '../datasets/original/1970-1980_population.csv'

with open(txt_file, 'r') as file:
    lines = file.readlines()

# Extracting header and data rows
header = [header.strip() for header in lines[0].split()]
data_rows = [row.strip().split() for row in lines[1:]]

# Renaming "Fip AL" to "State"
header[0] = 'State'

# Modifying State column to keep only "AL"
for row in data_rows:
    row[0] = row[0].strip()

# Modifying year format
for i in range(2, len(header)):
    year = header[i].split('/')[2]
    header[i] = '19' + year

# Writing to CSV file
with open(csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(data_rows)

print(f"Dataset transformed and saved as {csv_file}.")
"""

Agrupar idades em 3 grupos
- Jovens (0-29)
- Adultos (30-64)
- Reformados (65+) 

In [51]:
import csv

# Years covered by the input: one population column per year, starting at
# column index 2 (column 0 is the state, column 1 the age-range label).
anos_cobertos = range(1970, 1981)

# Nested totals: estados[state][age_group][year] -> summed population
estados = {}

# Open the input CSV file (newline='' is the form the csv module documents for
# file objects passed to csv.reader)
with open('../datasets/original/1970-1980_population_by_state.csv', 'r', newline='') as arquivo_csv:
    leitor = csv.reader(arquivo_csv, delimiter=',')

    # Skip the header row; the default keeps an empty file from raising StopIteration
    next(leitor, None)

    for linha in leitor:
        # csv.reader yields an empty list for a blank line; there is nothing to add
        if not linha:
            continue

        estado = linha[0]
        # Remove the '+' marker and the 'M'/'F' letters from the age label
        idade_range = linha[1].replace('+', '').replace('M', '').replace('F', '')

        # The lower bound of the age range decides which group the row joins
        idade = int(idade_range.split('-')[0])

        # Three categories by lower bound: 0-29, 30-64, and everything else (65+)
        if 0 <= idade <= 29:
            idade_grupo = 'jovens'
        elif 30 <= idade < 65:
            idade_grupo = 'adultos'
        else:
            idade_grupo = 'reformados'

        # Create the state / age-group levels on first sight, then accumulate
        # every year column of this row into the running totals
        totais = estados.setdefault(estado, {}).setdefault(idade_grupo, {})
        for i, ano in enumerate(anos_cobertos):
            totais[ano] = totais.get(ano, 0) + int(linha[i + 2])

# Write the aggregated totals to a CSV file
output_file = '../datasets/1970-1980_population_by_state.csv'
with open(output_file, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['State', 'Age', *(str(ano) for ano in anos_cobertos)])

    for estado, faixas_etarias in estados.items():
        for faixa_etaria, anos in faixas_etarias.items():
            # Index by year explicitly so the columns always line up with the header
            writer.writerow([estado, faixa_etaria, *(anos[ano] for ano in anos_cobertos)])

        # One extra row per state with the total over all age groups for each year
        sum_by_year = [sum(anos[ano] for anos in faixas_etarias.values()) for ano in anos_cobertos]
        writer.writerow([estado, 'all', *sum_by_year])


### Data: Population 1970-1999

In [32]:
import pandas as pd

# Load the raw population table, leaving out its "fips" column
df = pd.read_csv('../datasets/original/pop7099s.csv').drop(columns="fips")

# Output categories, each mapped to its inclusive (lowest, highest) age
age_groups = {
    "Kids": (0, 29),
    "Adults": (30, 64),
    "Retired": (65, float('inf')),
}

# Function to extract the minimum age from the age group
def extract_min_age(age_group):
    """Return the lower bound of an ``"a-b"`` age-range label as an int.

    Any value that is not a string containing a hyphen is returned unchanged.
    """
    is_range_label = isinstance(age_group, str) and "-" in age_group
    if not is_range_label:
        return age_group
    lower_bound = age_group.split("-")[0]
    return int(lower_bound)

# Clean the age group labels. Every step goes through the .str accessor with
# regex=False so that '+', 'M' and 'F' are removed as literal substrings.
# (Series.replace, without .str, only replaces cells whose entire value
# equals the given string, so it would leave the labels untouched.)
df['agegr'] = (
    df['agegr']
    .str.replace('+', '', regex=False)
    .str.replace('M', '', regex=False)
    .str.replace('F', '', regex=False)
)

# Lower bound of each label as a number; labels that cannot be parsed become NaN
df['min_age'] = pd.to_numeric(df['agegr'].apply(extract_min_age), errors='coerce')

# Aggregate population for each state, year, and age group
aggregated_data = []
for (state, year), state_year_data in df.groupby(["state", "year"]):
    # Rows for 1970-1979 are classified by the lower bound of their age-group
    # label; rows for every other year are classified by the "age" column.
    age_column = "min_age" if 1970 <= year <= 1979 else "age"
    ages = state_year_data[age_column]

    for group, (age_min, age_max) in age_groups.items():
        # NaN ages fail both comparisons, so they are left out of every group
        in_group = (ages >= age_min) & (ages <= age_max)
        pop_sum = state_year_data.loc[in_group, "pop"].sum()
        aggregated_data.append([state, year, group, pop_sum])

# Create a new DataFrame with the aggregated data
aggregated_df = pd.DataFrame(aggregated_data, columns=["state", "year", "age_group", "population"])

# Save the aggregated DataFrame to a CSV file
aggregated_df.to_csv("../datasets/population70-99.csv", index=False)


  df = pd.read_csv('../datasets/original/pop7099s.csv')
