### Data: Population 1970-1999

Agrupar idades em 3 grupos
- Jovens (0-29)
- Adultos (30-64)
- Reformados (65+) 

In [1]:
"""
import pandas as pd

# Read data from CSV file
df = pd.read_csv('../datasets/original/pop7099s.csv')

# Rename the "fips" column to "state_id"
df = df.rename(columns={"fips": "state_id"})

# Define the age group categories
age_groups = {
    "Kids": (0, 29),
    "Adults": (30, 64),
    "Retired": (65, float('inf'))
}

# Function to extract the minimum age from the age group
def extract_min_age(age_group):
    if isinstance(age_group, str) and "-" in age_group:
        age_parts = age_group.split("-")
        min_age = int(age_parts[0])
        return min_age
    else:
        return age_group

# Clean the age group values
df['agegr'] = df['agegr'].str.replace('+', '').replace('M', '').replace('F', '')

# Extract the minimum age from the age group
df['min_age'] = df['agegr'].apply(extract_min_age)

# Convert min_age column to numeric
df['min_age'] = pd.to_numeric(df['min_age'], errors='coerce')

# Aggregate population data for each state, year, and age group
aggregated_data = []
for (state_id, state, year), state_year_data in df.groupby(["state_id", "state", "year"]):
    if 1970 <= year <= 1979:
        for group, age_range in age_groups.items():
            age_min, age_max = age_range
            pop_sum = state_year_data[(state_year_data["min_age"].notnull()) & (state_year_data["min_age"] >= age_min) & (state_year_data["min_age"] <= age_max)]["pop"].sum()
            aggregated_data.append([state_id, state, year, group, pop_sum])
    else:
        for group, age_range in age_groups.items():
            age_min, age_max = age_range
            pop_sum = state_year_data[(state_year_data["age"] >= age_min) & (state_year_data["age"] <= age_max)]["pop"].sum()
            aggregated_data.append([state_id, state, year, group, pop_sum])

    # Add a row with the "all" age group and total population
    all_pop_sum = state_year_data["pop"].sum()
    aggregated_data.append([state_id, state, year, "All", all_pop_sum])

# Calculate the total population for the United States by year
us_population_data = []
for year, year_data in df.groupby("year"):
    us_pop_sum = year_data["pop"].sum()
    us_population_data.append(["US", "United States", year, "All", us_pop_sum])

# Extend the aggregated data with the US population data
aggregated_data.extend(us_population_data)

# Create a new DataFrame with the aggregated data
aggregated_df = pd.DataFrame(aggregated_data, columns=["state_id", "state", "year", "age_group", "population"])
"""

'\nimport pandas as pd\n\n# Read data from CSV file\ndf = pd.read_csv(\'../datasets/original/pop7099s.csv\')\n\n# Rename the "fips" column to "state_id"\ndf = df.rename(columns={"fips": "state_id"})\n\n# Define the age group categories\nage_groups = {\n    "Kids": (0, 29),\n    "Adults": (30, 64),\n    "Retired": (65, float(\'inf\'))\n}\n\n# Function to extract the minimum age from the age group\ndef extract_min_age(age_group):\n    if isinstance(age_group, str) and "-" in age_group:\n        age_parts = age_group.split("-")\n        min_age = int(age_parts[0])\n        return min_age\n    else:\n        return age_group\n\n# Clean the age group values\ndf[\'agegr\'] = df[\'agegr\'].str.replace(\'+\', \'\').replace(\'M\', \'\').replace(\'F\', \'\')\n\n# Extract the minimum age from the age group\ndf[\'min_age\'] = df[\'agegr\'].apply(extract_min_age)\n\n# Convert min_age column to numeric\ndf[\'min_age\'] = pd.to_numeric(df[\'min_age\'], errors=\'coerce\')\n\n# Aggregate population dat

In [2]:
import pandas as pd

# Inclusive (lowest, highest) age bounds of each reporting bucket; the last
# bucket has no upper limit.
age_groups = dict(
    Kids=(0, 29),
    Adults=(30, 64),
    Retired=(65, float('inf')),
)

# Load the raw population table and expose its "fips" column under the name
# "state_id", which is the name the aggregation code below groups on.
df = pd.read_csv('../datasets/original/pop7099s.csv').rename(
    columns={"fips": "state_id"}
)

# Function to extract the minimum age from the age group
def extract_min_age(age_group):
    """Return the lower bound of a "low-high" age label.

    Args:
        age_group: A label such as "25-29", or any other value.

    Returns:
        The text before the first "-" converted to ``int`` when ``age_group``
        is a string containing "-"; otherwise ``age_group`` itself, unchanged
        (non-strings and strings without a dash pass straight through).

    Raises:
        ValueError: If the text before the first "-" is not a valid integer.
    """
    is_range_label = isinstance(age_group, str) and "-" in age_group
    if not is_range_label:
        return age_group
    lower_bound, *_ = age_group.split("-")
    return int(lower_bound)

# Clean the age group values.
# Series.replace() only swaps cells whose *entire* value equals the pattern,
# so it cannot remove an "M"/"F" marker embedded in a longer label. Each
# marker is therefore stripped as a literal substring through the .str
# accessor; regex=False keeps "+" from being read as a regex quantifier on
# any pandas version.
agegr_clean = df['agegr']
for marker in ('+', 'M', 'F'):
    agegr_clean = agegr_clean.str.replace(marker, '', regex=False)
df['agegr'] = agegr_clean

# Lower bound of every age-group label as a number; anything that cannot be
# parsed becomes NaN and is therefore excluded from the three bucket sums.
df['min_age'] = pd.to_numeric(df['agegr'].apply(extract_min_age), errors='coerce')

# Bucket limits, unpacked once instead of being looked up on every iteration.
kids_max = age_groups["Kids"][1]
adults_min, adults_max = age_groups["Adults"]
retired_min = age_groups["Retired"][0]

# Aggregate population data for each state and year: one row per
# (state_id, state, year) with the three bucket sums and the overall total.
aggregated_data = []
for (state_id, state, year), state_year_data in df.groupby(["state_id", "state", "year"]):
    # Rows for 1970-1979 are bucketed on the lower bound parsed from "agegr";
    # every other year is bucketed on the "age" column.
    # NOTE(review): presumably the 1970s rows only carry grouped ages - confirm.
    age_column = "min_age" if 1970 <= year <= 1979 else "age"
    ages = state_year_data[age_column]
    pop = state_year_data["pop"]

    # Comparisons against NaN evaluate to False, so rows without an age fall
    # out of the bucket sums but still count towards the total.
    kids_pop_sum = pop[ages <= kids_max].sum()
    adults_pop_sum = pop[(ages >= adults_min) & (ages <= adults_max)].sum()
    retired_pop_sum = pop[ages >= retired_min].sum()
    all_pop_sum = pop.sum()
    aggregated_data.append([state_id, state, year, kids_pop_sum, adults_pop_sum, retired_pop_sum, all_pop_sum])

# Create a new DataFrame with the aggregated data
aggregated_df = pd.DataFrame(aggregated_data, columns=["state_id", "state", "year", "Kids", "Adults", "Retired", "All"])

  df = pd.read_csv('../datasets/original/pop7099s.csv')


### Data: Population 2000-2010

In [3]:
import pandas as pd

# Load the dataset
df_pop = pd.read_csv('../datasets/original/2000-2010_population_by_state.csv')

# Remove the "REGION", "DIVISION" and "ESTIMATESBASE2000" columns, none of
# which is used by the aggregation
df_pop = df_pop.drop(["REGION", "DIVISION", "ESTIMATESBASE2000"], axis=1)

# Replace "United States" with "US"
df_pop["NAME"] = df_pop["NAME"].replace("United States", "US")

# Rename the columns: identifiers get the notebook-wide names and every
# POPESTIMATE<year> column becomes just "<year>" for 2000 through 2010
year_columns = {f"POPESTIMATE{year}": str(year) for year in range(2000, 2011)}
df_pop = df_pop.rename(columns={"STATE": "state_id", "NAME": "state", **year_columns})

# Define the age group categories as inclusive (lowest, highest) AGE bounds.
# Adults stops at 64 so that age 65 is counted once (in Retired) instead of
# in both groups. The upper bound of 85 on Retired keeps the AGE == 999 rows,
# which this cell reads separately as the "All" total, out of the sum.
# NOTE(review): presumably AGE == 85 stands for "85 and over" - confirm.
age_groups = {
    "Kids": (0, 29),
    "Adults": (30, 64),
    "Retired": (65, 85)
}

# Keep only the rows with SEX == 0.
# NOTE(review): presumably these are the both-sexes totals - confirm against
# the data dictionary of the source file.
df_pop = df_pop[df_pop["SEX"] == 0]

# Remove the "SEX" column
df_pop = df_pop.drop(["SEX"], axis=1)

# One record (dict) per state and year; turned into a DataFrame further down
aggregated_data = []

for state_id, state_df in df_pop.groupby("state_id"):
    state_name = state_df["state"].unique()[0]
    ages = state_df["AGE"]

    # The row selectors depend only on the state, so they are built once here
    # and reused for every year column.
    group_masks = {
        label: (ages >= low) & (ages <= high)
        for label, (low, high) in age_groups.items()
    }
    total_mask = ages == 999

    for year in range(2000, 2011):
        counts = state_df[f"{year}"]
        record = {"state_id": state_id, "state": state_name, "year": year}
        # Population of each age group for this year
        for label, mask in group_masks.items():
            record[label] = counts[mask].sum()
        # The total is read from the first AGE == 999 row rather than summed
        record["All"] = counts[total_mask].values[0]
        aggregated_data.append(record)

# Build the frame in a single step from the collected records
df_2000 = pd.DataFrame(aggregated_data)

# Put "state_id" and "state" first and leave the other columns in their
# existing order
leading = ["state_id", "state"]
columns = leading + [col for col in df_2000.columns if col not in leading]
df_2000 = df_2000[columns]

# Write the result to a new CSV file
df_2000.to_csv("../datasets/population2000-2010.csv", index=False)


### Juntar os dados da população 1970-2010

In [30]:
import csv
# Stack the frame built from pop7099s.csv (aggregated_df) on top of the
# 2000-2010 frame (df_2000). Both come from earlier cells of this notebook,
# so those cells must have been run first.
appended_dataset = pd.concat([aggregated_df, df_2000], ignore_index=True)

# Two-letter abbreviation -> full state name.
# "DC" is included so that District of Columbia rows end up under a single
# label. NOTE(review): assumes one frame labels them "DC" and the other
# "District of Columbia" - confirm against the data; the entry is a no-op if
# "DC" never occurs.
state_names = {
    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas", "CA": "California",
    "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware", "FL": "Florida", "GA": "Georgia",
    "HI": "Hawaii", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa",
    "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland",
    "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi", "MO": "Missouri",
    "MT": "Montana", "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey",
    "NM": "New Mexico", "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio",
    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina",
    "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VT": "Vermont",
    "VA": "Virginia", "WA": "Washington", "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming",
    "DC": "District of Columbia"
}

# Replace state abbreviations in the "state" column with full names; values
# that are not keys of the mapping (e.g. "US") are left as they are
appended_dataset['state'] = appended_dataset['state'].replace(state_names)

# Save the appended dataset to a new CSV file
appended_dataset.to_csv("../datasets/US_Population_1970-2010.csv", index=False)

# Re-read the merged file into data[state][year] -> counts. Keying on
# (state, year) groups the rows by state and, should a pair occur twice,
# keeps the last occurrence.
data = {}
with open('../datasets/US_Population_1970-2010.csv', 'r', newline='') as file:
    # csv.reader understands the quoting pandas applies to fields that contain
    # a comma, which a plain str.split(',') would cut in two.
    reader = csv.reader(file)
    next(reader, None)  # Skip the header line
    for row in reader:
        if not row:
            continue  # tolerate blank lines
        state_id, state, year, kids, adults, retired, all_people = row

        data.setdefault(state, {})[year] = {
            'state_id': int(state_id),
            'Kids': int(kids),
            'Adults': int(adults),
            'Retired': int(retired),
            'All': int(all_people)
        }

# Write data to CSV file, one row per (state, year) under lower-case headers
with open('../datasets/US_Population.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['state_id','state', 'year', 'kids', 'adults', 'retired', 'all'])
    for state, years in data.items():
        for year, counts in years.items():
            writer.writerow([
                counts['state_id'], state, year,
                counts['Kids'], counts['Adults'], counts['Retired'], counts['All'],
            ])

### Derivar dados em falta sobre os EUA através dos dados existentes dos Estados

In [27]:
import csv

data = {}

# Open the dataset file
filename = '../datasets/US_Population.csv'

# state_id found on an existing "US" row, if the file has such a column
us_state_id = None

with open(filename, mode='r', newline='') as file:
    reader = csv.reader(file)

    # The header tells how many columns the appended rows must have
    header = next(reader, [])
    has_state_id = len(header) > 6

    # Read each line in the dataset
    for line in reader:
        if not line:
            continue  # tolerate blank lines

        # The last six fields are always state, year and the four counts; a
        # leading state_id column is optional. A fixed six-name unpack raises
        # ValueError on the seven-column file written by the previous cell.
        *leading, state, year, kids, adults, retired, all_people = line

        if state == 'US' and leading and us_state_id is None:
            us_state_id = leading[0]

        # Create entries for the state; numeric values are stored as integers
        data.setdefault(state, {})[str(int(year))] = {
            'Kids': int(kids),
            'Adults': int(adults),
            'Retired': int(retired),
            'All': int(all_people)
        }

# Create entries for the state "US" for each year between 1970 and 1999 by
# summing the individual states
us_data = {}

for year in range(1970, 2000):
    str_year = str(year)
    totals = {'Kids': 0, 'Adults': 0, 'Retired': 0, 'All': 0}

    for state, state_data in data.items():
        # "US" rows are themselves national totals: adding them to the sum
        # would double the result when this cell is run a second time.
        if state == 'US' or str_year not in state_data:
            continue
        for key in totals:
            totals[key] += state_data[str_year][key]

    us_data[str_year] = totals

# Add us_data to the existing dataset file, skipping the years for which the
# file already holds a "US" row so that re-running the cell adds no duplicates
existing_us_years = data.get('US', {})

if has_state_id and us_state_id is None:
    # No "US" row to copy an id from.
    # NOTE(review): 0 is a placeholder - confirm the id to use for the nation.
    us_state_id = 0

with open(filename, mode='a', newline='') as file:
    writer = csv.writer(file)
    for year, totals in us_data.items():
        if year in existing_us_years:
            continue
        row = ['US', year, totals['Kids'], totals['Adults'], totals['Retired'], totals['All']]
        if has_state_id:
            # Keep the appended rows aligned with the seven-column header
            row.insert(0, us_state_id)
        writer.writerow(row)

print("Data added to", filename)


Data added to ../datasets/US_Population.csv
