In [9]:
import pandas as pd
import numpy as np

# --- Inputs ---
file_name = 'Population-EstimatesCSV.csv'
df = pd.read_csv(file_name)

indicator_name = 'Population, total'
target_countries = [
    'Malaysia',
    'Morocco',
    'Nicaragua',
    'Nigeria',
    'Poland',
    'Sierra Leone',
    'Singapore',
]

# Keep only the rows for the chosen indicator and the chosen countries.
is_indicator_row = df['Indicator Name'] == indicator_name
is_target_country = df['Country Name'].isin(target_countries)
df_filtered = df[is_indicator_row & is_target_country].copy()

columns_to_keep = ['Country Name', '2008', '2018']

# Build the analysis frame in one chain:
#   1. keep the country plus the two year columns and rename the years,
#   2. coerce both years to numbers (unparseable values become NaN),
#   3. drop rows that lack either year,
#   4. derive the absolute and the percentage change.
df_final = (
    df_filtered[columns_to_keep]
    .rename(columns={'2008': 'Start', '2018': 'End'})
    .assign(
        Start=lambda frame: pd.to_numeric(frame['Start'], errors='coerce'),
        End=lambda frame: pd.to_numeric(frame['End'], errors='coerce'),
    )
    .dropna(subset=['Start', 'End'])
    .assign(Abs_Change=lambda frame: frame['End'] - frame['Start'])
    .assign(
        # Percentage change is left as NaN where the starting value is 0.
        Pct_Change=lambda frame: np.where(
            frame['Start'] != 0,
            100 * (frame['Abs_Change'] / frame['Start']),
            np.nan,
        )
    )
)

def label_change(pct_change):
    """Map a percentage change to a growth label.

    Returns 'unknown' for a missing value, 'decline' below 0, 'slow' for
    0 to 10 inclusive, 'moderate' above 10 up to 30 inclusive, and 'fast'
    above 30. Anything that matches none of the bands is 'unknown'.
    """
    if pd.isna(pct_change):
        return 'unknown'

    # Bands are checked top to bottom; the first matching one wins.
    bands = (
        (lambda value: value < 0, 'decline'),
        (lambda value: 0 <= value <= 10, 'slow'),
        (lambda value: 10 < value <= 30, 'moderate'),
        (lambda value: value > 30, 'fast'),
    )
    for in_band, label in bands:
        if in_band(pct_change):
            return label
    return 'unknown'

# Attach the growth label derived from each row's percentage change.
df_final['Label'] = df_final['Pct_Change'].apply(label_change)

# Order rows from the smallest to the largest percentage change and
# renumber the index to match the new order.
df_sorted = df_final.sort_values(by='Pct_Change', ascending=True).reset_index(drop=True)

# Take an explicit copy: the display columns of this frame are overwritten
# with formatted strings further down, and assigning into a bare column
# selection of df_sorted would trigger pandas' SettingWithCopyWarning.
df_output = df_sorted[['Country Name', 'Start', 'End', 'Abs_Change', 'Pct_Change', 'Label']].copy()

def format_large_number(x):
    """Format a number with thousands separators and no decimals.

    Missing values (as judged by ``pd.isna``) are returned unchanged.
    """
    return x if pd.isna(x) else f'{x:,.0f}'

# Render the three count columns as thousands-separated strings.
for count_col in ('Start', 'End', 'Abs_Change'):
    df_output[count_col] = df_output[count_col].apply(format_large_number)

# Percentage change: round to two decimals, then append a percent sign.
pct_rounded = df_output['Pct_Change'].round(2)
df_output['Pct_Change'] = pct_rounded.astype(str) + '%'

# Write the formatted table to disk without the index column.
output_file_name = 'population_change_analysis.csv'
df_output.to_csv(output_file_name, index=False)

In [16]:
import pandas as pd
import re

# Load the dataset
file_name = 'Population-EstimatesCSV.csv'
try:
    df = pd.read_csv(file_name)
except FileNotFoundError:
    # Report the problem, then re-raise. Without a freshly loaded frame the
    # code below would either fail with a NameError or silently reuse a `df`
    # left over from an earlier cell, so stopping here is the honest outcome.
    print(f"Error: The file '{file_name}' was not found.")
    raise

# Extract unique country names. Missing values are skipped because they are
# not strings and would break the .strip()/.title() calls below.
country_names_raw = df['Country Name'].dropna().unique()

# --- Step 1: Clean country names ---
# Trim surrounding whitespace and title-case each name; the set removes any
# duplicates that only differed in spacing or letter case. Note that
# str.title() capitalises every word (e.g. "and" becomes "And").
country_names_cleaned = {name.strip().title() for name in country_names_raw}
country_names_sorted = sorted(country_names_cleaned)

# Function to get count and first N names
def get_summary(country_list, n=5):
    """Return ``(total, head)``: the list's length and its first ``n`` items."""
    return len(country_list), country_list[:n]

# --- Filtering Logic ---

# 1. Countries that contain "United" (ignore case)
united_countries = list(
    filter(lambda country: "united" in country.lower(), country_names_sorted)
)
count_united, first_5_united = get_summary(united_countries)

# 2. Countries that start with "S"
s_countries = list(
    filter(lambda country: country.startswith("S"), country_names_sorted)
)
count_s, first_5_s = get_summary(s_countries)

# 3. Countries that end with "stan" (ignore case)
stan_countries = list(
    filter(lambda country: country.lower().endswith("stan"), country_names_sorted)
)
count_stan, first_5_stan = get_summary(stan_countries)

# Collect each filter's count and leading names under its report key.
results = {
    key: {"count": total, "names": leading}
    for key, total, leading in (
        ("United", count_united, first_5_united),
        ("Starts with S", count_s, first_5_s),
        ("Ends with stan", count_stan, first_5_stan),
    )
}

# Heading printed above each section, keyed like `results`.
report_headings = {
    "United": 'Countries containing "United" (ignore case)',
    "Starts with S": 'Countries starting with "S"',
    "Ends with stan": 'Countries ending with "stan"',
}

for key, heading in report_headings.items():
    print(heading)
    print(f"Count: {results[key]['count']}")
    print(f"First 5 Names: {results[key]['names']}\n")
# One extra blank line closes the report.
print()

Countries containing "United" (ignore case)
Count: 3
First 5 Names: ['United Arab Emirates', 'United Kingdom', 'United States']

Countries starting with "S"
Count: 33
First 5 Names: ['Samoa', 'San Marino', 'Sao Tome And Principe', 'Saudi Arabia', 'Senegal']

Countries ending with "stan"
Count: 7
First 5 Names: ['Afghanistan', 'Kazakhstan', 'Middle East, North Africa, Afghanistan & Pakistan', 'Pakistan', 'Tajikistan']




In [1]:
import pandas as pd
import numpy as np

# --- Configuration ---
file_name = 'Population-EstimatesCSV.csv'   # input CSV
indicator_name = 'Population, total'        # value matched in 'Indicator Name'
target_year = '2018'                        # year column to analyse
threshold = 10 ** 9                         # 1 billion: cut-off between 'Large' and 'Small'

# --- Final Simplified List of Geographical Aggregates ---
# Values matched against the 'Country Name' column to pick region rows.
# NOTE(review): some of these aggregates look like they may overlap (e.g. the
# two "Africa ..." groupings vs. 'Sub-Saharan Africa') - confirm that ranking
# them side by side is intended.
regional_aggregates_final = [
    'East Asia & Pacific', 'Europe & Central Asia',
    'Latin America & Caribbean', 'Middle East & North Africa',
    'North America', 'South Asia',
    'Sub-Saharan Africa', 'Africa Eastern and Southern',
    'Africa Western and Central', 'Arab World',
    'Euro area',
]

# --- Processing ---
try:
    df = pd.read_csv(file_name)
except FileNotFoundError:
    print(f"Error: The file '{file_name}' was not found.")
    # Re-raise rather than call exit(): exit() is an interactive-shell helper
    # and would end the session instead of surfacing the failure in this cell.
    raise

# Rows for the total-population indicator, keeping only the name column and
# the target-year column.
df_pop_year = df.loc[
    df['Indicator Name'] == indicator_name,
    ['Country Name', target_year],
].copy()

# Keep the configured regions, rename the columns, coerce the population to a
# number (unparseable values become NaN) and drop rows without a value.
df_regions = (
    df_pop_year[df_pop_year['Country Name'].isin(regional_aggregates_final)]
    .rename(columns={'Country Name': 'Region', target_year: 'Total_Pop'})
    .assign(Total_Pop=lambda frame: pd.to_numeric(frame['Total_Pop'], errors='coerce'))
    .dropna(subset=['Total_Pop'])
    .copy()
)

# 'Large' when the population reaches the threshold, otherwise 'Small'.
df_regions['Label'] = np.where(df_regions['Total_Pop'] >= threshold, 'Large', 'Small')

# Sort totals highest first and keep the top five. The explicit copy lets the
# formatted column be added without pandas' SettingWithCopyWarning.
df_sorted = df_regions.sort_values(by='Total_Pop', ascending=False).head(5).copy()

# Thousands-separated, zero-decimal text version of the population for display.
df_sorted['Total_Pop_Formatted'] = df_sorted['Total_Pop'].map('{:,.0f}'.format)

# Final output DataFrame: the formatted text takes over the 'Total_Pop' name.
df_output = df_sorted[['Region', 'Total_Pop_Formatted', 'Label']].rename(
    columns={'Total_Pop_Formatted': 'Total_Pop'}
)

# --- Custom Fixed-Width Table Printing (Improved Readability) ---
print("\n--- Final Analysis Table (Section 4 Output) ---")

# Column widths in characters, and the separator placed between columns.
W_REGION, W_POP, W_LABEL = 30, 14, 10
SEP = " | "

def print_row(region, total_pop, label):
    """Print one table row: region and label left-aligned, population right-aligned.

    Each value is padded to its column width (W_REGION, W_POP, W_LABEL); the
    cells are joined with SEP and wrapped in "| " ... " |".
    """
    cells = (
        format(region, f"<{W_REGION}"),
        format(total_pop, f">{W_POP}"),
        format(label, f"<{W_LABEL}"),
    )
    print("| " + SEP.join(cells) + " |")

# A printed row is "| " + region + SEP + population + SEP + label + " |", so
# the horizontal rule has to span the three columns, the two separators and
# the two 2-character edges to line up with the rows.
total_width = W_REGION + W_POP + W_LABEL + 2 * len(SEP) + 4
line = "=" * total_width

# Header framed by rules.
print(line)
print_row("Region", "Total_Pop", "Label")
print(line)

# One row per region, in the order of df_output.
for _, row in df_output.iterrows():
    print_row(row['Region'], row['Total_Pop'], row['Label'])

print(line)


--- Final Analysis Table (Section 4 Output) ---
| Region                         |      Total_Pop | Label      |
| East Asia & Pacific            |  2,347,576,325 | Large      |
| South Asia                     |  1,589,164,714 | Large      |
| Sub-Saharan Africa             |  1,109,997,000 | Large      |
| Europe & Central Asia          |    918,947,897 | Small      |
| Africa Eastern and Southern    |    657,801,085 | Small      |
