<a href="https://colab.research.google.com/github/jbloewencolon/Creating-Dataset-for-The-Demographics-of-Faerun/blob/main/data_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
import numpy as np
import random
import ast

In [None]:
# File paths: absolute locations under /content/drive (presumably a mounted
# Google Drive in Colab -- confirm the mount step happens before this cell).
df_file_path = '/content/drive/MyDrive/Colab Notebooks/DnD/updated_dataset.csv'
idf_file_path = '/content/drive/MyDrive/Colab Notebooks/DnD/idf_dataset.csv'

# Read the CSV files into DataFrames.
# NOTE: `idf` is reassigned from `df.copy()` in a later cell, so the frame
# loaded here is only used until that point.
df = pd.read_csv(df_file_path)
idf = pd.read_csv(idf_file_path)

# Write the DataFrames back to CSV (disabled). Uncommenting these lines
# overwrites the two input files above, because the same paths are reused.
#df.to_csv(df_file_path, index=False)  # index=False to exclude the index column
#idf.to_csv(idf_file_path, index=False)

In [None]:
# Summarise df: entry count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 849 entries, 0 to 848
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   settlement                  849 non-null    object 
 1   region                      849 non-null    object 
 2   settlement_population       849 non-null    int64  
 3   settlement_economy          849 non-null    float64
 4   rumored_treasure_value      849 non-null    int64  
 5   demographic_breakdown       849 non-null    object 
 6   average_age                 849 non-null    int64  
 7   most_likely_cause_of_death  849 non-null    object 
 8   government_type             849 non-null    object 
 9   class_density               849 non-null    object 
 10  tax_rate                    849 non-null    int64  
 11  exports                     545 non-null    object 
 12  imports                     523 non-null    object 
 13  military                    441 non

# Data Understanding

In [None]:
# Peek at the last rows of class_density as loaded from the CSV.
df['class_density'].tail()

844    {'Aristocratic': 17, 'Wealthy': 403, 'Comforta...
845    {'Aristocratic': 0, 'Wealthy': 2, 'Comfortable...
846    {'Aristocratic': 0, 'Wealthy': 5, 'Comfortable...
847    {'Aristocratic': 0, 'Wealthy': 18, 'Comfortabl...
848    {'Aristocratic': 0, 'Wealthy': 23, 'Comfortabl...
Name: class_density, dtype: object

In [None]:
# Population and money columns that are stored as whole numbers.
columns_to_convert = ['settlement_population', 'region_population', 'settlement_economy', 'rumored_treasure_value', 'region_economy']

# Map +/-inf to NaN *first* and only then fill NaN with 0. Doing it the other
# way round leaves NaN behind wherever an infinity was present, and
# astype(int) raises on NaN. Missing and non-finite values both become 0.
df[columns_to_convert] = (
    df[columns_to_convert]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .astype(int)
)

In [None]:
# Perturb every settlement's population by a random 3%-7%, up or down.
# No random seed is set, so each run produces different populations.
n_settlements = len(df)

# Size of the change (drawn first), then its direction (-1 shrinks, +1 grows).
random_percentages = np.random.uniform(0.03, 0.07, size=n_settlements)
directions = np.random.choice([-1, 1], size=n_settlements)
change_factors = 1 + directions * random_percentages

# nan_to_num turns missing populations into 0 before scaling; the result is
# rounded to the nearest whole person.
baseline_population = np.nan_to_num(df['settlement_population'])
scaled_population = np.rint(baseline_population * change_factors)
df['settlement_population'] = scaled_population.astype(int)

# Determining Class Density and Generating Economy

In [None]:
# Daily cost of living per person for each lifestyle class, in gold pieces
# (1 silver piece = 0.1 gp).
lifestyle_costs = {
    'Wretched': 0,
    'Squalid': 0.1,
    'Poor': 0.2,
    'Modest': 1,
    'Comfortable': 2,
    'Wealthy': 4,
    'Aristocratic': 10,
}

# (low, high) percentage of the population in each lifestyle class, per
# settlement size tier. Key order is significant: it fixes the order in which
# random draws are made and the key order of the generated dictionaries.
lifestyle_ranges = {
    'tiny_settlement': {
        'Aristocratic': (0, 0.0001),
        'Wealthy': (0, 0.05),
        'Comfortable': (1, 10),
        'Modest': (1, 20),
        'Poor': (30, 35),
        'Squalid': (10, 30),
        'Wretched': (0, 0.001),
    },
    'small_settlement': {
        'Aristocratic': (0, 0.005),
        'Wealthy': (0.05, 1),
        'Comfortable': (4, 7),
        'Modest': (7, 20),
        'Poor': (30, 35),
        'Squalid': (15, 30),
        'Wretched': (1, 2),
    },
    'medium_settlement': {
        'Aristocratic': (0, 0.05),
        'Wealthy': (1, 2),
        'Comfortable': (6, 8),
        'Modest': (10, 25),
        'Poor': (30, 35),
        'Squalid': (18, 30),
        'Wretched': (3, 8),
    },
    'large_settlement': {
        'Aristocratic': (0, 0.5),
        'Wealthy': (1, 3),
        'Comfortable': (7, 13),
        'Modest': (10, 20),
        'Poor': (30, 35),
        'Squalid': (20, 40),
        'Wretched': (5, 10),
    },
    'mega_settlement': {
        'Aristocratic': (0, 0.8),
        'Wealthy': (2, 3),
        'Comfortable': (10, 15),
        'Modest': (15, 30),
        'Poor': (30, 35),
        'Squalid': (35, 50),
        'Wretched': (8, 10),
    },
}

In [None]:
def assign_lifestyle_demographics(population):
    """Randomly split a settlement's population into lifestyle classes.

    The settlement is first placed in a size tier by population. For every
    lifestyle class in that tier's entry of ``lifestyle_ranges`` a percentage
    is drawn uniformly from its (low, high) range and converted to a head
    count (truncated towards zero). The shares are drawn independently, so
    the counts are not forced to add up to ``population``.

    Args:
        population: Number of inhabitants of the settlement.

    Returns:
        A ``(class_density, total_economy)`` tuple: a dict mapping lifestyle
        name to head count, and the yearly cost of living of all counted
        inhabitants (head count * daily cost from ``lifestyle_costs`` * 365).
    """
    # Exclusive upper population bound of each tier, smallest first; anything
    # that is not below the last bound is a mega settlement.
    size_tiers = (
        (100, 'tiny_settlement'),
        (1000, 'small_settlement'),
        (10000, 'medium_settlement'),
        (100000, 'large_settlement'),
    )
    city_size = 'mega_settlement'
    for upper_bound, tier_name in size_tiers:
        if population < upper_bound:
            city_size = tier_name
            break

    class_density = {}
    total_economy = 0
    for lifestyle, bounds in lifestyle_ranges[city_size].items():
        low, high = bounds
        share = random.uniform(low, high) / 100
        headcount = int(population * share)
        class_density[lifestyle] = headcount
        total_economy = total_economy + headcount * lifestyle_costs[lifestyle] * 365

    return class_density, total_economy

# Generate one (class_density, total_economy) pair per settlement, then split
# the pairs into their two destination columns.
results = df['settlement_population'].apply(assign_lifestyle_demographics)
density_by_settlement = results.map(lambda pair: pair[0])
economy_by_settlement = results.map(lambda pair: pair[1])

df['class_density'] = density_by_settlement
df['settlement_economy'] = economy_by_settlement

# Keep class_density as the string form of the dictionary.
df['class_density'] = df['class_density'].map(str)

In [None]:
# Sum each lifestyle category across all settlements, using the class
# densities that are actually stored in df. Calling
# assign_lifestyle_demographics again here would draw a brand-new random
# sample, and the printed totals would not describe the dataset.
total_class_density = {}

for stored_density in df['class_density']:
    # class_density is kept as the string form of a dict; parse it back.
    class_density = ast.literal_eval(stored_density) if isinstance(stored_density, str) else stored_density

    # Add the counts to the running totals
    for lifestyle, count in class_density.items():
        total_class_density[lifestyle] = total_class_density.get(lifestyle, 0) + count

# Print the total sum of each lifestyle category across all settlements
for lifestyle, total_count in total_class_density.items():
    print(f"Total {lifestyle} count: {total_count}")

Total Aristocratic count: 30427
Total Wealthy count: 216312
Total Comfortable count: 1163190
Total Modest count: 2020999
Total Poor count: 3378371
Total Squalid count: 3456607
Total Wretched count: 812652


In [None]:
# Peek at the last rows of class_density after it has been regenerated.
df['class_density'].tail()

844    {'Aristocratic': 32, 'Wealthy': 350, 'Comforta...
845    {'Aristocratic': 0, 'Wealthy': 4, 'Comfortable...
846    {'Aristocratic': 0, 'Wealthy': 3, 'Comfortable...
847    {'Aristocratic': 0, 'Wealthy': 11, 'Comfortabl...
848    {'Aristocratic': 0, 'Wealthy': 35, 'Comfortabl...
Name: class_density, dtype: object

In [None]:
# Overall economy: the sum of every settlement's economy.
economy_column = df['settlement_economy']
total_economy = economy_column.sum()
print(f"Total Economy: {total_economy}")

# The ten richest settlements, shown with their economy and population.
summary_columns = ['settlement', 'settlement_economy', 'settlement_population']
top_10_settlements = df.nlargest(10, 'settlement_economy')
print("Top 10 Settlements by Economy:")
print(top_10_settlements[summary_columns])


Total Economy: 2312632298.5
Top 10 Settlements by Economy:
        settlement  settlement_economy  settlement_population
764      Waterdeep         318359752.5                1281544
426       Darromar         286637383.5                 948080
478      Thaymount         237367128.5                 882633
186        Elturel          63093352.5                 220794
324       Skyclave          41056222.0                 168838
798        Thindar          40242491.0                 160543
832      Unthalass          40016665.5                 168076
114         Suzail          37615111.5                 135464
351     Skalnaedyr          33931239.5                 120196
676  Baldur's Gate          31273747.5                 114273


In [None]:
# Total settlement economy per region, written onto every row of that region.
# transform('sum') returns a Series aligned with df's own index, so there is
# no need to merge a summary table back in, copy a temporary column and drop
# it again (a round trip that also resets df's index).
df['region_economy'] = df.groupby('region')['settlement_economy'].transform('sum')


In [None]:
# Check the recomputed regional totals on the last rows.
df['region_economy'].tail()

844    7301752.0
845     727737.0
846     727737.0
847     727737.0
848     727737.0
Name: region_economy, dtype: float64

In [None]:
# Work on a copy of df so the adjustments made to idf below do not alter df.
# NOTE: this replaces the idf that was read from idf_dataset.csv at the top of
# the notebook.
idf = df.copy()

def determine_settlement_size(population):
    """Return the size-tier label for a settlement of the given population.

    Tiers are half-open ranges: below 100 is tiny, below 1,000 small, below
    10,000 medium, below 100,000 large, and everything else mega.
    """
    if population < 100:
        return 'tiny_settlement'
    if population < 1000:
        return 'small_settlement'
    if population < 10000:
        return 'medium_settlement'
    if population < 100000:
        return 'large_settlement'
    return 'mega_settlement'

def adjust_settlement_economy(row):
    """Scale a row's settlement economy down according to its size tier.

    Args:
        row: Mapping with 'settlement_size' and 'settlement_economy' entries.

    Returns:
        'settlement_economy' divided by the tier's divisor. Mega settlements
        and unrecognised size labels are divided by 1, i.e. left as they are.
    """
    divisors = dict(
        tiny_settlement=10000,
        small_settlement=1000,
        medium_settlement=100,
        large_settlement=10,
        mega_settlement=1,
    )
    size_label = row['settlement_size']
    divisor = divisors[size_label] if size_label in divisors else 1
    return row['settlement_economy'] / divisor

# Label every settlement with its size tier (derived from df's population).
size_labels = df['settlement_population'].map(determine_settlement_size)
idf['settlement_size'] = size_labels

# Replace the raw economy with the tier-scaled value, row by row.
scaled_economy = idf.apply(adjust_settlement_economy, axis=1)
idf['settlement_economy'] = scaled_economy


In [None]:
# Summarise idf, which now carries the extra settlement_size column.
idf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 849 entries, 0 to 848
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   settlement                  849 non-null    object 
 1   region                      849 non-null    object 
 2   settlement_population       849 non-null    int64  
 3   settlement_economy          849 non-null    float64
 4   rumored_treasure_value      849 non-null    int64  
 5   demographic_breakdown       849 non-null    object 
 6   average_age                 849 non-null    int64  
 7   most_likely_cause_of_death  849 non-null    object 
 8   government_type             849 non-null    object 
 9   class_density               849 non-null    object 
 10  tax_rate                    849 non-null    int64  
 11  exports                     545 non-null    object 
 12  imports                     523 non-null    object 
 13  military                    441 non

# Calculating Hidden Economy

In [None]:
# Contribution of one inhabitant of each lifestyle class to the settlement's
# hidden-economy multiplier. Classes that are not listed contribute nothing.
hidden_economy_weights = {
    'Wealthy': 0.000004,
    'Aristocratic': 0.00001,
    'Squalid': 0.0000005,
    'Wretched': 0.0000003,
    'Poor': 0.0000001,
}

# class_density is stored as the string form of a dict; parse it once into a
# local Series instead of adding scratch columns to idf.
class_density_dicts = idf['class_density'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# One weighted column per lifestyle class (head count * weight), kept in a
# separate frame so idf only receives the two result columns.
weighted_counts = pd.DataFrame(
    {
        lifestyle: class_density_dicts.apply(lambda density, key=lifestyle: density.get(key, 0)) * weight
        for lifestyle, weight in hidden_economy_weights.items()
    },
    index=idf.index,
)
idf['hidden_multiplier'] = weighted_counts.sum(axis=1)

# hidden economy = economy * multiplier. A multiplier of exactly 0 falls back
# to the full settlement economy, as in the original row-wise rule.
scaled_economy = idf['settlement_economy'] * idf['hidden_multiplier']
hidden_economy = scaled_economy.where(idf['hidden_multiplier'] != 0, idf['settlement_economy'])

# Store as whole numbers; missing values count as 0.
idf['hidden_economy'] = hidden_economy.fillna(0).astype(int)

In [None]:
# Mirror the hidden economy computed on idf back onto df.
df['hidden_economy'] = idf['hidden_economy']

In [None]:
# Guard against division by zero with a *local* denominator. Writing the
# NaN-substituted values back into idf['settlement_population'] would corrupt
# the stored populations (zeros become NaN and the column turns into floats).
population_denominator = idf['settlement_population'].replace(0, np.nan)

# Hidden economy per inhabitant; settlements with zero population get 0.
idf['hidden_economy_per_capita'] = (idf['hidden_economy'] / population_denominator).fillna(0)

# Get the top 10 settlements with the highest hidden economy per capita
top_10_hidden_economies = idf.nlargest(10, 'hidden_economy_per_capita')[['settlement', 'hidden_economy_per_capita']]

# Print the top 10 hidden economies per capita
print("Top 10 Settlements with Highest Hidden Economy Per Capita:")
print(top_10_hidden_economies)

Top 10 Settlements with Highest Hidden Economy Per Capita:
        settlement  hidden_economy_per_capita
764      Waterdeep                 136.450217
426       Darromar                 122.643570
478      Thaymount                  89.292798
186        Elturel                  22.421606
324       Skyclave                  16.627187
114         Suzail                  16.188958
832      Unthalass                  15.477611
798        Thindar                  14.532891
351     Skalnaedyr                  13.648433
676  Baldur's Gate                  13.445661


In [None]:
# Spot-check the first hidden-economy values.
idf['hidden_economy'].head()

0        0
1        0
2        0
3        0
4    10665
Name: hidden_economy, dtype: int64

In [None]:
# Truncate the scaled settlement economy to whole numbers (stored back on idf).
idf['settlement_economy'] = idf['settlement_economy'].astype(int)

# Grand total = visible economy plus hidden economy over all settlements.
visible_total = idf['settlement_economy'].sum()
hidden_total = idf['hidden_economy'].sum()
total_economy = visible_total + hidden_total

print(f"The total economy of all settlements is {total_economy}")


The total economy of all settlements is 1686084948


In [None]:
def _count_unique_trade_goods(value):
    """Count the distinct goods in an exports/imports cell; 0 if it holds no list."""
    # Cells read from CSV hold the *string* form of a list; parse it first.
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return 0
    if isinstance(value, (list, tuple, set)):
        return len(set(value))
    # NaN (no goods recorded) or any other non-sequence value.
    return 0


def adjust_hidden_economy(row):
    """Increase a row's hidden economy according to how varied its trade is.

    Each distinct export or import adds 1% of the settlement economy.
    'exports' and 'imports' may be real lists or the string form of a list.
    A plain ``isinstance(..., list)`` check ignores the string form, which is
    what these columns hold until they are converted, so the adjustment would
    silently be 0 for every row.

    Args:
        row: Mapping with 'exports', 'imports', 'settlement_economy' and
            'hidden_economy' entries.

    Returns:
        The row's 'hidden_economy' plus the trade adjustment.
    """
    unique_exports = _count_unique_trade_goods(row['exports'])
    unique_imports = _count_unique_trade_goods(row['imports'])

    # One percent of the settlement economy per distinct traded good
    adjustment_value = (unique_exports + unique_imports) / 100 * row['settlement_economy']

    return row['hidden_economy'] + adjustment_value

# Make sure there is a hidden_economy column to adjust (start from 0 if not).
if 'hidden_economy' not in idf:
    idf['hidden_economy'] = 0.0

# Add the trade-based adjustment row by row and store whole numbers.
adjusted_hidden_economy = idf.apply(adjust_hidden_economy, axis=1)
idf['hidden_economy'] = adjusted_hidden_economy
idf['hidden_economy'] = idf['hidden_economy'].astype(int)

# Demographics

In [None]:
# Fallback species mix, in percent of the settlement population.
default_population_breakdown = dict(Humans=80, Halflings=7, Misc=13)


def calculate_demographic_breakdown(row):
    """Return the row's demographic breakdown, generating one if it is missing.

    Args:
        row: Mapping with 'demographic_breakdown' and 'settlement_population'.

    Returns:
        The existing 'demographic_breakdown' when it is neither NaN nor empty;
        otherwise a dict of head counts built from
        ``default_population_breakdown`` (each count truncated to an int).
    """
    existing = row['demographic_breakdown']
    if not (pd.isna(existing) or not existing):
        # Already filled in: leave it untouched.
        return existing

    head_counts = {}
    for species_name, share in default_population_breakdown.items():
        head_counts[species_name] = int(row['settlement_population'] * (share / 100))
    return head_counts

# Fill in missing breakdowns. calculate_demographic_breakdown already returns
# the existing value for rows that have one, so it can be applied directly.
df['demographic_breakdown'] = df.apply(calculate_demographic_breakdown, axis=1)

In [None]:
# Calculate the total population of all settlements
total_population = df['settlement_population'].sum()
print(f"The total population of all settlements is {total_population}")


The total population of all settlements is 10341403


# Average Age

In [None]:
# Species names known to the dataset.
species = [
    "humans", "halflings", "half-orcs", "half-drow", "half-elves",
    "elves", "dwarves", "gnomes", "goblins", "lizardfolk",
    "yuan-ti", "orcs", "ogres", "asabi", "gnolls",
    "pterafolk", "drow", "centaurs", "wemics", "humanoids",
]

# [lowest, highest] age per species. calculate_average_age uses the midpoint
# of the pair as that species' average age.
species_age_ranges = {
    # Common peoples
    "humans": [1, 100],
    "halflings": [1, 150],
    "half-elves": [1, 180],
    "half-orcs": [1, 75],
    "orcs": [1, 50],
    # Long-lived peoples
    "elves": [1, 750],
    "dwarves": [1, 350],
    "gnomes": [1, 400],
    # Other species
    "goblins": [1, 60],
    "lizardfolk": [1, 80],
    "yuan-ti": [1, 120],
    "ogres": [1, 100],
    "asabi": [1, 60],
    "gnolls": [1, 30],
    "pterafolk": [1, 60],
    "drow": [1, 750],
    "centaurs": [1, 120],
    "wemics": [1, 60],
    "half-drow": [1, 180],
    # Catch-all buckets
    "humanoids": [1, 80],
    "misc.": [1, 80],
}

# Years added to (or removed from) the average age for each lifestyle class;
# calculate_average_age weights them by the class's share of the population.
lifestyle_modifier = {
    'Wretched': -80,
    'Squalid': -30,
    'Poor': -10,
    'Modest': 0,
    'Comfortable': 3,
    'Wealthy': 5,
    'Aristocratic': 10,
}

def _parse_mapping(value):
    """Return ``value`` as a dict, parsing it first when it is the string form of one."""
    return ast.literal_eval(value) if isinstance(value, str) else value


def _species_age_range(species_name):
    """Look up a species' age range, ignoring case and a trailing '.'.

    Breakdown keys such as 'Halflings' or 'Misc' must match the lower-case
    'halflings' / 'misc.' entries of ``species_age_ranges``; an exact-match
    lookup would silently give them the human default instead.
    """
    wanted = str(species_name).strip().lower().rstrip('.')
    for known_name, age_range in species_age_ranges.items():
        if known_name.lower().rstrip('.') == wanted:
            return age_range
    return [1, 100]  # default to the human age range if the species is unknown


def calculate_average_age(row):
    """Estimate a settlement's average age from its species and lifestyle mix.

    The result is the population-weighted midpoint of each species' age range
    plus the population-weighted lifestyle modifier.

    Args:
        row: Mapping with 'settlement_population', 'demographic_breakdown' and
            'class_density'; the last two may be dicts or their string form.

    Returns:
        The estimated average age as a float, or None when the row cannot be
        evaluated (zero or missing population, or malformed breakdown data).
    """
    try:
        total_population = row['settlement_population']

        # Every weight below divides by the population; an empty or unknown
        # population has no meaningful average.
        if pd.isna(total_population) or total_population == 0:
            return None

        demographic_breakdown = _parse_mapping(row['demographic_breakdown'])
        class_density = _parse_mapping(row['class_density'])

        # Population-weighted midpoint of every species' age range
        weighted_ages = 0
        for species_name, count in demographic_breakdown.items():
            age_range = _species_age_range(species_name)
            avg_age_species = sum(age_range) / 2
            weighted_ages += avg_age_species * (count / total_population)

        # Population-weighted lifestyle modifier (classes without a modifier add 0)
        lifestyle_adjustment = 0
        for lifestyle, count in class_density.items():
            modifier = lifestyle_modifier.get(lifestyle, 0)
            lifestyle_adjustment += (count / total_population) * modifier

        return weighted_ages + lifestyle_adjustment
    except (ValueError, SyntaxError, TypeError, AttributeError, KeyError, ZeroDivisionError) as e:
        # Best effort: report the unusable row and let the caller fill the gap.
        print(f"Error calculating average age for row: {e}")
        return None

# Compute the average age once per settlement. Rows that could not be
# evaluated come back as None/NaN, so fill them with 0 *before* casting to
# int; casting unfilled values raises.
df['average_age'] = df.apply(calculate_average_age, axis=1)
df['average_age'] = df['average_age'].fillna(0).astype(int)

In [None]:
# Spot-check the first 25 computed average ages.
df['average_age'].head(25)

0     76
1     53
2     90
3     67
4     40
5     91
6     35
7     38
8     36
9     67
10    66
11    41
12    45
13    42
14    34
15    36
16    47
17    41
18    54
19    37
20    30
21    45
22    32
23    42
24    42
Name: average_age, dtype: int64

# Rumored Treasure Value

In [None]:
# Share of the hidden economy that counts towards rumored treasure, by size tier.
size_modifier = {
    'tiny_settlement': 0.09,
    'small_settlement': 0.09,
    'medium_settlement': 0.1,
    'large_settlement': 0.2,
    'mega_settlement': 0.5
}

# Base value: one thousandth of the settlement economy.
base_value = idf['settlement_economy'] / 1000

# Tier share of the hidden economy; sizes missing from size_modifier use 1.
hidden_share = idf['settlement_size'].map(size_modifier).fillna(1)
modified_hidden_economy = idf['hidden_economy'] * hidden_share

# Rumored treasure is the sum of both parts, truncated to whole numbers.
rumored_treasure_value = base_value + modified_hidden_economy
idf['rumored_treasure_value'] = rumored_treasure_value.astype(int)

In [None]:
# Compare rumored treasure with the two values it is derived from (last rows).
comparison_df = idf[['rumored_treasure_value', 'settlement_economy', 'hidden_economy']].tail()
print(comparison_df)

     rumored_treasure_value  settlement_economy  hidden_economy
844                    1357              503539            4268
845                       0                  95               0
846                       0                  72               0
847                       1                1549               0
848                       3                3259               1


# Government Types

In [None]:
# Every government type a settlement can be given.
government_types = [
    "Autocracy", "Bureaucracy", "Confederacy", "Democracy", "Dictatorship",
    "Feudalism", "Gerontocracy", "Hierarchy", "Monarchy", "Magocracy",
    "Militocracy", "Matriarchy", "Oligarchy", "Patriarchy", "Meritocracy",
    "Plutocracy", "Republic", "Kleptocracy", "Satrapy", "Theocracy",
]

# Draw one type per settlement, independently and uniformly at random.
assigned_government_types = []
for _ in range(len(df)):
    assigned_government_types.append(random.choice(government_types))
df['government_type'] = assigned_government_types

# Show the assignment next to each settlement's region.
print(df[['region', 'government_type']])


       region government_type
0    Aglarond     Kleptocracy
1    Aglarond     Meritocracy
2    Aglarond     Militocracy
3    Aglarond       Theocracy
4    Aglarond      Plutocracy
..        ...             ...
844     Vaasa    Dictatorship
845  Vesperin    Gerontocracy
846  Vesperin     Meritocracy
847  Vesperin     Confederacy
848  Vesperin     Confederacy

[849 rows x 2 columns]


# Cause of Death

In [None]:
# Candidate causes of death for each settlement size tier.
causes_of_death_by_size = {
    'tiny_settlement': [
        'Old age', 'Wildlife attacks', 'Starvation', 'Disease', 'Exposure',
        'Accidental falls', 'Lack of healers', 'Poisonous plants', 'Isolation', 'Necromancy',
    ],
    'small_settlement': [
        'Disease', 'Poison', 'Old age', 'Starvation',
        'Banditry', 'Elementals', 'Cursed artifacts', 'Fey mischief',
    ],
    'medium_settlement': [
        'Old age', 'Assassination', 'Alchemy accidents', 'Guild conflicts',
        'Dragonfire', 'Orc raids', 'Dark magic', 'Cult activities',
    ],
    'large_settlement': [
        'Assassination', 'Necromancy', 'Old age', 'Arcane experiments',
        'Thieves guilds', 'Political intrigue', 'Demonic incursions', 'Tavern brawls',
    ],
    'mega_settlement': [
        'Political assassinations', 'Dark sorcery', 'Starvation', 'Disease', 'Exposure',
        'Arcane disasters', 'Underdark creatures', 'Urban crime', 'Gang wars', 'Demonic pacts',
    ],
}

# Causes that strike a whole region rather than a single settlement.
regional_causes_of_death = [
    'Plague', 'War', 'Goblin hordes', 'Famine',
    'Dragon attacks', 'Undead uprisings', 'Necromantic disasters', 'Elemental storms',
]

# One randomly drawn regional cause per region, in order of first appearance in df.
primary_regional_cause = {}
for region in df['region'].unique():
    primary_regional_cause[region] = np.random.choice(regional_causes_of_death)

def pick_causes_of_death(size, region, primary_regional_cause):
    """Draw five distinct causes of death for one settlement.

    The candidates are the causes listed for the settlement's size tier plus
    the region's primary cause. Tier causes each carry weight 1 and the
    regional cause weight 5, so the regional cause is drawn more often.

    Args:
        size: Size-tier key into ``causes_of_death_by_size``.
        region: Region name, used as a key into ``primary_regional_cause``.
        primary_regional_cause: Mapping of region name to its regional cause.

    Returns:
        A list of five different causes, in the order they were drawn.
    """
    candidates = [*causes_of_death_by_size[size], primary_regional_cause[region]]

    # Weight 1 everywhere, 5 for the regional cause in the last position.
    weights = np.ones(len(candidates))
    weights[-1] = 5
    probabilities = weights / weights.sum()

    picked = np.random.choice(candidates, size=5, replace=False, p=probabilities)
    return picked.tolist()

# df needs a size tier per settlement; derive it from the population if absent.
if 'settlement_size' not in df:
    df['settlement_size'] = df['settlement_population'].map(determine_settlement_size)

# Draw five causes of death for every settlement.
df['most_likely_cause_of_death'] = df.apply(
    lambda settlement: pick_causes_of_death(
        settlement['settlement_size'], settlement['region'], primary_regional_cause
    ),
    axis=1,
)

# Show the outcome next to the inputs it was drawn from.
print(df[['settlement', 'region', 'settlement_size', 'most_likely_cause_of_death']])

       settlement    region    settlement_size  \
0           Corth  Aglarond   small_settlement   
1           Dahst  Aglarond   small_settlement   
2           Dlusk  Aglarond   small_settlement   
3          Findar  Aglarond   small_settlement   
4    Furthinghome  Aglarond   large_settlement   
..            ...       ...                ...   
844    Telos City     Vaasa   large_settlement   
845  Dragon Falls  Vesperin   small_settlement   
846       Galaunt  Vesperin   small_settlement   
847  Ravens Bluff  Vesperin  medium_settlement   
848       Tantras  Vesperin  medium_settlement   

                            most_likely_cause_of_death  
0    [Undead uprisings, Disease, Fey mischief, Old ...  
1    [Undead uprisings, Banditry, Old age, Fey misc...  
2    [Fey mischief, Old age, Undead uprisings, Pois...  
3    [Cursed artifacts, Undead uprisings, Poison, D...  
4    [Assassination, Undead uprisings, Tavern brawl...  
..                                                 ...  


## Trade Deficit


In [None]:
def convert_to_list(column):
    """Turn one exports/imports cell into a list.

    Args:
        column: The cell value: a list, the string form of one, or anything
            else (for example NaN).

    Returns:
        The list itself when given a list; the literal parsed from the text
        when given a string (an empty list if the text cannot be parsed);
        an empty list for every other kind of value.
    """
    if isinstance(column, list):
        return column
    if not isinstance(column, str):
        # NaN or some other non-text value: nothing to parse.
        return []
    try:
        return ast.literal_eval(column)
    except (ValueError, SyntaxError):
        return []

# Parse 'exports' and 'imports' into real lists
idf['exports'] = idf['exports'].apply(convert_to_list)
idf['imports'] = idf['imports'].apply(convert_to_list)

# Use the number of listed goods as a stand-in for trade volume. The list
# check stays because a cell that parses to a non-list literal counts as 0.
idf['exports_value'] = idf['exports'].apply(lambda x: len(x) if isinstance(x, list) else 0)
idf['imports_value'] = idf['imports'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Positive balance: more goods exported than imported
idf['trade_balance'] = idf['exports_value'] - idf['imports_value']

# Min-max normalise the balance into [0, 1]
max_trade_balance = idf['trade_balance'].max()
min_trade_balance = idf['trade_balance'].min()
trade_balance_range = max_trade_balance - min_trade_balance
if trade_balance_range == 0:
    # Every settlement has the same balance; dividing by the zero range would
    # turn the whole column into NaN, so report "no spread" as 0 instead.
    idf['normalized_trade_balance'] = 0.0
else:
    idf['normalized_trade_balance'] = (idf['trade_balance'] - min_trade_balance) / trade_balance_range

# Inequality Score

In [None]:
# Relative importance of each factor in the inequality score.
weights = dict(
    wealth_distribution=0.4,
    trade_balance=0.1,
    hidden_economy=0.4,
    average_age=0.1,
)

def parse_class_density(x):
    """Return a class-density value as a dictionary.

    Args:
        x: A dict, or the string form of one.

    Returns:
        ``x`` itself when it is already a dict, the parsed literal when it is
        a string, and an empty dict when it cannot be parsed.
    """
    # Already parsed (for example when the cell is run a second time): keep it.
    # literal_eval rejects non-string input, which would otherwise replace
    # real data with {}.
    if isinstance(x, dict):
        return x
    try:
        return ast.literal_eval(x)
    except (SyntaxError, ValueError):
        return {}

# Parse the 'class_density' column into dictionaries
idf['class_density'] = idf['class_density'].apply(parse_class_density)


def _min_max_normalize(series):
    """Rescale a numeric Series to [0, 1]; a constant Series maps to all zeros."""
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        # No spread at all: avoid the 0/0 division that would yield NaN.
        return series * 0.0
    return (series - lowest) / value_range


# Bring each factor onto the same 0-1 scale
idf['normalized_hidden_economy'] = _min_max_normalize(idf['hidden_economy'])

# NOTE(review): idf was copied from df before df['average_age'] was
# recomputed, so this appears to use the ages as loaded rather than the
# recalculated ones -- confirm which is intended.
idf['normalized_age'] = _min_max_normalize(idf['average_age'])

# A lower average age should raise the score, so invert it
idf['inverted_normalized_age'] = 1 - idf['normalized_age']

# Wealth is represented by the settlement economy
idf['normalized_wealth'] = _min_max_normalize(idf['settlement_economy'])

# Weighted sum of the four factors
idf['inequality_score'] = (
    weights['wealth_distribution'] * idf['normalized_wealth'] +
    weights['trade_balance'] * idf['normalized_trade_balance'] +
    weights['hidden_economy'] * idf['normalized_hidden_economy'] +
    weights['average_age'] * idf['inverted_normalized_age']
)

# Scale to 0-100 (missing scores count as 0) and truncate to whole points
idf['inequality_score'] = idf['inequality_score'].fillna(0) * 100
idf['inequality_score'] = idf['inequality_score'].astype(int)

# Add +/-5% random noise per settlement; this turns the score back into a
# float. No seed is set, so the noise differs on every run.
random_factors = np.random.uniform(0.95, 1.05, idf.shape[0])
idf['inequality_score'] *= random_factors

# Check the head of the updated DataFrame
idf[['class_density', 'normalized_wealth', 'inequality_score']].head()

Unnamed: 0,class_density,normalized_wealth,inequality_score
0,"{'Aristocratic': 0, 'Wealthy': 3, 'Comfortable...",2.355825e-07,7.99088
1,"{'Aristocratic': 0, 'Wealthy': 2, 'Comfortable...",1.225029e-07,8.222365
2,"{'Aristocratic': 0, 'Wealthy': 7, 'Comfortable...",3.360978e-07,8.291196
3,"{'Aristocratic': 0, 'Wealthy': 2, 'Comfortable...",1.225029e-07,7.632881
4,"{'Aristocratic': 197, 'Wealthy': 1003, 'Comfor...",0.002497216,8.588761


In [None]:
# Distribution of the final inequality scores.
idf['inequality_score'].describe()

count    849.000000
mean       9.193708
std        4.466732
min        0.000000
25%        8.154383
50%        8.763610
75%        9.157462
max       91.627572
Name: inequality_score, dtype: float64

# Tax Rate

In [None]:
def calculate_base_tax_rate(aristocrats_per_region):
    """Map each region's aristocrat count to its base tax rate.

    Args:
        aristocrats_per_region: pandas Series of aristocrat counts.

    Returns:
        A Series with the same index holding 10 for counts below 50, 20 for
        counts below 100 and 30 for everything else.
    """
    def _rate_for(aristocrat_count):
        if aristocrat_count < 50:
            return 10
        if aristocrat_count < 100:
            return 20
        return 30

    return aristocrats_per_region.apply(_rate_for)

# Base-rate lookup; calculate_tax_rate indexes it (and aristocrats_per_region) by row['region']
base_tax_rate_per_region = calculate_base_tax_rate(aristocrats_per_region)

def calculate_tax_rate(row, aristocrats_per_region, base_tax_rate_per_region):
    """Compute the tax rate for one settlement row.

    Args:
        row: Mapping/Series with 'capital', 'region', 'imports', 'exports'
            and 'military' entries.
        aristocrats_per_region: Lookup of aristocrat counts keyed by region.
        base_tax_rate_per_region: Lookup of base tax rates keyed by region.

    Returns:
        For rows without a capital: 0 with probability 0.1, otherwise the
        region's base rate. For rows with a capital: an int, the chosen rate
        scaled by the trade multiplier and rounded.
    """
    # Rows without a capital get the regional base rate, with a 10% chance of zero tax
    if pd.isnull(row['capital']):
        return 0 if np.random.rand() < 0.1 else base_tax_rate_per_region[row['region']]

    # Only real lists contribute to trade volume (strings/NaN count as 0)
    imports_count = len(row['imports']) if isinstance(row['imports'], list) else 0
    exports_count = len(row['exports']) if isinstance(row['exports'], list) else 0
    region_aristocrats = aristocrats_per_region[row['region']]

    # NaN is truthy in Python, so a missing military entry must be excluded
    # explicitly; the literal string 'None' is the notebook's "no military" marker.
    military = row['military']
    if military is None or (isinstance(military, float) and military != military):
        has_military = False
    elif isinstance(military, str):
        has_military = military.strip() not in ('', 'None')
    else:
        has_military = bool(military)

    # First matching condition wins; a random rate is drawn only when it is used
    if has_military and region_aristocrats >= 50:
        tax_rate = np.random.randint(20, 61)
    elif region_aristocrats >= 100:
        tax_rate = np.random.randint(10, 31)
    else:
        tax_rate = base_tax_rate_per_region[row['region']]

    # Each traded good adds 1% to the rate
    trade_volume = imports_count + exports_count
    trade_multiplier = 1 + (trade_volume / 100)

    return int(round(tax_rate * trade_multiplier))

# Apply the calculate_tax_rate function row by row to (re)create the tax_rate column.
# calculate_tax_rate draws from np.random, so values differ between runs unless a seed is set.
df['tax_rate'] = df.apply(lambda row: calculate_tax_rate(row, aristocrats_per_region, base_tax_rate_per_region), axis=1)

In [None]:
# Summary statistics of the generated tax_rate column
df['tax_rate'].describe()

count    849.000000
mean      30.480565
std       15.561500
min        0.000000
25%       21.000000
50%       30.000000
75%       42.000000
max       60.000000
Name: tax_rate, dtype: float64

In [None]:
# Total inequality_score per region (a sum, so regions with more settlements rank higher)
region_inequality_sum = idf['inequality_score'].groupby(idf['region']).sum()

# Ten regions with the largest totals, highest first
top_10_inequal_regions = region_inequality_sum.sort_values(ascending=False).iloc[:10]

# Report the ranking
print("Top 10 Most Unequal Regions:", top_10_inequal_regions, sep="\n")


Top 10 Most Unequal Regions:
region
The Sword Coast               985.994628
The Eastern Shaar             468.745737
Halruaa                       355.320586
Thay                          327.485620
Dambrath                      288.172063
Murghom                       269.368888
Damara                        263.404874
Tethyr                        258.069974
The Moonshae Isles            253.114499
The Great Glaciar Pelvuria    250.354155
Name: inequality_score, dtype: float64


In [None]:
# Total inequality_score per settlement name.
# The variable names still say "region" and overwrite the per-region results of the same name.
region_inequality_sum = idf['inequality_score'].groupby(idf['settlement']).sum()

# Ten settlements with the largest totals, highest first
top_10_inequal_regions = region_inequality_sum.sort_values(ascending=False).iloc[:10]

# Report the ranking
print("Top 10 Most Unequal Settlements:", top_10_inequal_regions, sep="\n")


Top 10 Most Unequal Settlements:
settlement
Waterdeep             91.627572
Darromar              71.991158
Thaymount             57.833094
Cathyr                22.941688
Bloodstone Village    21.443902
Taruin                21.316120
Rassatan              20.858166
Elveswatch            19.689059
Prastuil              19.627946
Purl                  19.509598
Name: inequality_score, dtype: float64


# Adjusting Average Age

In [None]:
# Mean inequality score per region, used as the baseline by adjust_average_age
region_inequality_avg = idf.groupby('region')['inequality_score'].mean()

# Broadcast each region's youngest and oldest average_age back onto its settlements
ages_by_region = idf.groupby('region')['average_age']
idf['region_min_age'] = ages_by_region.transform('min')
idf['region_max_age'] = ages_by_region.transform('max')

# Define a function to adjust average_age based on regional inequality score
def adjust_average_age(row, scaling_factor=2, region_avg=None):
    """Shift a settlement's average age by its inequality relative to its region.

    Args:
        row: Mapping/Series with 'region', 'average_age', 'inequality_score',
            'region_min_age' and 'region_max_age'.
        scaling_factor: Years subtracted per point of inequality above the
            regional mean (added when below). Defaults to 2.
        region_avg: Lookup of mean inequality score keyed by region. Defaults
            to the module-level `region_inequality_avg`.

    Returns:
        The adjusted age, clamped to [region_min_age, region_max_age].
    """
    if region_avg is None:
        # Fall back to the notebook-level lookup so `idf.apply(adjust_average_age, axis=1)` keeps working
        region_avg = region_inequality_avg

    deviation = row['inequality_score'] - region_avg[row['region']]
    adjusted_age = row['average_age'] - scaling_factor * deviation

    # Keep the result inside the age range observed in the settlement's region
    return max(min(adjusted_age, row['region_max_age']), row['region_min_age'])

# Row-wise inequality adjustment of the average age
idf['adjusted_average_age'] = idf.apply(adjust_average_age, axis=1)

# Compare capitals as strings (missing capitals become the string 'nan')
idf['capital'] = idf['capital'].astype(str)
capitals = idf['capital'].unique()

# Settlements whose name is not any region's capital
non_capital_settlements = idf.loc[~idf['settlement'].isin(capitals)]

# 10 non-capitals get +50; 5 different non-capitals get -30 (both draws use random_state=1)
add_settlements = non_capital_settlements.sample(n=10, random_state=1).index
remaining_candidates = non_capital_settlements.drop(index=add_settlements)
subtract_settlements = remaining_candidates.sample(n=5, random_state=1).index

for chosen_index, age_offset in ((add_settlements, 50), (subtract_settlements, -30)):
    idf.loc[chosen_index, 'adjusted_average_age'] += age_offset

# Truncate the adjusted ages to whole years
idf['adjusted_average_age'] = idf['adjusted_average_age'].astype(int)


In [None]:
# Preview the first 50 adjusted ages.
# NOTE(review): the recorded output shows float values although the previous cell ends with
# .astype(int) -- the output looks stale; re-run to confirm.
idf['adjusted_average_age'].head(50)

0     73.965063
1     49.502092
2     89.364432
3     69.681061
4     39.769302
5     93.000000
6     34.000000
7     36.818603
8     37.532320
9     64.836528
10    65.068506
11    38.749634
12    45.154504
13    40.429589
14    34.000000
15    36.456795
16    51.149453
17    39.556512
18    46.679196
19    35.310185
20    27.158501
21    43.538021
22    36.137848
23    46.000000
24    42.897852
25    38.820286
26    41.580232
27    41.990093
28    30.000000
29    39.084224
30    52.000000
31    23.922251
32    52.000000
33    45.236065
34    40.129527
35    31.691232
36    52.000000
37    44.772581
38    21.000000
39    40.363313
40    36.411043
41    35.315342
42    37.058807
43    37.696814
44    33.994193
45    41.022458
46    34.993388
47    41.197285
48    49.474776
49    43.060545
Name: adjusted_average_age, dtype: float64

In [None]:
# Overwrite df's average_age with the adjusted values from idf (pandas aligns the assignment on the index)
df['average_age'] = idf['adjusted_average_age']

# Temperature and Climate

In [None]:
# Region name -> climate label. Regions missing from this table fall back to
# 'Temperate' in assign_climate_attributes, so keys must match the spelling
# used in the 'region' column exactly.
region_climate = {
    'Aglarond': 'Temperate',
    'Akanul': 'Temperate',
    'Altumbel': 'Temperate',
    'Amn': 'Temperate',
    'Anauroch': 'Arid',
    'Calimshan': 'Arid',
    'Chessenta': 'Temperate',
    'Cormanthyr': 'Temperate',
    'Cormyr': 'Temperate',
    'Dalelands': 'Temperate',
    'Damara': 'Cold',
    'Dambrath': 'Temperate',
    'The Sword Coast': 'Temperate',
    'Elturgard': 'Temperate',
    'Erlkazar': 'Temperate',
    'Evereska': 'Temperate',
    'Halruaa': 'Temperate',
    'Hartsvale': 'Cold',
    'Icewind Dale': 'Frigid',
    'Impiltur': 'Temperate',
    'Jungles of Chult': 'Jungle/Marsh',
    'Lantan': 'Temperate',
    'Luiren': 'Temperate',
    'Mulhorand': 'Arid',
    'Muranndin': 'Temperate',
    'Murghom': 'Arid',
    'Najara': 'Temperate',
    'Narfell': 'Cold',
    'Nelanther Isles': 'Temperate',
    'Okoth': 'Arid',
    'The Eastern Shaar': 'Arid',
    'Rashemen': 'Cold',
    'Ruathym': 'Cold',
    'Samarach': 'Jungle/Marsh',
    'Sembia': 'Temperate',
    'Serpentes': 'Temperate',
    'Sespech': 'Temperate',
    'Tashalar': 'Temperate',
    'Tethyr': 'Temperate',
    'Tharsult': 'Arid',
    'Thay': 'Arid',
    'The Endless Wastes': 'Arid',
    'Sossal': 'Cold',
    'The Great Dale': 'Temperate',
    'The Great Glacier Pelvuria': 'Frigid',
    # The dataset spells this region "Glaciar" (see the per-region inequality output);
    # without this alias the lookup silently falls back to 'Temperate'.
    'The Great Glaciar Pelvuria': 'Frigid',
    'The Lake of Steam': 'Temperate',
    'The Moonshae Isles': 'Jungle/Marsh',
    'The Purple Rocks': 'Temperate',
    'The Ride': 'Temperate',
    'Thar': 'Arid',
    'The Sea of Fallen Stars': 'Temperate',
    'The Shining Lands': 'Temperate',
    'The Shining Sea': 'Temperate',
    'Thesk': 'Temperate',
    'Thindol': 'Temperate',
    'Tuern': 'Cold',
    'Turmish': 'Temperate',
    'Tymanther': 'Arid',
    'Unther': 'Arid',
    'Vaasa': 'Cold',
    'Vesperin': 'Temperate'
}
# Climate label -> [low, high] bounds; assign_climate_attributes draws a uniform value between them
climate_temperature_range = {
    'Arid': [65, 100],  # Temperature range in Fahrenheit
    'Cold': [0, 40],
    'Frigid':[-40, 10],
    'Temperate': [45, 70],
    'Jungle/Marsh': [65, 90]
}

# Climate label -> [low, high] bounds, keyed by the same labels as climate_temperature_range
average_rainfall = {
    'Arid': [0, 10],  # Rainfall range in inches per year
    'Cold': [20, 40],
    'Frigid':[0, 20],
    'Temperate': [30, 50],
    'Jungle/Marsh': [45, 65]
}

def assign_climate_attributes(row):
    """Draw a random temperature and rainfall for a row from its region's climate.

    Args:
        row: Mapping/Series with a 'region' entry.

    Returns:
        A 2-tuple `(average_temp, annual_rainfall)`. Regions missing from
        `region_climate` are treated as 'Temperate'. On any failure the error
        is printed and `(nan, nan)` is returned, so callers that unpack the
        result (e.g. `zip(*df.apply(...))`) keep working.
    """
    try:
        # Default to 'Temperate' if the region is not in the lookup
        climate = region_climate.get(row['region'], 'Temperate')

        temp_low, temp_high = climate_temperature_range[climate]
        rain_low, rain_high = average_rainfall[climate]

        # Uniform draws inside the climate's ranges
        average_temp = random.uniform(temp_low, temp_high)
        annual_rainfall = random.uniform(rain_low, rain_high)

        return average_temp, annual_rainfall

    except Exception as e:
        # Deliberately broad best-effort: report and return a same-shaped placeholder
        print(f"Error assigning climate attributes for row: {e}")
        return np.nan, np.nan

# Apply the function to every row and unzip the (temperature, rainfall) pairs into two columns.
# Values come from random.uniform, so they change between runs unless `random` is seeded.
df['average_temperature'], df['annual_rainfall'] = zip(*df.apply(assign_climate_attributes, axis=1))


# Magic Academies

In [None]:
# Location name -> school name; assign_magic_academy looks rows up by row['settlement'].
# NOTE(review): a dict literal keeps only the LAST value for a repeated key, so:
#   'Waterdeep' (5 entries) resolves to 'House of Healing',
#   'Huzuz' (2 entries) resolves to 'Grand University of Huzuz',
#   'Evereska' (2 entries) resolves to 'Hall of the Kaliesherai'.
# The overwritten schools are not in the dict at all and cannot show up as "unplaced".
# NOTE(review): ' Waterdeep' and ' Lake of Steam' start with a space and only match a
# settlement spelled with that leading space.
wizard_schools = {
    'Arrabar': 'Academia Vilhonus',
    'Chondath': 'Academia Vilhonus',
    'Thay': 'Academy of Shapers and Binders',
    ' Waterdeep': 'Academy of Stargazers',  # leading space in key
    'Mintar': 'Academy of the Drawn Sword',
    ' Lake of Steam': 'Academy of the Drawn Sword',  # leading space in key
    'Seventon': "Alimon's Mental Academy",
    'Netheril': "Alimon's Mental Academy",
    'Sundabar': 'Anstruth',
    'Gheldaneth': 'Arcanum of Magic',
    'Mulhorand': 'Arcanum of Magic',
    'Silverymoon': "Arkhen's Invocatorium",
    'Huzuz': "Al-Bidir Sallah (Home of the Seekers of Knowledge)",  # overwritten by a later 'Huzuz'
    'Zakhara': "Al-Bidir Sallah (Home of the Seekers of Knowledge)",
    'Waterdeep': 'Blackstaff Academy',  # overwritten by a later 'Waterdeep'
    'Luskan': 'Blackstaff Academy',
    'Baldurs Gate': "Briel's School of the Arcane",
    'The Sword Coast': "Briels School of the Arcane",  # spelled without the apostrophe used one line above
    'Zazesspur': 'Canaith',
    'Tethyr': 'Canaith',
    'Evereska': 'College of Magic and Arms',  # overwritten by a later 'Evereska'
    'The Sword Coast Faerûn': 'College of Magic and Arms',
    'Heralds Holdfast': 'College of the Herald',
    'Berdusk': 'College of Doss',
    'Myth Nantar': "Dukars' Academy",
    'Waterdeep': 'Dweomercore',  # overwritten by a later 'Waterdeep'
    'Waterdeep': 'Eltorchul Academy',  # overwritten by a later 'Waterdeep'
    'Loudwater': "Enalin's Shop of Adventurers' Supplies",
    'Delimbiyr Vale': "Enalin's Shop of Adventurers' Supplies",
    'Heliogabalus': 'Ends Rest',
    'Heldapan': 'Gathering of Magicians',
    'Durpar': 'Gathering of Magicians',
    'Akkaido': 'Golden Fox Martial Arts School',
    'Huzuz': 'Grand University of Huzuz',
    'Medina al-Afyal': 'Great Mosque of Selan',
    'Island of Afyal': 'Great Mosque of Selan',
    'Almorel': 'Great School of Deneir',
    'Waterdeep': 'Halasters Heirs',  # overwritten by the final 'Waterdeep'
    'Evereska': 'Hall of the Kaliesherai',
    'Waterdeep': 'House of Healing'
}

def assign_magic_academy(row):
    """Return the school registered for the row's settlement.

    Looks `row['settlement']` up in `wizard_schools`; settlements without an
    entry get the literal string 'None' (not the `None` object).
    """
    return wizard_schools.get(row['settlement'], 'None')

# Look up a school for every settlement row ('None' when there is no entry)
df['magic_academy'] = df.apply(assign_magic_academy, axis=1)

# Schools whose value never ended up in the column. The set of placed schools is built
# once instead of recomputing .unique() for every school. Duplicates are kept because a
# school can be listed under several locations.
placed_schools = set(df['magic_academy'].unique())
unplaced_schools = [school for school in wizard_schools.values() if school not in placed_schools]

# Print unplaced schools
print("Unplaced Schools:", unplaced_schools)

Unplaced Schools: ['Academia Vilhonus', 'Academia Vilhonus', 'Academy of Shapers and Binders', 'Academy of Stargazers', "Alimon's Mental Academy", "Alimon's Mental Academy", 'Arcanum of Magic', 'Arcanum of Magic', 'Grand University of Huzuz', 'Al-Bidir Sallah (Home of the Seekers of Knowledge)', "Briel's School of the Arcane", 'Briels School of the Arcane', 'College of Magic and Arms', 'College of the Herald', "Dukars' Academy", "Enalin's Shop of Adventurers' Supplies", "Enalin's Shop of Adventurers' Supplies", 'Gathering of Magicians', 'Gathering of Magicians', 'Golden Fox Martial Arts School', 'Great Mosque of Selan', 'Great Mosque of Selan']


In [None]:
def add_magic_academy(df, min_population=15000, placeholder='guilds'):
    """Mark large settlements that have no school with a placeholder value.

    Args:
        df: DataFrame with 'settlement_population' and 'magic_academy' columns.
            It is modified in place.
        min_population: Rows with a population strictly greater than this are
            eligible. Defaults to 15000.
        placeholder: Value written to 'magic_academy' for eligible rows whose
            current value is the string 'None'. Defaults to 'guilds'.

    Returns:
        The same DataFrame object, for call sites of the form
        `df = add_magic_academy(df)`.
    """
    # Vectorized mask; unlike a row-wise apply it is also safe on an empty frame
    needs_placeholder = (df['settlement_population'] > min_population) & (df['magic_academy'] == 'None')
    df.loc[needs_placeholder, 'magic_academy'] = placeholder
    return df
# Fill in 'guilds' for settlements over the population threshold that have no school
df = add_magic_academy(df)

In [None]:
# Frequency of every distinct magic_academy value
academy_counts = df['magic_academy'].value_counts()

# Report how many rows received the 'guilds' placeholder, if any did
if 'guilds' not in academy_counts.index:
    print("'guilds' was not added.")
else:
    guilds_count = academy_counts['guilds']
    print(f"'guilds' was added {guilds_count} times.")


'guilds' was added 117 times.


# Magical Climate

In [None]:
# Helper function to parse demographic breakdown
def parse_demographic_breakdown(x):
    """Coerce a demographic_breakdown cell to a dict.

    Args:
        x: Either a dict (returned unchanged) or the string repr of one.

    Returns:
        The parsed dict, or an empty dict when `x` is not a string/dict,
        cannot be parsed as a Python literal, or parses to something other
        than a dict (e.g. a list or a number).
    """
    if isinstance(x, dict):
        return x
    if not isinstance(x, str):
        # NaN/None and other non-text cells carry no breakdown
        return {}
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return {}
    # Downstream code builds a DataFrame from these values and needs real dicts
    return parsed if isinstance(parsed, dict) else {}

# Apply the helper function to the 'demographic_breakdown' column
idf['demographic_breakdown'] = idf['demographic_breakdown'].apply(parse_demographic_breakdown)

# Vectorized calculations for non-human count and total population
# `demographics` has one column per species key and a fresh 0..n-1 index (it is built from a list).
# NOTE(review): the assignments below align on the index -- assumes idf also has a 0..n-1 index; confirm.
demographics = pd.DataFrame(idf['demographic_breakdown'].tolist()).fillna(0)
idf['non_human_count'] = demographics.apply(lambda row: row.drop('Human', errors='ignore').sum(), axis=1)
idf['total_population_from_demo'] = demographics.sum(axis=1)

# Vectorized calculation for non-human proportion (rows with zero total keep the 0 from `out`)
# NOTE(review): np.zeros_like inherits the dtype of non_human_count; presumably float here -- an integer
# buffer could not receive the division result. Confirm.
idf['non_human_proportion'] = np.divide(idf['non_human_count'], idf['total_population_from_demo'], out=np.zeros_like(idf['non_human_count']), where=idf['total_population_from_demo'] != 0)

# Score for magic academy presence: 30 for every non-null value
# NOTE(review): on df the "no school" marker is the string 'None', which notna() counts as present;
# confirm how idf['magic_academy'] is populated.
idf['academy_score'] = idf['magic_academy'].notna().astype(int) * 30

# Normalize and score calculations with vectorized operations
min_age, max_age = idf['average_age'].min(), idf['average_age'].max()
mean_inequality, std_inequality = idf['inequality_score'].mean(), idf['inequality_score'].std()
min_non_human_proportion, max_non_human_proportion = idf['non_human_proportion'].min(), idf['non_human_proportion'].max()

# One random multiplier in [1, 5) per row and per component (columns: age, inequality, temperature, non-human)
random_factors = np.random.uniform(1, 5, (len(idf), 4))

moderate_temp = 70

idf['age_score'] = random_factors[:, 0] * ((idf['average_age'] - min_age) / (max_age - min_age))
# NOTE(review): this overwrites the existing 'inequality_score' column with a randomized z-score,
# so re-running the cell starts from already-transformed values.
idf['inequality_score'] = random_factors[:, 1] * ((idf['inequality_score'] - mean_inequality) / std_inequality)
# idf.get falls back to the scalar 0 when idf has no 'average_temperature' column
idf['temperature_score'] = random_factors[:, 2] * (idf.get('average_temperature', 0) - moderate_temp)
idf['non_human_score'] = random_factors[:, 3] * ((idf['non_human_proportion'] - min_non_human_proportion) / (max_non_human_proportion - min_non_human_proportion))

# Normalize each score to be between 1 and its respective scale factor
# After min-max scaling the four columns span [1, 25], [1, 20], [1, 35] and [1, 30] respectively
scales = np.array([24, 19, 34, 29])  # Subtracting one because we add one below
idf[['age_score', 'inequality_score', 'temperature_score', 'non_human_score']] = (
    1 + scales * (idf[['age_score', 'inequality_score', 'temperature_score', 'non_human_score']] -
    idf[['age_score', 'inequality_score', 'temperature_score', 'non_human_score']].min()) /
    (idf[['age_score', 'inequality_score', 'temperature_score', 'non_human_score']].max() -
    idf[['age_score', 'inequality_score', 'temperature_score', 'non_human_score']].min())
)

# Sum up the normalized scores to get the magical_climate
idf['magical_climate'] = idf[['academy_score', 'age_score', 'inequality_score', 'temperature_score', 'non_human_score']].sum(axis=1)

# Optional: Clip the total score if necessary (rounded, then limited to the range 4..100)
idf['magical_climate'] = idf['magical_climate'].round().clip(lower=4, upper=100)

In [None]:
# Copy the score from idf into df (the assignment aligns on the index)
df['magical_climate'] = idf['magical_climate']

# Dragon Sightings

In [None]:
# Step 1: Calculate Region Totals
# Select the columns with a list: tuple-style selection on a groupby raised a FutureWarning
# in pandas 1.x and is an error from pandas 2.0 on.
region_totals = idf.groupby('region')[['hidden_economy', 'magical_climate']].sum().reset_index()

# Step 2: Allocate Dragon Sightings to Regions, proportional to hidden economy + magical climate
total_dragon_sightings = 1000
region_totals['total_score'] = region_totals['hidden_economy'] + region_totals['magical_climate']
region_totals['allocated_dragon_sightings'] = (region_totals['total_score'] / region_totals['total_score'].sum()) * total_dragon_sightings

# Step 3: Merge the allocated_dragon_sightings to the original DataFrame
# Drop a stale copy first so re-running the cell does not create *_x / *_y duplicates.
idf = idf.drop(columns=['allocated_dragon_sightings'], errors='ignore').merge(
    region_totals[['region', 'allocated_dragon_sightings']], on='region', how='left'
)

# Step 4: Allocate Dragon Sightings to Settlements
# Each settlement gets its region's allocation in proportion to its magical climate,
# perturbed by a random factor in [0.1, 2).
random_factor = np.random.uniform(0.1, 2, size=len(idf))
idf['adjusted_magical_climate'] = idf['magical_climate'] * random_factor
idf['settlement_dragon_sightings'] = (idf['adjusted_magical_climate'] / idf.groupby('region')['adjusted_magical_climate'].transform('sum')) * idf['allocated_dragon_sightings']

# Round to whole sightings; after rounding the total need not be exactly total_dragon_sightings
idf['settlement_dragon_sightings'] = idf['settlement_dragon_sightings'].round().astype(int)

  region_totals = idf.groupby('region')['hidden_economy', 'magical_climate'].sum().reset_index()


In [None]:
# Copy the per-settlement sightings into df (the assignment aligns on the index)
df['dragon_sightings'] = idf['settlement_dragon_sightings']

In [None]:
# Sightings summed per region, largest first, with a fresh 0..n-1 index
total_dragon_sightings_by_region = (
    idf.groupby('region')['settlement_dragon_sightings']
    .sum()
    .reset_index()
    .sort_values(by='settlement_dragon_sightings', ascending=False)
    .reset_index(drop=True)
)

# Display the result
print(total_dragon_sightings_by_region)

             region  settlement_dragon_sightings
0   The Sword Coast                          469
1            Tethyr                          267
2              Thay                          205
3         Elturgard                           16
4         Mulhorand                            9
..              ...                          ...
56          Narfell                            0
57  Nelanther Isles                            0
58            Okoth                            0
59         Rashemen                            0
60         Vesperin                            0

[61 rows x 2 columns]


In [None]:
# Step 1
# Define a function to get random numbers for settlement sizes
def get_random_settlement_size(settlement_type):
    """Draw a random value for a settlement size category.

    The caller stores the result as `average_household_size`.

    Args:
        settlement_type: One of 'tiny_settlement', 'small_settlement',
            'medium_settlement', 'large_settlement', 'mega_settlement'.

    Returns:
        A float drawn uniformly from the category's range, or NaN for an
        unknown or missing category so the caller's `fillna` can substitute
        a default instead of the whole apply failing with KeyError.
    """
    size_ranges = {
        'tiny_settlement': (5, 15),
        'small_settlement': (5, 11),
        'medium_settlement': (4, 10),
        'large_settlement': (2, 7),
        'mega_settlement': (1, 8)
    }
    bounds = size_ranges.get(settlement_type)
    if bounds is None:
        return np.nan
    return np.random.uniform(*bounds)

# Apply the function to each settlement_size
idf['average_household_size'] = idf['settlement_size'].apply(get_random_settlement_size)
# Assign the filled column back: fillna(inplace=True) on a selected column is a chained
# assignment that stops updating the frame under pandas Copy-on-Write.
idf['average_household_size'] = idf['average_household_size'].fillna(idf['average_household_size'].mean())
# NOTE(review): truncation yields 0 households when the population is smaller than the
# household size, and the divisions below then produce inf/NaN -- confirm how that should be handled.
idf['total_households'] = (idf['settlement_population'] / idf['average_household_size']).astype(int)

# Step 2
# Keep only rows where class_density is a dictionary; .copy() makes the result an
# independent frame so the column assignments below do not hit a view of the old one.
idf = idf[idf['class_density'].apply(lambda x: isinstance(x, dict))].copy()

# Extract the class counts used by the debt formula (a missing class counts as 0)
for wealth_class in ('Wealthy', 'Poor', 'Squalid'):
    idf[wealth_class] = idf['class_density'].apply(lambda density, key=wealth_class: density.get(key, 0))

# Step 3
# Poor and squalid residents add to the debt base, wealthy residents reduce it
debt_base = idf['Poor'] * 7 + idf['Squalid'] * 15 - idf['Wealthy'] * 5
idf['average_debt_by_household'] = (1 + debt_base * idf['tax_rate'] / idf['total_households']).clip(lower=0)

# Ensure the total debt doesn't exceed hidden_economy
idf['average_debt_by_household'] = np.where(
    idf['average_debt_by_household'] * idf['total_households'] <= idf['hidden_economy'],
    idf['average_debt_by_household'],
    idf['hidden_economy'] / idf['total_households']
)

# Drop temporary columns
idf = idf.drop(columns=['Wealthy', 'Poor', 'Squalid', 'total_households'])

In [None]:
# Summary statistics of the computed household debt
idf['average_debt_by_household'].describe()

count    849.000000
mean       0.515758
std        7.617958
min        0.000000
25%        0.000000
50%        0.000000
75%        0.018754
max      208.831010
Name: average_debt_by_household, dtype: float64

In [None]:
# Copy the debt column into df (the assignment aligns on the index)
df['average_debt_by_household'] = idf['average_debt_by_household']

In [None]:
# Bring df's military column into idf (index-aligned) so the row function below can read it
idf['military'] = df['military']

# Function to determine development_index based on conditions
def assign_development_index(row):
    """Pick a random development focus for a settlement row.

    Args:
        row: Mapping/Series with a 'military' entry.

    Returns:
        'region improvement' or 'administration'; 'military support' is an
        additional candidate only when the row has a military, i.e. the entry
        is neither missing (None/NaN) nor the string 'None'.
    """
    options = ['region improvement', 'administration']

    military = row['military']
    # NaN != 'None' is True, so missing values have to be ruled out explicitly
    is_missing = military is None or (isinstance(military, float) and military != military)
    if not is_missing and military != 'None':
        options.append('military support')

    return random.choice(options)

# Draw one candidate focus per settlement row
per_settlement_choice = idf.apply(assign_development_index, axis=1)

# Every settlement in a region takes the choice drawn for the region's first row
idf['development_index'] = per_settlement_choice.groupby(idf['region']).transform('first')

In [None]:
# Copy the regional development focus into df (the assignment aligns on the index)
df['development_index'] = idf['development_index']

In [None]:
# Export DataFrame to CSV (to_csv, not Excel), without the index column.
# NOTE(review): df_file_path is the same path df was read from at the top of the notebook,
# so this overwrites the input file.
df.to_csv(df_file_path, index=False)

# Export the idf dataset to a CSV file
#idf.to_csv('/content/drive/MyDrive/Colab Notebooks/DnD/idf_dataset.csv', index=False)

print(f'DataFrame exported to: {df_file_path}')

DataFrame exported to: /content/drive/MyDrive/Colab Notebooks/DnD/updated_dataset.csv


In [None]:
    # Scratch dictionary of placeholder attributes. Each np.random.randint call has no `size`,
    # so every entry is one scalar draw (upper bound exclusive), not one value per settlement.
    # NOTE(review): `data` is not referenced by the other cells visible in this section.
    data = {
            "Number of Landowners": np.random.randint(1, 200),
            "Number of Homeowners": np.random.randint(10, 10000),
            "Average Rent": np.random.randint(1, 5000),
            "Likelihood of Getting Cursed": np.random.randint(0, 10),
            "Danger Level": np.random.randint(0, 10),
            "Openness to Strangers": np.random.randint(0, 100),
            "Political Climate": np.random.randint(0, 100),
            "Fate Ranking": np.random.randint(0, 10),
        }


In [None]:
# NOTE(review): this cell addresses Title Case columns (e.g. 'Dragon Sightings', 'Tax Rate') while the
# columns assigned elsewhere in this notebook are snake_case (e.g. 'dragon_sightings', 'tax_rate');
# its recorded output is a KeyError. It also calls calculate_dragon_sightings, calculate_average_age,
# calculate_openness_to_strangers and reads species_age_ranges, none of which are defined in the
# visible part of the notebook -- TODO confirm whether this cell is still meant to run.

# Adjusting Danger Level
# First version: sum of several columns, plus 10 when openness is <= 2.5 or >= 7.5
df['Danger Level'] = (
    df['Dragon Sightings']
    + df['Political Climate']
    + df['Magical Climate']
    + df['Likelihood of Getting Cursed']
    + df['Tax Rate']
    + df['Average Debt by Household']
    + df['Rumored Treasure Value']
    + df['Openness to Strangers'].apply(lambda x: 10 if x <= 2.5 or x >= 7.5 else 0)
).astype(int)

# Increase "Danger Level" if Orcs are present
df.loc[df['Percentage of population Orcs'] > 0, 'Danger Level'] += 1

# Decrease "Danger Level" if Elves are present
df.loc[df['Percentage of population Elves'] > 0, 'Danger Level'] -= 1

# Adjusting Danger Level based on the presence of Orcs and Elves
# NOTE(review): this assignment replaces the 'Danger Level' computed above with a random
# integer in 1..10, +3 with Orcs, -2 with Elves.
df['Danger Level'] = df.apply(lambda row: np.random.randint(1, 11)
                                          + (3 if row['Percentage of population Orcs'] > 0 else 0)
                                          - (2 if row['Percentage of population Elves'] > 0 else 0), axis=1)

# Adjusting Openness to Strangers: random integer in 0..10, minus 3 when Elves or Orcs are present
df['Openness to Strangers'] = df.apply(lambda row: np.random.randint(0, 11)
                                                   - (3 if row['Percentage of population Elves'] > 0
                                                       or row['Percentage of population Orcs'] > 0 else 0), axis=1)

# Adjusting Magical Climate
df['Magical Climate'] = df.apply(lambda row: row['Magical Climate']
                                             + (row['Percentage of population Elves'] / 100 * 2)
                                             + (row['Dragon Sightings'] / 10)
                                             + row['Number of Active Wizards'], axis=1)

# Pre-calculation of sum values: active + hidden gold summed per region, mapped back onto each row
grouped = df.groupby('Region')
region_gold_mapping = grouped[['Active Economy (Gold)', 'Hidden Economy (Gold)']].sum(numeric_only=True).sum(axis=1)
df['Total Gold in Region'] = df['Region'].map(region_gold_mapping)

# Main DataFrame Manipulations
df['Dragon Sightings'] = df.apply(calculate_dragon_sightings, axis=1)
# Min-max rescale to 0..50 and truncate to int
df['Dragon Sightings'] = ((df['Dragon Sightings'] - df['Dragon Sightings'].min()) /
                          (df['Dragon Sightings'].max() - df['Dragon Sightings'].min()) * 50).astype(int)

df['Average Age of Population'] = df.apply(lambda row: calculate_average_age(row, species_age_ranges), axis=1)
# Each point of danger lowers the average age by half a year, floored at 1
df['Average Age of Population'] -= df['Danger Level'] * 0.5
df['Average Age of Population'] = df['Average Age of Population'].clip(lower=1)

df['Hidden Economy (Gold)'] = df['Number of Nobility'] * 1e4
# Replaces the random 'Openness to Strangers' assigned earlier in this cell
df['Openness to Strangers'] = df.apply(calculate_openness_to_strangers, axis=1)

# Adjusting population distribution for specific species across cities
def adjust_population_distribution(df, species, percentage, city_count, random_state=None):
    """Spread a species' share of the total population evenly over random cities.

    Args:
        df: DataFrame with a 'Population of City' column; modified in place.
        species: Species name used in the target column
            f'Percentage of population {species}'.
        percentage: Share (0-100) of the summed city population assigned to
            the species.
        city_count: Number of cities to sample; 0 leaves `df` untouched.
        random_state: Optional seed/RandomState forwarded to `df.sample` for
            reproducible selection.

    Returns:
        None. The selected rows receive the percentage of their own
        population that an equal split of the species population represents.
    """
    if city_count == 0:
        # Nothing to distribute; also avoids dividing by zero below
        return

    total_population = df['Population of City'].sum()
    target_population = int(total_population * (percentage / 100))
    selected_cities = df.sample(city_count, random_state=random_state)
    each_city_population = target_population // city_count

    # The same head count goes to every selected city, expressed as a percentage of that city
    shares = each_city_population / selected_cities['Population of City'] * 100
    column = f'Percentage of population {species}'
    for city_index, share in shares.items():
        df.at[city_index, column] = share

# NOTE(review): part of the cell whose recorded output is a KeyError. `num_cities` is not defined
# in the visible part of the notebook, and the Title Case column names differ from the snake_case
# columns used elsewhere -- TODO confirm whether this code is still meant to run.

# Adjusting Economy Distribution
# Split of the total active economy: 50% / 25% / 25%. The three shares are only computed here;
# nothing visible in this cell assigns them to cities.
total_active_economy = np.sum(df['Active Economy (Gold)'])
top_10_cities_active_economy = total_active_economy * 0.50
next_100_cities_active_economy = total_active_economy * 0.25
remaining_cities_active_economy = total_active_economy * 0.25

# Known Treasure and Rumored Treasure Calculations
df['Known Treasure'] = df['Active Economy (Gold)'] * 0.50
df['Rumored Treasure'] = (df['Active Economy (Gold)'] * 0.50) + df['Hidden Economy (Gold)']

# Adjusting Political Climate and Fate Ranking
# Random placeholder inputs for the formulas below.
# np.random.uniform(0.5, 0.7) has no size, so one homeowner ratio is shared by all rows.
df['Wizarding School'] = np.random.choice(['yes', 'no'], num_cities)
df['Number of Homeowners'] = (df['Population of City'] * np.random.uniform(0.5, 0.7)).astype(int)
df['Average Rent'] = np.random.uniform(500, 2000, num_cities)
# NOTE(review): replaces any 'Average Age of Population' computed earlier with uniform random ages
df['Average Age of Population'] = np.random.uniform(25, 70, num_cities)
df['Known Treasure Value'] = np.random.uniform(0, 1e6, num_cities)

# Adjust the tax rate based on the number of nobility
df['Tax Rate'] = df.apply(lambda row: 0 if row['Number of Nobility'] == 0 else row['Tax Rate'], axis=1)

# (You need to define 'Average Debt by Household' and 'Rumored Treasure Value' before using them in the next formula)
# Mean of eight terms; 'Tax Rate' enters unscaled while the other terms are divided by a constant
df['Political Climate'] = (df['Tax Rate'] + df['Number of Homeowners']/df['Population of City'] +
                              df['Average Rent']/2000 + df['Average Debt by Household']/1e4 +
                              df['Average Age of Population']/70 + df['Hidden Economy (Gold)']/1e6 +
                              df['Known Treasure Value']/1e6 + df['Rumored Treasure Value']/1e6) / 8

# NOTE(review): the column is written as 'Fate ranking' (lower-case r); the scratch dict earlier
# in the notebook uses 'Fate Ranking'
df['Fate ranking'] = (df['Rumored Treasure Value'] / 1e6) + np.where(df['Wizarding School'] == 'yes', 1, 0) + df['Political Climate']

KeyError: ignored

In [None]:
# File paths
# NOTE(review): this rebinds df_file_path, which pointed at updated_dataset.csv earlier in the notebook
df_file_path = '/content/drive/MyDrive/Colab Notebooks/DnD/fictional_data.xlsx'
pf_file_path = '/content/drive/MyDrive/Colab Notebooks/DnD/points.xlsx'

# Read the fictional_data workbook into df (replaces the DataFrame built above)
df = pd.read_excel(df_file_path)
# Read the points workbook into pf
pf = pd.read_excel(pf_file_path)

In [None]:
# Column names, non-null counts and dtypes of the reloaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 849 entries, 0 to 848
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   settlement                  849 non-null    object 
 1   region                      849 non-null    object 
 2   settlement_population       849 non-null    int64  
 3   settlement_economy          849 non-null    float64
 4   rumored_treasure_value      849 non-null    int64  
 5   demographic_breakdown       849 non-null    object 
 6   average_age                 849 non-null    int64  
 7   most_likely_cause_of_death  849 non-null    object 
 8   government_type             849 non-null    object 
 9   class_density               849 non-null    object 
 10  tax_rate                    849 non-null    float64
 11  exports                     545 non-null    object 
 12  imports                     523 non-null    object 
 13  military                    441 non

In [None]:
# Helper columns that should not be part of the final dataset
columns_to_drop = [
    'size',
    'scale_factor',
    'scale_factor_normalized',
    'settlement_size'
]

# Dropping the columns from the DataFrame.
# errors='ignore' lets the cell be re-run (or run on a frame that lacks some of these
# columns) without raising KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')
