In [3]:
import pandas as pd
from rapidfuzz import process
import geopandas as gpd
from bokeh.io import output_notebook, show, reset_output, output_file, save
from bokeh.models import HoverTool
from bokeh.palettes import linear_palette, Greens9, Greens256
from bokeh.plotting import figure
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar, ColumnDataSource
from bokeh.transform import factor_cmap
from bokeh.io.export import export_png
from selenium import webdriver
import pycountry
import json

In [4]:
# Hand-curated lookup from free-text nationality (demonyms, abbreviations and
# spellings seen in the membership data) to the country label used downstream.
# Keys are written in lower case; build_combined_map() lower-cases them again
# before use, so matching is case-insensitive either way.
# NOTE(review): the values are later passed to country_to_iso3(); labels such as
# "Congo (Brazzaville)" / "Congo (Kinshasa)" may not resolve to an ISO code
# there — verify against the installed pycountry version.
MANUAL_NATIONALITY_MAP = {
    "ghanaian": "Ghana",
    "nigerian": "Nigeria",
    "british": "United Kingdom",
    "american": "United States of America",
    "dutch": "Netherlands",
    "german": "Germany",
    "french": "France",
    "swiss": "Switzerland",
    "kenyan": "Kenya",
    "tanzanian": "Tanzania",
    "ugandan": "Uganda",
    "rwandan": "Rwanda",
    "burundian": "Burundi",
    "ethiopian": "Ethiopia",
    "zambian": "Zambia",
    "malawian": "Malawi",
    "botswanan": "Botswana",
    "zimbabwean": "Zimbabwe",
    "namibian": "Namibia",
    "south african": "South Africa",
    "ivorian": "Côte d'Ivoire",
    "congolese": "Congo (Brazzaville)",
    "dr congolese": "Congo (Kinshasa)",
    "cameroonian": "Cameroon",
    "senegalese": "Senegal",
    "burkinabe": "Burkina Faso",
    "sudanese": "Sudan",
    "somali": "Somalia",
    "malian": "Mali",
    "nigerien": "Niger",
    "liberian": "Liberia",
    "sierra leonean": "Sierra Leone",
    "gambian": "Gambia",
    "guinean": "Guinea",
    "mozambican": "Mozambique",
    "angolan": "Angola",
    "chadian": "Chad",
    "eritrean": "Eritrea",
    "algerian": "Algeria",
    "moroccan": "Morocco",
    "tunisian": "Tunisia",
    "libyan": "Libya",
    "egyptian": "Egypt",
    "emirati": "United Arab Emirates",
    "saudi": "Saudi Arabia",
    "kuwaiti": "Kuwait",
    "qatari": "Qatar",
    "iranian": "Iran",
    "iraqi": "Iraq",
    "lebanese": "Lebanon",
    "palestinian": "Palestine",
    "syrian": "Syria",
    "jordanian": "Jordan",
    "israeli": "Israel",
    "turkish": "Türkiye",
    "indian": "India",
    "pakistani": "Pakistan",
    "bangladeshi": "Bangladesh",
    "nepali": "Nepal",
    "sri lankan": "Sri Lanka",
    "afghan": "Afghanistan",
    "chinese": "China",
    "japanese": "Japan",
    "korean": "South Korea",
    "vietnamese": "Vietnam",
    "thai": "Thailand",
    "filipino": "Philippines",
    "malaysian": "Malaysia",
    "indonesian": "Indonesia",
    "australian": "Australia",
    "canadian": "Canada",
    "mexican": "Mexico",
    "brazilian": "Brazil",
    "argentinian": "Argentina",
    "chilean": "Chile",
    "peruvian": "Peru",
    "colombian": "Colombia",
    "venezuelan": "Venezuela",
    "cuban": "Cuba",
    "haitian": "Haiti",
    "hungarian": "Hungary",
    "italiana": "Italy",
    "greek": "Greece",
    "swedish": "Sweden",
    "finnish": "Finland",
    # Non-demonym spellings encountered in the data
    "usa": "United States of America",
    "nederland": "Netherlands",
    "spanish": "Spain",
    "nederlandse": "Netherlands",
    "irish/english": "Ireland",
    # Was listed twice ("Turkey" then "Türkiye"); only the last entry ever took
    # effect, so the dead duplicate has been removed.
    "turkey": "Türkiye",
}

def build_combined_map():
    """Build the lower-cased lookup used to resolve nationality strings.

    Every pycountry entry contributes its ``name`` (and ``official_name`` when
    the entry has one), both pointing at the entry's ``name``. The entries of
    ``MANUAL_NATIONALITY_MAP`` are applied last, so they override any pycountry
    key they collide with.

    Returns:
        dict mapping lower-cased text to a country label.
    """
    combined = {}

    # Names known to pycountry
    for entry in pycountry.countries:
        canonical = entry.name
        combined[canonical.lower()] = canonical
        if hasattr(entry, 'official_name'):
            combined[entry.official_name.lower()] = canonical

    # Manual entries win over the pycountry-derived ones
    combined.update(
        {alias.lower(): label for alias, label in MANUAL_NATIONALITY_MAP.items()}
    )
    return combined

def fuzzy_map_nationality(series, threshold=85):
    """Map free-text nationality values to country labels.

    Each value is converted to text, lower-cased and stripped, then resolved
    against the keys of ``build_combined_map()``: an exact key hit is used
    directly, otherwise the best rapidfuzz match is accepted when it scores at
    least ``threshold``.

    Args:
        series: pandas Series of raw nationality values; missing, blank and
            non-string entries are tolerated.
        threshold: minimum rapidfuzz score for a fuzzy match to be accepted.

    Returns:
        Series with the same index holding the mapped country label, or a
        missing value where nothing matched.
    """
    nationality_map = build_combined_map()
    choices = list(nationality_map.keys())

    def map_one(value):
        if pd.isna(value):
            return None
        # Coerce before normalising: calling .lower() on the raw value raised
        # AttributeError for non-string entries (e.g. numbers).
        text = str(value).lower().strip()
        if not text:
            return None
        # Exact keys need no fuzzy search.
        if text in nationality_map:
            return nationality_map[text]
        # score_cutoff discards candidates scoring below the threshold;
        # extractOne returns None when nothing qualifies (or choices is empty).
        best = process.extractOne(text, choices, score_cutoff=threshold)
        if best is None:
            return None  # or return value for fallback
        return nationality_map[best[0]]

    return series.apply(map_one)

def country_to_iso3(name):
    """Return the ISO 3166-1 alpha-3 code for a country name, or None.

    Args:
        name: country name (or any identifier pycountry's ``lookup`` accepts).

    Returns:
        The three-letter code as a string, or None when ``name`` is not a
        string or pycountry has no matching record.
    """
    # Missing values (None / NaN) and other non-strings cannot be looked up.
    if not isinstance(name, str):
        return None
    try:
        return pycountry.countries.lookup(name).alpha_3
    except LookupError:
        # Unknown country: best-effort None. Anything else (interrupts,
        # programming errors) is allowed to propagate instead of being hidden.
        return None

In [5]:
# Get membership data
# Only rows on one of these plans count as members.
MEMBERSHIP_PLANS = ['MEMBERSHIP', 'Membership with automatic renewal', '1 year Membership']

df = pd.read_csv("membership.csv")
# .copy() makes the filtered frame independent of the raw CSV frame, so the
# column assignments below do not trigger SettingWithCopyWarning.
df = df[df['Plan'].isin(MEMBERSHIP_PLANS)].copy()
df['Nationality'] = df['Nationality'].str.lower()
# Country is None where no mapping reached the fuzzy-match threshold.
df['Country'] = fuzzy_map_nationality(df['Nationality'])

In [6]:
# Rows whose nationality could not be mapped to a country.
# Anything listed here needs a new entry in the mapping!
df.loc[df['Country'].isna(), ['Country', 'Nationality']]

Unnamed: 0,Country,Nationality


In [26]:
# Add count/frequency of each country
country_counts = df['Country'].value_counts().rename_axis('Country').reset_index(name='Count')
country_counts['iso_a3'] = country_counts['Country'].apply(country_to_iso3)
country_counts.loc[country_counts['Country'] == 'France', 'iso_a3'] = 'FRA'

# Countries without an ISO3 code cannot be placed on the map; report them
# instead of letting them vanish silently in the merge below.
unresolved = country_counts.loc[country_counts['iso_a3'].isna(), 'Country'].tolist()
if unresolved:
    print(f"No ISO3 code found, not shown on the map: {unresolved}")

# Several labels can resolve to the same ISO3 code (e.g. a pycountry name and a
# manually mapped label). Collapse to one row per code so the left merge does
# not duplicate polygons and split a country's count across them.
country_counts = (
    country_counts
    .dropna(subset=['iso_a3'])
    .groupby('iso_a3', as_index=False, sort=False)
    .agg(Country=('Country', 'first'), Count=('Count', 'sum'))
)

# Load world country polygons (GeoJSON) into a GeoDataFrame
url = "https://raw.githubusercontent.com/datasets/geo-countries/master/data/countries.geojson"
world = gpd.read_file(url)
# Set the ISO3 codes for France and Norway explicitly
# NOTE(review): presumably the dataset lacks usable codes for these two — confirm.
world.loc[world['name'] == 'France', 'ISO3166-1-Alpha-3'] = 'FRA'
world.loc[world['name'] == 'Norway', 'ISO3166-1-Alpha-3'] = 'NOR'
# Simplify the geometries before serialising them to GeoJSON
world["geometry"] = world["geometry"].simplify(tolerance=0.05, preserve_topology=True)

# Merge frequency data with world GeoDataFrame
world = world.merge(country_counts, how='left', left_on='ISO3166-1-Alpha-3', right_on='iso_a3')
# Countries without members get an explicit zero
world['Count'] = world['Count'].fillna(0)

reset_output()

# Convert to GeoJSON
geo_source = GeoJSONDataSource(geojson=world.to_json())

In [27]:
# Choropleth of member counts per country.
# Palette: white first, followed by 200 greens sampled from the reversed
# Greens256 ramp with its first 30 entries skipped.
green_palette = ['#ffffff', *linear_palette(Greens256[::-1][30:], 200)]
color_mapper = LinearColorMapper(
    palette=green_palette,
    low=world['Count'].min(),
    high=world['Count'].max(),
)

# Tooltip showing the polygon's name and its member count
hover = HoverTool(tooltips=[("Country", "@name"), ("Count", "@Count")])

p = figure(
    title="Stamily Nationalities",
    width=900,
    height=500,
    toolbar_location="left",
    tools=[hover, 'pan', 'wheel_zoom', 'reset'],
)

# One patch per country, filled according to its Count
p.patches(
    'xs',
    'ys',
    source=geo_source,
    fill_color={'field': 'Count', 'transform': color_mapper},
    fill_alpha=0.8,
    line_color="gray",
    line_width=0.5,
)

# Colour legend to the right of the map
color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12, location=(0, 0))
p.add_layout(color_bar, 'right')

# Write the standalone HTML file, then display the figure
output_file("stamily_map.html")
save(p)

show(p)

In [28]:
# Member totals per country, largest first
country_counts = (
    df['Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

# Data source for the bars, and the category order for the x-axis
source = ColumnDataSource(country_counts)
countries = country_counts['Country'].tolist()

# Categorical figure without a toolbar
p = figure(
    title="Number of Stamily members per country (based on nationalities)",
    x_range=countries,
    width=800,
    height=400,
    tools="",
    toolbar_location=None,
)

# One bar per country, coloured by position using the reversed green palette
bar_fill = factor_cmap('Country', palette=green_palette[::-1], factors=countries)
p.vbar(
    x='Country',
    top='Count',
    source=source,
    width=0.6,
    fill_color=bar_fill,
    line_color="white",
)

# Hover tooltip with the country and its count
hover = HoverTool(tooltips=[("Country", "@Country"), ("Count", "@Count")])
p.add_tools(hover)

# Styling: no vertical grid lines, bars start at zero, rotated x labels
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.xaxis.major_label_orientation = 1.0  # keeps many country labels readable

# Write/show the HTML version, then export a PNG copy
output_file("country_barchart.html")
show(p)

export_png(p, filename="country_barchart.png")

'/home/jurjen/Documents/Python/stamily_in_data/membership/country_barchart.png'