## STEP 0 — Import all required libraries


In [18]:
import os
import math
import re

import numpy as np
import pandas as pd
import pymysql

import custom_functions  # cleaning & encoding helpers

pd.options.display.max_columns = 100


## STEP 1 — Define file paths and abbreviations

We list the 8 CSV files (health + environmental indicators) using the same paths as in `main.ipynb`.

They are split into:
- **Notation 1:** semicolon-delimited, clean files  
- **Notation 2:** comma-delimited, messy files requiring extra cleaning  


In [19]:
# Load the regions file 
regions_path = "region_list.csv"
regions_df = pd.read_csv(regions_path)

print("Original columns in region_list.csv:", regions_df.columns.tolist())
print(regions_df.head())

# Base folder where you keep the cleaned CSVs
DATA_DIR = "CLEANED_DATA"

# File paths for notation 1 ('che', 'wr', 'wu', 'sr', 'su', 'gem')
filepaths_1 = [
    os.path.join(DATA_DIR, "che_cleaned.csv"),
    os.path.join(DATA_DIR, "wr_cleaned.csv"),
    os.path.join(DATA_DIR, "wu_cleaned.csv"),
    os.path.join(DATA_DIR, "sr_cleaned.csv"),
    os.path.join(DATA_DIR, "su_cleaned.csv"),
    os.path.join(DATA_DIR, "gem_cleaned.csv"),
]

# File paths for notation 2 ('pop', 'ren')
filepaths_2 = [
    os.path.join(DATA_DIR, "pop_cleaned.csv"),
    os.path.join(DATA_DIR, "ren_cleaned.csv"),
]

# (unchanged) abbreviations
abbreviations_1 = ['che', 'wr', 'wu', 'sr', 'su', 'gem']
abbreviations_2 = ['pop', 'ren']



Original columns in region_list.csv: ['country_id', 'country_name', 'country_code', 'region']
   country_id          country_name country_code  \
0           1                Angola          AGO   
1           2               Albania          ALB   
2           3               Andorra          AND   
3           4            Arab World          ARB   
4           5  United Arab Emirates          ARE   

                                              region  
0                                             Africa  
1                            Europe and Central Asia  
2                            Europe and Central Asia  
3                                             REGION  
4  Middle East, North Africa, Afghanistan & Pakistan  


In [20]:
# ---- 1. Load regions file (already cleaned earlier) ----
regions_path = "region_list.csv"
regions_df = pd.read_csv(regions_path)

# Normalize columns
regions_df.columns = [c.strip().lower().replace(" ", "_") for c in regions_df.columns]

if "country_code" in regions_df.columns:
    country_col = "country_code"
elif "country code" in regions_df.columns:
    country_col = "country code"
else:
    raise ValueError("Could not find a 'country_code' column in region_list.csv")

if "region" not in regions_df.columns:
    raise ValueError("Could not find a 'region' column in region_list.csv")

regions_df = (
    regions_df[[country_col, "region"]]
    .drop_duplicates()
    .rename(columns={country_col: "country_code"})
)

print("regions_df preview:")
print(regions_df.head())

# ---- 2. Fresh connection + cursor (IMPORTANT) ----
conn = pymysql.connect(
    host="localhost",
    user="root",
    password="Jj698093738013!!",   # your real password
    database="bigdata_project",    # make sure we select the right DB
    autocommit=True
)
cursor = conn.cursor()
print("Connected to MySQL (bigdata_project)")

# Before update: how many countries already have region?
cursor.execute("SELECT COUNT(*) FROM countries WHERE region IS NOT NULL;")
print("Before update – countries with region set:", cursor.fetchone()[0])

# ---- 3. Apply updates ----
update_sql = """
    UPDATE countries
    SET region = %s
    WHERE country_code = %s;
"""

rows_updated = 0
for row in regions_df.itertuples(index=False):
    region_val = row.region
    cc_val     = row.country_code
    cursor.execute(update_sql, (region_val, cc_val))
    rows_updated += cursor.rowcount  # how many rows actually changed

print("Rows updated in countries.region:", rows_updated)

# After update: check again
cursor.execute("SELECT COUNT(*) FROM countries WHERE region IS NOT NULL;")
print("After update – countries with region set:", cursor.fetchone()[0])

# Small sample
cursor.execute("""
    SELECT country_code, country_name, region
    FROM countries
    WHERE region IS NOT NULL
    ORDER BY country_code
    LIMIT 10;
""")
print("\nSample of countries with region:")
for r in cursor.fetchall():
    print(r)

cursor.close()
conn.close()
print("\n✅ Region update completed.")


regions_df preview:
  country_code                                             region
0          AGO                                             Africa
1          ALB                            Europe and Central Asia
2          AND                            Europe and Central Asia
3          ARB                                             REGION
4          ARE  Middle East, North Africa, Afghanistan & Pakistan
Connected to MySQL (bigdata_project)
Before update – countries with region set: 0
Rows updated in countries.region: 261
After update – countries with region set: 261

Sample of countries with region:
('AGO', 'Angola', 'Africa')
('ALB', 'Albania', 'Europe and Central Asia')
('AND', 'Andorra', 'Europe and Central Asia')
('ARB', 'Arab World', 'REGION')
('ARE', 'United Arab Emirates', 'Middle East, North Africa, Afghanistan & Pakistan')
('ARG', 'Argentina', 'Latin America and the Caribbean')
('ARM', 'Armenia', 'Europe and Central Asia')
('ASM', 'American Samoa', 'OCEANIA')
('ATG', 

In [21]:
#Normalize column names and keep only country_code + region

# Make all column names lowercase, strip spaces, replace spaces with underscores
regions_df.columns = [
    c.strip().lower().replace(" ", "_") for c in regions_df.columns
]
print("Normalized columns:", regions_df.columns.tolist())

# Try to figure out which columns are country_code and region
if "country_code" in regions_df.columns:
    country_col = "country_code"
elif "country code" in regions_df.columns:
    country_col = "country code"
else:
    raise ValueError("Could not find a 'country_code' column in region_list.csv")

if "region" not in regions_df.columns:
    raise ValueError("Could not find a 'region' column in region_list.csv")

# Keep only what we need and drop duplicates
regions_df = (
    regions_df[[country_col, "region"]]
    .drop_duplicates()
    .rename(columns={country_col: "country_code"})
)

print("Cleaned regions_df:")
print(regions_df.head())
print("Unique regions:", regions_df["region"].unique())


Normalized columns: ['country_code', 'region']
Cleaned regions_df:
  country_code                                             region
0          AGO                                             Africa
1          ALB                            Europe and Central Asia
2          AND                            Europe and Central Asia
3          ARB                                             REGION
4          ARE  Middle East, North Africa, Afghanistan & Pakistan
Unique regions: ['Africa' 'Europe and Central Asia' 'REGION'
 'Middle East, North Africa, Afghanistan & Pakistan'
 'Latin America and the Caribbean' 'OCEANIA' 'South Asia'
 'East Asia and Pacific' 'North America' 'SOCIOECONOMIC']


In [22]:
# %% [markdown]
# ## STEP 2 — Load already-cleaned CSV files
#
# At this point, we are no longer working with the original messy World Bank CSVs.
# Instead, we use the pre-cleaned files:
#   - che_cleaned.csv, wr_cleaned.csv, wu_cleaned.csv,
#   - sr_cleaned.csv, su_cleaned.csv, gem_cleaned.csv,
#   - pop_cleaned.csv, ren_cleaned.csv
#
# These are standard comma-separated CSVs with a proper header row:
# ["Country Name","Country Code","Indicator Name","Indicator Code","1960","1961",...]
#
# Because they are clean, we do NOT need custom encoding detection or `clean_csv` here.
# We can read them directly with `pandas.read_csv`.

# Make sure DATA_DIR is already defined above as:
# DATA_DIR = "CLEANED_DATA"

import os
import pandas as pd

che_path = os.path.join(DATA_DIR, "che_cleaned.csv")
wr_path  = os.path.join(DATA_DIR, "wr_cleaned.csv")
wu_path  = os.path.join(DATA_DIR, "wu_cleaned.csv")
sr_path  = os.path.join(DATA_DIR, "sr_cleaned.csv")
su_path  = os.path.join(DATA_DIR, "su_cleaned.csv")
gem_path = os.path.join(DATA_DIR, "gem_cleaned.csv")
pop_path = os.path.join(DATA_DIR, "pop_cleaned.csv")
ren_path = os.path.join(DATA_DIR, "ren_cleaned.csv")

# Read all cleaned CSVs directly
df_che = pd.read_csv(che_path)
df_wr  = pd.read_csv(wr_path)
df_wu  = pd.read_csv(wu_path)
df_sr  = pd.read_csv(sr_path)
df_su  = pd.read_csv(su_path)
df_gem = pd.read_csv(gem_path)
df_pop = pd.read_csv(pop_path)
df_ren = pd.read_csv(ren_path)

print("Shapes of cleaned DataFrames (from CLEANED_DATA):")
for name, df in {
    "che": df_che, "wr": df_wr, "wu": df_wu,
    "sr": df_sr, "su": df_su, "gem": df_gem,
    "pop": df_pop, "ren": df_ren,
}.items():
    print(f"{name}: {df.shape}")


Shapes of cleaned DataFrames (from CLEANED_DATA):
che: (235, 16)
wr: (214, 16)
wu: (226, 16)
sr: (117, 16)
su: (154, 16)
gem: (224, 16)
pop: (265, 16)
ren: (258, 16)


In [23]:
# %% [markdown]
# ## STEP 3 — Sanity check: column names of the cleaned files
#
# We expect each cleaned file to have columns like:
#   ["Country Name", "Country Code", "Indicator Name", "Indicator Code", "1960", "1961", ...]
# If any file has a different structure, we will see it here.

for name, df in {
    "che": df_che, "wr": df_wr, "wu": df_wu,
    "sr": df_sr, "su": df_su, "gem": df_gem,
    "pop": df_pop, "ren": df_ren,
}.items():
    print(f"\n{name} columns:")
    print(list(df.columns))



che columns:
['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']

wr columns:
['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']

wu columns:
['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']

sr columns:
['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']

su columns:
['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']

gem columns:
['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '2008', '2009', '2010', '2

## STEP 4 — Convert all datasets into a unified long-format table

Each dataset is converted from wide (1960–2024 columns) into long format:


In [24]:
# STEP 4 – Melting function that works with wide OR long cleaned CSVs

def melt_indicator(df):
    """
    Convert a World Bank-style DataFrame to a standard long format with columns:
    Country Name, Country Code, Indicator Name, Indicator Code, Year, Value.

    It handles two cases:
    1. Wide format (many year columns: 1960, 1961, ...)
    2. Already-long format (has 'Year' and 'Value' columns)
    """
    cols = list(df.columns)

    # ---- Case A: already LONG format (has Year + Value) ----
    if "Year" in cols and "Value" in cols:
        df2 = df.copy()

        # guess meta columns: everything except Year & Value (take first four)
        meta_guess = [c for c in cols if c not in ["Year", "Value"]][:4]

        # if there are fewer than 4, we still rename what we have
        rename_map = {}
        standard_meta = ["Country Name", "Country Code", "Indicator Name", "Indicator Code"]
        for i, col in enumerate(meta_guess):
            rename_map[col] = standard_meta[i]

        df2 = df2.rename(columns=rename_map)

        # Make sure Year/Value are numeric
        df2["Year"] = pd.to_numeric(df2["Year"], errors="coerce").astype("Int64")
        df2["Value"] = pd.to_numeric(df2["Value"], errors="coerce")

        # Ensure we return the standard columns (where they exist)
        needed_cols = ["Country Name", "Country Code", "Indicator Name", "Indicator Code", "Year", "Value"]
        existing = [c for c in needed_cols if c in df2.columns]
        return df2[existing]

    # ---- Case B: WIDE format (many year columns) ----
    # Identify first 4 metadata columns
    if len(cols) < 4:
        raise ValueError(f"DataFrame has too few columns to interpret as wide or long: {cols}")

    meta_cols = cols[:4]  # assume first 4 are metadata

    rename_map = {
        meta_cols[0]: "Country Name",
        meta_cols[1]: "Country Code",
        meta_cols[2]: "Indicator Name",
        meta_cols[3]: "Indicator Code",
    }

    df2 = df.rename(columns=rename_map).copy()

    # Year columns = those that look like four-digit years
    year_cols = [c for c in df2.columns if re.fullmatch(r"\d{4}", str(c))]

    if not year_cols:
        raise ValueError(
            f"No 4-digit year columns found in DataFrame. Columns are:\n{df2.columns.tolist()}"
        )

    long_df = df2.melt(
        id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
        value_vars=year_cols,
        var_name="Year",
        value_name="Value"
    )

    long_df["Year"] = long_df["Year"].astype(int)
    long_df["Value"] = pd.to_numeric(long_df["Value"], errors="coerce")

    return long_df


# Apply to all 8 cleaned DataFrames
che_long  = melt_indicator(df_che)
wr_long   = melt_indicator(df_wr)
wu_long   = melt_indicator(df_wu)
sr_long   = melt_indicator(df_sr)
su_long   = melt_indicator(df_su)
gem_long  = melt_indicator(df_gem)
pop_long  = melt_indicator(df_pop)
ren_long  = melt_indicator(df_ren)

# Concatenate everything
all_long = pd.concat(
    [che_long, wr_long, wu_long, sr_long, su_long, gem_long, pop_long, ren_long],
    ignore_index=True
)

print("all_long shape:", all_long.shape)
all_long.head()


all_long shape: (20316, 6)


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Value
0,Africa Eastern and Southern,AFE,Current health expenditure (% of GDP),SH.XPD.CHEX.GD.ZS,2008,5.886538
1,Afghanistan,AFG,Current health expenditure (% of GDP),SH.XPD.CHEX.GD.ZS,2008,10.256495
2,Africa Western and Central,AFW,Current health expenditure (% of GDP),SH.XPD.CHEX.GD.ZS,2008,3.654871
3,Angola,AGO,Current health expenditure (% of GDP),SH.XPD.CHEX.GD.ZS,2008,3.322903
4,Albania,ALB,Current health expenditure (% of GDP),SH.XPD.CHEX.GD.ZS,2008,5.509003


## STEP 5 — Build `countries_df`, `indicators_df`, `values_df` (with regions)

In this step we construct the three logical tables that will later be inserted into MySQL:

- `countries_df` – one row per country, with a stable `country_id`, ISO country code, country name, and **region**.
- `indicators_df` – one row per indicator, with a stable `indicator_id`, code, name, and a simple extracted unit.
- `values_df` – all numeric values (facts) in long format: `(country_id, indicator_id, year, value)`.

We additionally integrate an external file:

- `region_list.csv` – a mapping from `country_code` → `region`.

This file is merged into `countries_df` *before* inserting into MySQL, so that the final database and the `all_data` view contain region information for each country.


In [25]:
# STEP 5 — Build countries_df, indicators_df, values_df (with regions)

# 5.1 Build base countries_df from all_long
countries_df = (
    all_long[["Country Code", "Country Name"]]
    .drop_duplicates()
    .sort_values("Country Code")
    .reset_index(drop=True)
)

# Create a stable integer key
countries_df["country_id"] = countries_df.index + 1

# Rename columns to database-friendly names
countries_df = countries_df.rename(columns={
    "Country Code": "country_code",
    "Country Name": "country_name"
})

print("Base countries_df (before regions) shape:", countries_df.shape)
print(countries_df.head())


# 5.2 Integrate region information from region_list.csv

# The file region_list.csv should live in the same folder as this notebook
# and contain at least two columns: country_code, region
regions_path = "region_list.csv"
regions_df = pd.read_csv(regions_path)

# Normalize column names (lowercase, underscores)
regions_df.columns = [c.strip().lower().replace(" ", "_") for c in regions_df.columns]

if "country_code" not in regions_df.columns or "region" not in regions_df.columns:
    raise ValueError(
        "region_list.csv must contain columns named 'country_code' and 'region'. "
        f"Found columns: {regions_df.columns.tolist()}"
    )

# Keep only the mapping we care about and drop duplicates
regions_df = (
    regions_df[["country_code", "region"]]
    .drop_duplicates()
)

print("\nregions_df preview:")
print(regions_df.head())

# Left-join: keep all countries, add region where we have a match
countries_df = countries_df.merge(
    regions_df,
    on="country_code",
    how="left"
)

print("\nCountries with non-null region (in DataFrame):",
      countries_df["region"].notna().sum())

print("countries_df (after merge with regions) sample:")
print(countries_df.head())


# 5.3 Build indicators_df

indicators_df = (
    all_long[["Indicator Code", "Indicator Name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

indicators_df["indicator_id"] = indicators_df.index + 1

def extract_unit(name):
    """
    Simple heuristic: extract whatever is inside parentheses (...) as the unit.
    If none found, we label it 'original units'.
    """
    m = re.search(r"\((.*?)\)", str(name))
    return m.group(1) if m else "original units"

indicators_df["unit"] = indicators_df["Indicator Name"].apply(extract_unit)

indicators_df = indicators_df.rename(columns={
    "Indicator Code": "indicator_code",
    "Indicator Name": "indicator_name"
})

print("\nindicators_df shape:", indicators_df.shape)
print(indicators_df.head())


# 5.4 Build values_df (fact table) using IDs from countries_df and indicators_df

country_code_to_id   = dict(zip(countries_df["country_code"],  countries_df["country_id"]))
indicator_code_to_id = dict(zip(indicators_df["indicator_code"], indicators_df["indicator_id"]))

values_df = all_long.copy()
values_df["country_id"]   = values_df["Country Code"].map(country_code_to_id)
values_df["indicator_id"] = values_df["Indicator Code"].map(indicator_code_to_id)

# Keep only the columns needed for the fact table
values_df = values_df[["country_id", "indicator_id", "Year", "Value"]]
values_df = values_df.rename(columns={"Year": "year", "Value": "value"})

print("\nvalues_df shape:", values_df.shape)
print(values_df.head())


Base countries_df (before regions) shape: (265, 3)
  country_code                 country_name  country_id
0          ABW                        Aruba           1
1          AFE  Africa Eastern and Southern           2
2          AFG                  Afghanistan           3
3          AFW   Africa Western and Central           4
4          AGO                       Angola           5

regions_df preview:
  country_code                                             region
0          AGO                                             Africa
1          ALB                            Europe and Central Asia
2          AND                            Europe and Central Asia
3          ARB                                             REGION
4          ARE  Middle East, North Africa, Afghanistan & Pakistan

Countries with non-null region (in DataFrame): 261
countries_df (after merge with regions) sample:
  country_code                 country_name  country_id  region
0          ABW                  

## STEP 6 — Connect to MySQL and create database


In [26]:
# Conect to MySQL 
conn = pymysql.connect(
    host="localhost",
    user="root",
    password="Jj698093738013!!",   # ← your real password
    autocommit=True
)

cursor = conn.cursor()

cursor.execute("CREATE DATABASE IF NOT EXISTS bigdata_project;")
cursor.execute("USE bigdata_project;")

print("Connected to MySQL and using database bigdata_project.")


Connected to MySQL and using database bigdata_project.


## STEP 7 — Reset tables and recreate schema
We drop:
- indicator_values  
- indicators  
- countries  

Then recreate all three.


In [27]:
cursor.execute("SET FOREIGN_KEY_CHECKS = 0;")
cursor.execute("DROP TABLE IF EXISTS indicator_values;")
cursor.execute("DROP TABLE IF EXISTS indicators;")
cursor.execute("DROP TABLE IF EXISTS countries;")
cursor.execute("SET FOREIGN_KEY_CHECKS = 1;")

cursor.execute("""
CREATE TABLE countries (
    country_id INT PRIMARY KEY,
    country_code VARCHAR(5),
    country_name VARCHAR(255),
    region VARCHAR(100)
);
""")

cursor.execute("""
CREATE TABLE indicators (
    indicator_id INT PRIMARY KEY,
    indicator_code VARCHAR(255),
    indicator_name VARCHAR(500),
    unit VARCHAR(100)
);
""")

cursor.execute("""
CREATE TABLE indicator_values (
    value_id BIGINT AUTO_INCREMENT PRIMARY KEY,
    country_id INT,
    indicator_id INT,
    year INT,
    value DOUBLE,
    FOREIGN KEY (country_id) REFERENCES countries(country_id),
    FOREIGN KEY (indicator_id) REFERENCES indicators(indicator_id)
);
""")

print("Tables countries, indicators, indicator_values created.")


Tables countries, indicators, indicator_values created.


## STEP 8 — Insert countries and indicators


In [28]:
insert_countries_sql = """
    INSERT INTO countries (country_id, country_code, country_name, region)
    VALUES (%s, %s, %s, %s);
"""

for _, row in countries_df.iterrows():
    cursor.execute(insert_countries_sql, (int(row.country_id), row["country_code"], row["country_name"], row["region"]))

print("Inserted countries:", len(countries_df))

print("\n Inserting indicators...")

insert_indicators_sql = """
    INSERT INTO indicators (indicator_id, indicator_code, indicator_name, unit)
    VALUES (%s, %s, %s, %s);
"""

for _, row in indicators_df.iterrows():
    cursor.execute(insert_indicators_sql, (int(row.indicator_id), row["indicator_code"], row["indicator_name"], row["unit"]))

print("Inserted indicators:", len(indicators_df))


ProgrammingError: nan can not be used with MySQL

## STEP 9 — Insert ~100,000 indicator_values in batches


In [None]:
cursor.execute("TRUNCATE TABLE indicator_values;")

rows = []
for row in values_df.itertuples(index=False):
    val = None if pd.isna(row.value) else row.value
    rows.append((int(row.country_id), int(row.indicator_id), int(row.year), val))

total = len(rows)
print("Total rows to insert into indicator_values:", total)

insert_values_sql = """
    INSERT INTO indicator_values (country_id, indicator_id, year, value)
    VALUES (%s, %s, %s, %s);
"""

conn.autocommit(False)
batch_size = 5000
inserted = 0

for start in range(0, total, batch_size):
    batch = rows[start:start + batch_size]
    cursor.executemany(insert_values_sql, batch)
    conn.commit()
    inserted += len(batch)
    print(f"Inserted {inserted} / {total} rows...", end="\r")

conn.autocommit(True)
print(f"\nFinished inserting {inserted} rows into indicator_values.")


Total rows to insert into indicator_values: 20316
Inserted 20316 / 20316 rows...
Finished inserting 20316 rows into indicator_values.


## STEP 10 — Create view *all_data*

This view joins all three tables into a single logical dataset that you can query directly.


In [None]:
cursor.execute("""
CREATE OR REPLACE VIEW all_data AS
SELECT
    iv.value_id,
    iv.year,
    iv.value,
    c.country_id,
    c.country_code,
    c.country_name,
    c.region,
    i.indicator_id,
    i.indicator_code,
    i.indicator_name,
    i.unit
FROM indicator_values iv
JOIN countries  c ON iv.country_id   = c.country_id
JOIN indicators i ON iv.indicator_id = i.indicator_id;
""")

print("View all_data created.")


View all_data created.


## STEP 11 — Sanity checks
We count rows and preview the joined dataset.


In [None]:
print("\n Sanity checks:")

cursor.execute("SELECT COUNT(*) FROM countries;")
print("countries rows:", cursor.fetchone()[0])

cursor.execute("SELECT COUNT(*) FROM indicators;")
print("indicators rows:", cursor.fetchone()[0])

cursor.execute("SELECT COUNT(*) FROM indicator_values;")
print("indicator_values rows:", cursor.fetchone()[0])

sample_df = pd.read_sql("SELECT * FROM all_data LIMIT 10;", conn)
display(sample_df)

print("\n Database setup completed successfully.")



 Sanity checks:
countries rows: 265
indicators rows: 8
indicator_values rows: 20316


  sample_df = pd.read_sql("SELECT * FROM all_data LIMIT 10;", conn)


DatabaseError: Execution failed on sql: SELECT * FROM all_data LIMIT 10;
(0, '')
unable to rollback

In [None]:
import pandas as pd
import pymysql

conn = pymysql.connect(
    host="localhost",
    user="root",
    password="Jj698093738013!!",
    database="bigdata_project",
    autocommit=True
)

sample_regions = pd.read_sql("""
    SELECT country_code, country_name, region
    FROM countries
    WHERE region IS NOT NULL
    ORDER BY country_code
    LIMIT 20;
""", conn)

print("Countries with region set (sample):")
display(sample_regions)

sample_all_data = pd.read_sql("""
    SELECT country_code, country_name, region, indicator_code, year, value
    FROM all_data
    WHERE region IS NOT NULL
    LIMIT 20;
""", conn)

print("\nSample from all_data with region filled:")
display(sample_all_data)

conn.close()


Countries with region set (sample):


  sample_regions = pd.read_sql("""


Unnamed: 0,country_code,country_name,region



Sample from all_data with region filled:


  sample_all_data = pd.read_sql("""


Unnamed: 0,country_code,country_name,region,indicator_code,year,value
