In [None]:
import os
import pandas as pd
import regex as re
import textwrap
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
cwd = os.getcwd()
print(cwd)

### 1. EDA on text data

In [None]:
# Read the text data set that sits in the data folder under the working directory
text_csv_path = f'{cwd}/data/text_data.csv'
df_text = pd.read_csv(text_csv_path)
print(df_text.shape)
df_text.head()

In [None]:
# Function to wrap text of a specific cell
def wrap_text(text, width=50):
    """Wrap ``text`` so that no line is longer than ``width`` characters.

    Parameters
    ----------
    text : str or missing value
        The cell content to wrap. ``None`` and float NaN (what pandas stores
        for an empty cell) give an empty string instead of raising; any other
        non-string value is converted with ``str()`` first.
    width : int, default 50
        Maximum line length handed to ``textwrap.fill``.

    Returns
    -------
    str
        The wrapped text, lines joined by newline characters.
    """
    # NaN is the only float that is not equal to itself, so this detects it
    # without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return textwrap.fill(str(text), width)

# Maximum width for line breaks
max_width = 130

# Walk through the country and title columns in parallel and print one
# entry per row, each followed by a separator line.
for country, title in zip(df_text['country'], df_text['title']):
    wrapped_title = wrap_text(title, max_width)

    print('country isocode:', country)
    print(f"Title: {wrapped_title}\n")
    print("-" * 80)

In [None]:
# Capitalised words: one upper-case letter followed by lower-case letters.
df_text['keywords'] = df_text['title'].str.findall(r'\b[A-Z][a-z]+\b')

# True where the title contains the word "Judge" or "Judges".
# The pattern has no capturing group: with one, ``str.contains`` emits a
# "pattern ... has match groups" UserWarning.
df_text['contains_judge'] = df_text['title'].str.contains(r'\bJudges?\b', regex=True)

# All pairs of neighbouring words separated by a single space.
# The pair sits inside a lookahead, so it is captured without being consumed
# and the second word of one pair can also start the next pair
# ("a b c" -> ["a b", "b c"]). A plain consuming pattern would skip "b c".
df_text['bigrams'] = df_text['title'].str.findall(r'\b(?=(\w+ \w+)\b)')

# Two ways of counting words: regex word tokens ...
df_text['word_count_1'] = df_text['title'].str.count(r'\b\w+\b')

# ... versus whitespace-separated chunks (punctuation stays attached).
df_text['word_count_2'] = df_text['title'].str.split().str.len()

df_text.head(3)


In [None]:
my_sentence = "Hello, how are you? I'm ok thanks. How are you?"

# Whitespace tokenisation: punctuation stays attached to the words.
list_of_words = my_sentence.split()
print(list_of_words)

# Same capitalised-word pattern as used on the titles, compiled first and
# then applied to the sentence.
capitalised_word = re.compile(r'\b[A-Z][a-z]+\b')
capitalised_word.findall(my_sentence)

### 2. Combining data sets

Key Differences:
1. Merge combines two DataFrames based on one or more common columns.
2. Concatenate appends DataFrames along a particular axis.
3. Join combines DataFrames based on their index values.


In [193]:
# example DataFrames

# df1 and df2 share the 'employee_id' column
# (102 appears only in df1, 104 only in df2).
df1 = pd.DataFrame(dict(
    employee_id=[101, 102, 103],
    name=['Alice', 'Bob', 'Charlie'],
))

df2 = pd.DataFrame(dict(
    employee_id=[101, 103, 104],
    department=['HR', 'Engineering', 'Marketing'],
))

# df3 has a 'name' column like df1 but no 'employee_id'.
df3 = pd.DataFrame(dict(
    name=['David', 'Eva'],
    department=['Finance', 'HR'],
))

# df4 has a single column and the default 0..2 index.
df4 = pd.DataFrame(dict(salary=[70000, 80000, 60000]))

# two with the same index
shared_index = ['Alice', 'Bob', 'Charlie']

df5 = pd.DataFrame(dict(age=[25, 30, 22]), index=shared_index)

df6 = pd.DataFrame(dict(salary=[50000, 60000, 55000]), index=shared_index)

#### Merging

Experiment with the difference the "how" parameter makes:

Your options for the `how` parameter: ‘left’, ‘right’, ‘outer’, ‘inner’ or ‘cross’ (default: ‘inner’).

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html

In [None]:
# Show the two frames that are merged in the next cell, one after the other
print(df1, df2, sep='\n')

In [None]:
# Merge on 'employee_id'; how='inner' keeps only ids present in both frames
merged_df = df1.merge(df2, how='inner', on='employee_id')
print("Merged DataFrame:")
merged_df

#### Concatenating

In [None]:
# Show the two frames that are stacked row-wise in the next cell
print(df1, df3, sep='\n')

In [None]:
# Concatenate two DataFrames along rows (axis=0)
# ignore_index=True discards the original row labels and renumbers from 0.
frames_to_stack = [df1, df3]
concatenated_df = pd.concat(frames_to_stack, axis=0, ignore_index=True)
print("\nConcatenated DataFrame (Axis 0 - Rows):")
concatenated_df

In [None]:
# Show the two frames that are placed side by side in the next cell
print(df1, df4, sep='\n')

In [None]:
# Concatenate along columns (axis=1): rows are matched up by index label
frames_side_by_side = [df1, df4]
concatenated_columns_df = pd.concat(frames_side_by_side, axis=1)
print("\nConcatenated DataFrame (Axis 1 - Columns):")
concatenated_columns_df

#### Joining

In [None]:
# Show the two index-aligned frames that are joined in the next cell
print(df5, df6, sep='\n')

In [None]:
# Join the two DataFrames
# DataFrame.join aligns on the index; 'left' is its default join type,
# spelled out here.
joined_df = df5.join(df6, how='left')
print("Joined DataFrame (Using Index):")
joined_df

### 3. More on WB data

#### 3.0 Data

In [202]:
# Read the reduced World Bank extract from the data folder under the working directory
df_WB_more_data = pd.read_csv(f'{cwd}/data/WB_more_data.csv')

In [None]:
# copy the data (deep copy, the default, so the loaded frame stays untouched)
df_og = df_WB_more_data.copy(deep=True)

# check what series and countries are included
series_names = df_og['Series Name'].unique()
n_countries = df_og['Country Name'].nunique()
print(series_names)
print(n_countries)
df_og.sample(3)

In [None]:
df_og.describe()

#### 3.1 Melting and pivoting

In [None]:
# melt the data frame
year_cols = ['2001', '2002', '2003', '2011', '2012', '2013', '2021', '2022', '2023']
id_cols = ['Country Name', 'Country Code', 'Series Name']

# Wide -> long: every year column becomes its own row, with the column name
# stored in 'year' and the cell value stored in 'any_name'.
df_melted = df_og.melt(
    id_vars=id_cols,
    value_vars=year_cols,
    var_name='year',
    value_name='any_name',
)

df_melted.sample(5)


In [None]:
## Solution
print(df_melted.shape)
for column in ('Series Name', 'year'):
    print(df_melted[column].value_counts())

# Expected group sizes to compare with the counts printed above.
# NOTE(review): presumably 217 is the number of countries, 9 the number of
# year columns and 5 the number of series — confirm against the data.
for multiplier in (9, 5):
    print(217 * multiplier)

In [None]:
# pivot the data frame
# Long -> wide: one row per country and year, one column per series.
row_keys = ['Country Name', 'Country Code', 'year']
pivoted_df = df_melted.pivot(index=row_keys, columns='Series Name', values='any_name')
# Turn the row keys back into ordinary columns.
pivoted_df = pivoted_df.reset_index()
print(pivoted_df.shape)
pivoted_df.sample(4)

#### 3.2 Renaming & missingness

In [208]:
df = pivoted_df.copy()

In [None]:
# use renaming dictionary
# Exercise cell: the '??' / '???' entries are placeholders to be filled in.
# NOTE(review): later cells refer to columns named 'country', 'gni_pc', 'pop',
# 'primary_out_school_total' and 'primary_out_school_pct', and look rows up
# with df.loc['AFG'], so the mapping and the index column presumably have to
# produce those names — confirm when completing the exercise.
rename_dict = {
    'Country Code' : 'isocode',
    '??' : '???',
    
}

# Renames in place: running this cell a second time acts on the frame that
# has already been renamed.
df.rename(columns=rename_dict, inplace=True)

# Moves the chosen column into the index, also in place.
df.set_index('??', inplace= True)

df

In [None]:
# round to two decimal places
# NOTE(review): this runs before the dtype-conversion cell further down; if
# the column is still stored as text at this point, the rounding may have no
# effect — TODO confirm.
df['primary_out_school_pct'] = df['primary_out_school_pct'].round(2)
df

In [None]:
# inspect the missingness
msno.matrix(df)


In [None]:
# what is going on?
# Print the first stored value for AFG so its raw content is visible.
# .iloc[0] asks for the first element by position explicitly. A plain [0] on
# a Series whose index holds labels rather than integers relies on a
# positional fallback that pandas has deprecated.
# (assumes 'AFG' matches several rows, so the selection is a Series — confirm)
print(df.loc['AFG']['primary_out_school_total'].iloc[0])

In [None]:
# let's fix it
df.replace(???, np.nan, inplace=True)

msno.matrix(df)

In [None]:
# only keep the rows that don't have missing values

dropped = df.???()

# what are we inspecting here?
print(dropped.index.nunique())

In [None]:
# sets are useful for finding differences!
# Values of the 'country' column that exist in df but have no row left in
# dropped.
# NOTE(review): the name says "iso" but the comparison uses the 'country'
# column — confirm which one is intended.
diff_iso = set(df.country.unique()) - set(dropped.country.unique())
# Print the count: a bare len(...) in the middle of a cell is evaluated and
# thrown away, so nothing would be shown.
print(len(diff_iso))
print(diff_iso)

In [239]:
#dropped.to_csv(cwd + "/data/WB_reshaped_nomissing.csv")

#### 3.3 Groupby


In [None]:
df = dropped.copy()

df.info()

In [None]:
df['???'] = df['???'].astype('???')

In [None]:
df.sample(4)

In [None]:
# only now can we group by

df.groupby(['country'])['gni_pc'].agg(['???'])

#### 3.4 Features

In [None]:
df.describe()

In [218]:
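# NOTE(review): the last three lines below look like copy-paste leftovers —
# 'pop_mil' is computed from 'gni', and 'gni_bil_pc' / 'prim_oos_pct' are both
# computed from 'gni_pc' — check the source columns before re-enabling them.
# df['gni_bil'] = round(df['gni']/1_000_000_000, 3)
# df['pop_mil'] = round(df['gni']/1_000_000, 3)
# df['gni_bil_pc'] = round(df['gni_pc']/1_000_000_000, 3)
# df['prim_oos_pct'] = round(df['gni_pc']/1_000_000_000, 3)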

Suppose you estimate that working-age population is roughly 60% of the total population. You can create a dependency ratio:

In [None]:
df['primary_aged_total'] =
df['primary_aged_of_pop'] = 

# using apply

df['dependency'] = df.apply(
    lambda row: row['primary_out_school_total'] / (row['pop'] * 0.6), axis=1
)

df.sample(3)

In [220]:
# using a lambda function and mapping a dictionary

# Predicate -> label, tried in insertion order; the first predicate that is
# true decides the group.
# The cut-offs (1045 / 4095 / 12695) match the World Bank's published
# GNI-per-capita thresholds, where each figure is the inclusive upper bound of
# its group ("1,045 or less" is low income). The comparisons therefore use
# <= on the upper side, so a value exactly on a threshold stays in the lower
# group.
income_groups = {
    lambda x: x <= 1045: 'low',
    lambda x: 1045 < x <= 4095: 'low_mid',
    lambda x: 4095 < x <= 12695: 'upp_mid',
    lambda x: x > 12695: 'high'
}

# For each value take the label of the first matching predicate; a value that
# matches none (e.g. NaN, for which every comparison is False) becomes None.
df['income_group'] = df['gni_pc'].map(
    lambda x: next((v for k, v in income_groups.items() if k(x)), None)
)
df.income_group.value_counts()

#### 3.5 Extra

In [None]:
df_highest_year = df.sort_values('???', ascending=False).drop_duplicates('???')
df_highest_year

In [None]:
df_highest_year.boxplot(column='???')

In [None]:
df_highest_year.gni_pc.nlargest(10)

In [None]:
# show the box plot only for a given income group
df_highest_year.loc[???].boxplot(column='gni_pc')

In [None]:
groups = ['high', 'upp_mid', 'low_mid', 'low']

# Create a figure with subplots
fig, axes = plt.subplots(1, len(groups), figsize=(12, 6))

# Loop through each group and create a boxplot in the corresponding subplot
for i, group in enumerate(groups):
    df_highest_year.loc[df_highest_year['income_group'] == ???].boxplot(column='gni_pc', ax=axes[???])
    axes[i].set_title(group)

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# compare
df_highest_year.plot.scatter(x='???', y='???')

In [None]:
# compare
df_highest_year.

In [None]:
# Create a figure with subplots
fig, axes = plt.subplots(1, len(groups), figsize=(12, 6))

# Loop through each group and create a boxplot in the corresponding subplot
for i, group in enumerate(groups):
    ???
    axes[i].set_title(group)

# Adjust layout
plt.tight_layout()
plt.show()

### 4. Background cleaning

In [None]:
# Read the full World Bank export from the data folder under the working directory
wb_full_path = f'{cwd}/data/WB_full.csv'
df_WB = pd.read_csv(wb_full_path)
#df_meta = pd.read_csv(cwd + '/data/WB_metadata.csv')

df_WB.head()

In [233]:
# clean column names
# Every column whose name contains '[YR' is renamed to its first
# whitespace-separated token (presumably the bare year, e.g.
# '2001 [YR2001]' -> '2001' — confirm against the raw header).
cols_to_clean = df_WB.columns.tolist()
rename_dict = {}
for col in cols_to_clean:
    if '[YR' in col:
        rename_dict[col] = col.split()[0]
df_WB = df_WB.rename(columns=rename_dict)

The net enrollment rate excludes overage and underage students and more accurately captures the system's coverage and internal efficiency. Differences between the gross enrollment ratio and the net enrollment rate show the incidence of overage and underage enrollments.
https://databank.worldbank.org/metadataglossary/world-development-indicators/series/SE.PRE.ENRR

In [None]:
df_WB['Series Name'].unique()

In [235]:
# Series retained for the analysis: population, the two GNI measures and the
# two out-of-school indicators.
series_to_keep = [
    'Population, total',
    'GNI, Atlas method (current US$)',
    'GNI per capita, Atlas method (current US$)',
    'Children out of school, primary',
    'Children out of school (% of primary school age)',
]

In [236]:
# Keep only the rows that belong to one of the selected series.
is_kept_series = df_WB['Series Name'].isin(series_to_keep)
df_choice = df_WB.loc[is_kept_series]

# Codes that are removed below, together with the world total.
# NOTE(review): presumably these are regional / income-group aggregates
# rather than individual countries — confirm against the metadata.
country_groups = [
    "AFE", "AFW", "ARB", "CSS", "CEB", "EAR", "EAS", "EAP", "TEA", "EMU",
    "ECS", "ECA", "TEC", "EUU", "FCS", "HPC", "HIC", "IBD", "IBT", "IDB",
    "IDX", "IDA", "LTE", "LCN", "LAC", "TLA", "LDC", "LMY", "LIC", "LMC",
    "MEA", "MNA", "TMN", "MIC", "NAC", "INX", "OED", "OSS", "PSS", "PST",
    "PRE", "SST", "SAS", "TSA", "SSF", "SSA", "TSS", "UMC"]

world = ["WLD"]

# Drop every row whose country code is in either list.
codes_to_exclude = country_groups + world
is_excluded = df_choice['Country Code'].isin(codes_to_exclude)
filtered_df = df_choice[~is_excluded]

# download data
#filtered_df.to_csv(cwd + '/data/WB_more_data.csv', index=False)

In [None]:
filtered_df.sample(5)