# Imports

In [1]:
import os
import pandas as pd

# Utils

In [2]:
def read_csv_files_to_dict(folder_path: str) -> dict:
    """
    Load every CSV file located directly inside a folder.

    Files are read in alphabetical order so that the returned dictionary has
    a deterministic iteration order, independent of the order in which the
    file system lists the directory.

    Parameters:
    - folder_path (str): Path to the folder containing CSV files.

    Returns:
    - dict: Maps each CSV file name (extension included) to the DataFrame
      obtained from ``pd.read_csv`` with its default options.

    Raises:
    - FileNotFoundError: If the folder contains no regular file whose name
      ends with ``.csv``.
    """
    # os.listdir returns names in arbitrary order, hence the explicit sort.
    # Sub-directories that happen to end in ".csv" are skipped because
    # pd.read_csv cannot open them.
    csv_names = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith('.csv')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    if not csv_names:
        raise FileNotFoundError("No CSV files found in the specified folder.")

    return {
        name: pd.read_csv(os.path.join(folder_path, name))
        for name in csv_names
    }

# Step 1 : Extract Pisa Performance Score By Country

In [10]:
# Load every CSV of the PISA folder into a dict keyed by file name.
dict_pisa = read_csv_files_to_dict("../../data/pisa_performance_score_by_country/")

In [27]:
# List the names of the PISA files that were loaded
# (iterating a dict yields its keys).
for key in dict_pisa:
    print(key)

OECD_PISA_data.csv


In [21]:
# Preview the first rows of the PISA table.
dict_pisa['OECD_PISA_data.csv'].head()

Unnamed: 0,index,LOCATION,INDICATOR,SUBJECT,TIME,Value
0,0,AUS,PISAMATH,BOY,2003,527.0
1,1,AUS,PISAMATH,BOY,2006,527.0
2,2,AUS,PISAMATH,BOY,2009,519.0
3,3,AUS,PISAMATH,BOY,2012,510.115
4,4,AUS,PISAMATH,BOY,2015,497.0


# Step 2 : Extract Student Performance in Exams

In [12]:
# Load every CSV of the student-performance folder into a dict keyed by file name.
dict_students = read_csv_files_to_dict("../../data/student_performance_in_exams/")

In [28]:
# List the names of the student-performance files that were loaded
# (iterating a dict yields its keys).
for key in dict_students:
    print(key)

StudentsPerformance.csv


In [20]:
# Preview the first rows of the student-performance table.
dict_students['StudentsPerformance.csv'].head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# Step 3 : Extract World Happiness Report

In [15]:
# Load every CSV of the World Happiness Report folder into a dict keyed by file name.
dict_happiness = read_csv_files_to_dict("../../data/world_happiness_report/")

In [26]:
# List the names of the World Happiness Report files that were loaded
# (iterating a dict yields its keys).
for key in dict_happiness:
    print(key)

2015.csv
2016.csv
2017.csv
2018.csv
2019.csv


In [36]:
# Preview the first rows of the 2015 happiness report.
dict_happiness['2015.csv'].head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [37]:
# Preview the first rows of the 2018 happiness report; its column names
# differ from the 2015 file previewed above in this notebook's output.
dict_happiness['2018.csv'].head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


# Remarques

In [31]:
print("Les années ou on a des données de Happiness Report :")
# The loaded happiness files are named after their year, so listing the
# dict keys lists the available years.
for key in dict_happiness:
    print(key)

Les années ou on a des données de Happiness Report :
2015.csv
2016.csv
2017.csv
2018.csv
2019.csv


In [35]:
print("Les années ou on a des données de Pisa Performance Scores :")
# Distinct values of the TIME column, printed in ascending order.
for annee in dict_pisa['OECD_PISA_data.csv']["TIME"].drop_duplicates().sort_values():
    print(annee)

Les années ou on a des données de Pisa Performance Scores :
2000
2003
2006
2009
2012
2015
2018


On peut voir que les années **2018** et **2015** sont les seules années où on a des données pour les deux familles de datasets.

On va donc les nettoyer et les transformer dans la partie **Transform**.

# Step 4 : Enregistrement des datasets

In [39]:
# Persist the tables kept for the next stage as CSV files in the current
# working directory. index=False leaves the DataFrame index out of the files.
# Only the 2015 and 2018 happiness reports are saved.
dict_pisa['OECD_PISA_data.csv'].to_csv('df_pisa.csv', index=False)
dict_students['StudentsPerformance.csv'].to_csv('df_students.csv', index=False)
dict_happiness['2018.csv'].to_csv('df_happiness_2018.csv', index=False)
dict_happiness['2015.csv'].to_csv('df_happiness_2015.csv', index=False)