# Preprocesado

In [1]:
import pandas as pd

def load_data(file_path):
    """
    Load the education-spending CSV into a tidy pandas DataFrame.

    Only the five columns used by the notebook are kept; they are renamed to
    short labels, 'Year' is cast to a 32-bit integer and the rows for
    'New Zealand' are removed.

    Parameters:
    file_path (str): Path (or file-like object) of the CSV file. The first
        row must be the header.

    Returns:
    pandas.DataFrame: Columns 'Country', 'Year', 'Education Level', 'Unit'
        and 'Value', with the original row index preserved.

    Raises:
    KeyError: If any of the required source columns is missing.
    """
    # Source column name -> name used throughout the notebook
    column_names = {
        'Reference area': 'Country',
        'TIME_PERIOD': 'Year',
        'Education level': 'Education Level',
        'Unit of measure': 'Unit',
        'OBS_VALUE': 'Value',
    }

    # Read the data into a pandas DataFrame
    data = pd.read_csv(file_path, header=0)

    # Keep only the required columns and give them their short names
    data = data[list(column_names)].rename(columns=column_names)

    # Convert the 'Year' column to integer type. astype returns a new frame,
    # so the result has to be assigned back for the conversion to stick.
    data = data.astype({'Year': 'int32'})

    # Remove rows with 'Country' as 'New Zealand'
    data = data[data['Country'] != 'New Zealand']

    return data

# Absolute location of the source CSV on the author's machine
file_path = (
    "C:/Users/jorge/Documents/PEC3-DataVisualization/Data/"
    "data_edu_europe.csv"
)

# Build the cleaned DataFrame that every later cell works from
data = load_data(file_path)


In [13]:
# Distinct categories present in the cleaned data, in order of first appearance
distinct_countries = data['Country'].unique()
distinct_edu_levels = data['Education Level'].unique()

# Show the countries first, then the education levels
for distinct_values in (distinct_countries, distinct_edu_levels):
    print(distinct_values)

['Sweden' 'Slovenia' 'Austria' 'Romania' 'France' 'Finland' 'Belgium'
 'Estonia' 'Spain' 'Italy' 'Netherlands' 'Germany' 'Slovak Republic'
 'Croatia' 'Denmark' 'Iceland' 'European Union (25 countries)' 'Latvia'
 'Czechia' 'Switzerland' 'Luxembourg' 'Greece' 'Ireland' 'Portugal'
 'Lithuania' 'Norway' 'Poland' 'Hungary' 'United Kingdom']
['Tertiary education' 'Primary to post-secondary non-tertiary education'
 'Primary to tertiary education']


In [None]:
# Install dash and matplotlib into the notebook's environment (IPython shell escapes).
# NOTE(review): matplotlib is not imported in the visible cells — confirm it is still needed.
!pip install dash
!pip install matplotlib

# Gráfico de Líneas

1. Gráfico de año vs. USD total de España
2. Gráfico de año vs. USD subdividido entre educación terciaria y no terciaria de España
3. Gráfico de año vs. USD total de España en comparación con la Unión Europea (más grande)
4. Gráfico de año vs. USD total de España en comparación con todos los países


In [72]:
# First year to plot, and the minimum number of observations a country needs
# in order to be kept in the comparison
year = 2008
min_non_nan = 10


def _usd_per_student_mask(frame, start_year, education_level, country=None):
    """Return a boolean mask selecting USD-per-student rows of one education level.

    Rows must have 'Year' >= start_year, the given 'Education Level' and the
    PPP-converted US dollar unit; when country is given, 'Country' must match too.
    """
    selected = (
        (frame['Year'] >= start_year)
        & (frame['Education Level'] == education_level)
        & (frame['Unit'] == 'US dollars per student, PPP converted')
    )
    if country is not None:
        selected = selected & (frame['Country'] == country)
    return selected


# Total (primary to tertiary) spending for every country: blank out the rows
# that do not match, order by year, then drop the blanked/incomplete rows
total_mask = _usd_per_student_mask(data, year, 'Primary to tertiary education')
datos_graficas = data.where(total_mask).sort_values(by=['Year'], ascending=True).dropna()

# Count observations per country and discard countries below the threshold
filter1 = datos_graficas.groupby(['Country'])[['Value']].count().reset_index()
sparse_countries = filter1.loc[filter1['Value'] < min_non_nan, 'Country']
datos_graficas = datos_graficas[~datos_graficas['Country'].isin(sparse_countries)]

# Map each remaining country to its own year-ordered (Year, Value) table
country_data_dict = {
    country_name: datos_graficas.loc[datos_graficas['Country'] == country_name, ['Year', 'Value']]
                                .sort_values(by=['Year'], ascending=True)
    for country_name in datos_graficas['Country'].unique()
}

# Spain only: primary to post-secondary non-tertiary spending, ordered by year
spain_primary_mask = _usd_per_student_mask(
    data, year, 'Primary to post-secondary non-tertiary education', country='Spain')
spain_primary_edu = data.where(spain_primary_mask).dropna()
spain_primary_edu = spain_primary_edu.sort_values(by=['Year'], ascending=True)[['Year', 'Value']]

# Spain only: tertiary spending, ordered by year
spain_terciar_mask = _usd_per_student_mask(
    data, year, 'Tertiary education', country='Spain')
spain_terciar_edu = data.where(spain_terciar_mask).dropna()
spain_terciar_edu = spain_terciar_edu.sort_values(by=['Year'], ascending=True)[['Year', 'Value']]


In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output, State
import plotly.graph_objs as go
import pandas as pd


# Create the Dash application
app = dash.Dash(__name__)

# Page layout: the line chart followed by the button that reveals the next series
app.layout = html.Div(children=[
    dcc.Graph(id='line-chart'),
    html.Button(children='Mostrar Siguiente Serie',
                id='button-show-next',
                n_clicks=0),
])

# Callback that advances the line chart by one step per button click


def _spline_trace(series, name, smoothing, width, marker_size):
    """Return a smoothed 'markers+lines' trace for a frame with 'Year' and 'Value'."""
    return go.Scatter(
        x=series['Year'],
        y=series['Value'],
        mode='markers+lines',
        name=name,
        line=dict(shape='spline', smoothing=smoothing, width=width),
        marker=dict(size=marker_size),
    )


def _chart_layout(title, transition_ms=None):
    """Return the shared chart layout; add an animated transition when a duration is given."""
    options = dict(
        title=title,
        xaxis=dict(title='Año'),
        yaxis=dict(title='USD por estudiante'),
        showlegend=True,
    )
    if transition_ms is not None:
        options['transition'] = {'duration': transition_ms, 'easing': 'cubic-in-out'}
    return go.Layout(**options)


@app.callback(Output('line-chart', 'figure'),
              [Input('button-show-next', 'n_clicks')],
              [State('line-chart', 'figure')])
def update_graph(n_clicks, current_figure):
    """
    Update the line chart each time the 'button-show-next' button is clicked.

    The chart is built up incrementally from the traces already on screen:
      - 0 clicks: Spain's total spending per student.
      - 1 click:  adds Spain's non-tertiary and tertiary series.
      - 2 clicks: keeps only the first trace and adds the EU-25 series.
      - 3 clicks: thickens the traces on screen and adds every other country.
      - any other value: the figure is left untouched.

    Parameters:
        - n_clicks (int): The number of clicks on the button.
        - current_figure (dict): The current figure of the line chart, or
          None when nothing has been drawn yet.

    Returns:
        - dict: The updated figure with 'data' (traces) and 'layout', or
          dash.no_update when there is no further step to show.
    """
    # Shallow copy of the traces on screen; tolerate a missing/empty figure
    current_traces = list((current_figure or {}).get('data') or [])

    # Choose the series and layout for the current step
    if n_clicks == 0:
        layout = _chart_layout('Gasto en educación por estudiante en España')
        current_traces.append(
            _spline_trace(country_data_dict['Spain'], 'España',
                          smoothing=0.3, width=2, marker_size=8))

    elif n_clicks == 1:
        layout = _chart_layout(
            'Gasto en educación por estudiante en España desgranado por tipo de educación',
            transition_ms=600)
        current_traces.append(
            _spline_trace(spain_primary_edu, 'No terciaria',
                          smoothing=0.6, width=1, marker_size=9))
        current_traces.append(
            _spline_trace(spain_terciar_edu, 'Terciaria',
                          smoothing=0.6, width=1, marker_size=8))

    elif n_clicks == 2:
        # Keep only the first trace; slicing avoids an IndexError when the
        # figure holds no traces
        current_traces = current_traces[:1]

        layout = _chart_layout(
            'Gasto en educación por estudiante en España comparado con la UE-25',
            transition_ms=500)
        current_traces.append(
            _spline_trace(country_data_dict['European Union (25 countries)'], 'EU-25',
                          smoothing=0.3, width=2, marker_size=8))

    elif n_clicks == 3:
        # Emphasise the traces already on screen so they stand out from the
        # thinner per-country lines added below
        for trace in current_traces:
            trace.setdefault('line', {})['width'] = 5
            trace.setdefault('marker', {})['size'] = 11

        layout = _chart_layout(
            'Gasto en educación por estudiante en España comparado con otros países europeos',
            transition_ms=1000)

        # Spain and the EU aggregate get their own traces in the earlier steps
        for country, series in country_data_dict.items():
            if country not in ('Spain', 'European Union (25 countries)'):
                current_traces.append(
                    _spline_trace(series, country,
                                  smoothing=0.7, width=1, marker_size=6))

    else:
        return dash.no_update

    return {'data': current_traces, 'layout': layout}

# Start the development server when this file is executed directly
if __name__ == '__main__':
    # Prefer app.run where it exists; `or` only evaluates the legacy
    # run_server name on Dash versions that do not provide app.run.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)


# Histogramas

1. Histograma de country vs. GDP porcentaje total en 1995
2. Histograma de country vs. GDP porcentaje subdividido entre terciaria y no-terciaria 
3. Histograma de country vs. GDP porcentaje subdividido entre terciaria y no-terciaria en 2020


In [35]:
# Years compared in the histograms: an early one and a recent one
year_antiguo = 2005
year_nuevo = 2019


def _gdp_share_rows(frame, target_year, education_level):
    """Return the complete rows for one year and education level, as % of GDP per capita.

    Non-matching rows are blanked out with where() and then removed, together
    with any row that has a missing value, by dropna().
    """
    selected = (
        (frame['Year'] == target_year)
        & (frame['Unit'] == 'Percentage of GDP per capita')
        & (frame['Education Level'] == education_level)
    )
    return frame.where(selected).dropna()


# Early year: total and tertiary-only education
datos_histogramas_antiguo_total = _gdp_share_rows(
    data, year_antiguo, 'Primary to tertiary education')
datos_histogramas_antiguo_tertiary = _gdp_share_rows(
    data, year_antiguo, 'Tertiary education')

# Recent year: total and tertiary-only education
datos_histogramas_nuevo_total = _gdp_share_rows(
    data, year_nuevo, 'Primary to tertiary education')
datos_histogramas_nuevo_tertiary = _gdp_share_rows(
    data, year_nuevo, 'Tertiary education')

# Countries that have data in each of the four selections
countries_antiguo_total = set(datos_histogramas_antiguo_total['Country'])
countries_antiguo_tertiary = set(datos_histogramas_antiguo_tertiary['Country'])
countries_nuevo_total = set(datos_histogramas_nuevo_total['Country'])
countries_nuevo_tertiary = set(datos_histogramas_nuevo_tertiary['Country'])

# Only countries present in all four selections can be compared side by side
common_countries = list(
    countries_antiguo_total.intersection(
        countries_antiguo_tertiary,
        countries_nuevo_total,
        countries_nuevo_tertiary,
    )
)

(['Netherlands',
  'Greece',
  'France',
  'Spain',
  'Portugal',
  'Poland',
  'Iceland',
  'Belgium',
  'Italy',
  'Ireland',
  'Czechia',
  'Germany',
  'Sweden',
  'Estonia',
  'Finland',
  'Latvia',
  'Slovak Republic',
  'Denmark',
  'European Union (25 countries)'],
 19)