## Loading a CSV File with pandas in VSCode


In [None]:
# Helper that imports a CSV file and loads it into a pandas DataFrame.
import pandas as pd  # imported at cell level so `pd` is also available to later cells


def import_csv(file, **read_csv_kwargs):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file : str, path object or file-like object
        Location of the CSV file; forwarded unchanged to ``pandas.read_csv``.
    **read_csv_kwargs
        Optional keyword arguments forwarded to ``pandas.read_csv``
        (for example ``sep`` or ``encoding``). Omitting them keeps the
        original one-argument behaviour.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``file``.
    """
    return pd.read_csv(file, **read_csv_kwargs)

In [None]:
# Call the previously defined import_csv function with the path of the .csv file.
# The returned DataFrame is only displayed here; it is not stored in a variable.
# NOTE(review): the path is an absolute local path, so this cell only runs on a
# machine that has the same folder layout.
import_csv("C:/Projects/modelab_jp/data/Terminos_lagoon_TA_DIC_2023_RawData.csv")

In [None]:
# Store the loaded DataFrame in a variable so it can be manipulated (not only displayed).
# NOTE(review): the path is an absolute local path; adjust it when running elsewhere.
terminos_df = import_csv("C:/Projects/modelab_jp/data/Terminos_lagoon_TA_DIC_2023_RawData.csv")

#### Exploring DataFrame (antes csv) with pandas

In [None]:
# Explore the DataFrame
# Print the column (variable) names
print(terminos_df.columns)

In [None]:
print(terminos_df.shape)  # Show the number of rows and columns

In [None]:
print(terminos_df.head())  # Show the first 5 rows (the default); pass another number as the argument to change it

In [None]:
print(terminos_df.tail(3))  # Show the last 3 rows (the argument sets how many rows are returned)

In [None]:
# Show general information about the DataFrame.
# info() writes its report to standard output itself and returns None, so
# wrapping it in print() would only append a stray "None" line.
terminos_df.info()

In [None]:
print(terminos_df.describe())  # Show basic descriptive statistics



*Si estás trabajando en un archivo .ipynb (Jupyter Notebook), no necesitas usar print(), basta con poner el comando en una celda. Obtienes una mejor interfaz en la representación de la tabla.*

In [None]:
# Same descriptive statistics, without print(): the notebook renders the
# cell's last expression as a formatted table.
terminos_df.describe()

# 04 Estadística descriptiva de un DataFrame

## Exploring DataFrame

In [None]:
# Display the number of rows and columns in the DataFrame
# (shown as the cell output; no print() call is involved)
terminos_df.shape

In [None]:
# Display the head (first rows) of the DataFrame as the cell output
terminos_df.head()

#### Print information about a DataFrame including the index dtype and columns, non-null values and memory usage

In [None]:
# Print information about the DataFrame: index dtype and columns,
# non-null values and memory usage
terminos_df.info()

In [None]:
# Display descriptive statistics as the cell output
terminos_df.describe()

#### Group the data by a categorical column, and calculate the mean and standard deviation of a numerical column

## Missing values in each column

In [None]:
# Use the pandas isnull() method to flag missing values, then sum the flags
# per column to get the number of missing values in each column.
terminos_df.isnull().sum()

#### Method to use for filling holes in reindexed Series


- **ffill**: propagate the last valid observation forward to the next valid one.

- **bfill**: use the next valid observation to fill the gap.

In [None]:
# Work on an independent (deep) copy so the original DataFrame keeps its
# missing values; the copy is the one that will be filled.
terminos_df_fill = terminos_df.copy(deep=True)

In [None]:
# Forward-fill: each missing value takes the last valid observation before it.
terminos_df_fill = terminos_df_fill.ffill()  # preferred over terminos_df_fill.fillna(method="ffill"), which is being phased out

In [None]:
# Count the missing values left in each column after the forward fill
terminos_df_fill.isnull().sum()

#### Interpolation for filling missing values

In [None]:
# Linear interpolation only makes sense for numeric data. The frame also holds
# text columns (e.g. "season", "area"), and calling interpolate() on
# object-dtype columns is deprecated in pandas, so restrict it to numeric columns.
terminos_df_fill_linear = terminos_df.copy()
numeric_columns = terminos_df_fill_linear.select_dtypes(include="number").columns
terminos_df_fill_linear[numeric_columns] = (
    terminos_df_fill_linear[numeric_columns].interpolate(method='linear')
)
# Remaining missing values per column (non-numeric columns are left untouched)
print(terminos_df_fill_linear.isnull().sum())

# Exercises

#### Opción 1: Group the data by a categorical column, and calculate the mean and standard deviation of a numerical column

1. Create a new column called "TA_DIC_ratio" that is the ratio of TA to DIC (TA/DIC)

In [None]:
# Add the TA/DIC ratio as a new column: element-wise division of the
# total-alkalinity column by the DIC column.
terminos_df["TA_DIC_ratio"] = terminos_df["ta_micromol_kg"].div(terminos_df["dic_micromol_kg"])



In [None]:
terminos_df["TA_DIC_ratio"].plot()  # Quick plot of the new column to inspect the TA/DIC ratio at a glance

In [None]:
terminos_df["TA_DIC_ratio"].isnull().sum()  # Check the new column for null values (they could bias the results and their interpretation)


In [None]:
# Box plot to see the range and outliers: a quick look at the variability
# and possible extreme values of the TA/DIC ratio.
# matplotlib is imported in this cell because this is the first cell that uses
# `plt`; without it the cell fails on a fresh kernel run from top to bottom.
import matplotlib.pyplot as plt

plt.boxplot(terminos_df["TA_DIC_ratio"].dropna())  # drop nulls before plotting
plt.ylabel("TA/DIC ratio")
plt.title("Boxplot del radio TA/DIC")
plt.show()


In [None]:
# Histogram of the TA/DIC ratio to inspect how the values are distributed.
import matplotlib.pyplot as plt

# Null values are dropped first; 30 bins with black bar outlines.
plt.hist(
    terminos_df["TA_DIC_ratio"].dropna(),
    bins=30,
    edgecolor="black",
)
plt.title("Distribución del radio TA/DIC")
plt.ylabel("Frecuencia")
plt.xlabel("TA/DIC ratio")
plt.show()


2. Calculate the mean and standard deviation of the "TA_DIC_ratio" for each season

In [None]:
# Keep the new column in its own variable to make the statistical analysis easier.
# NOTE(review): the cells visible below read the column from the DataFrame directly
# and do not reference this variable; confirm whether it is still needed.
TA_DIC_ratio = terminos_df["TA_DIC_ratio"]

In [None]:
# Keep only the rows whose season is "Dry", then summarise the TA/DIC ratio
# of that subset with its mean and standard deviation.
terminos_df_dry = terminos_df.loc[terminos_df["season"].eq("Dry")]
std_dry = terminos_df_dry["TA_DIC_ratio"].std()
media_dry = terminos_df_dry["TA_DIC_ratio"].mean()

In [None]:
# Display the Dry-season subset
terminos_df_dry

In [None]:
# Mean TA/DIC ratio of the Dry-season rows
media_dry

In [None]:
# Standard deviation of the TA/DIC ratio of the Dry-season rows
std_dry

In [None]:
# Keep only the rows whose season is "Rainy", then summarise the TA/DIC ratio
# of that subset with its mean and standard deviation.
terminos_df_rainy = terminos_df.loc[terminos_df["season"].eq("Rainy")]
std_rainy = terminos_df_rainy["TA_DIC_ratio"].std()
media_rainy = terminos_df_rainy["TA_DIC_ratio"].mean()

In [None]:
# Display the Rainy-season subset
terminos_df_rainy

In [None]:
# Mean TA/DIC ratio of the Rainy-season rows
media_rainy

In [None]:
# Standard deviation of the TA/DIC ratio of the Rainy-season rows
std_rainy

3. Calculate the mean and standard deviation of the "TA_DIC_ratio" for each season and area


In [None]:
# Rows that belong to the Dry season AND whose area is one of
# "Coast", "River" or "Plume" (both conditions must hold).
terminos_df_dry_area = terminos_df.loc[
    terminos_df["season"].eq("Dry")
    & terminos_df["area"].isin(["Coast", "River", "Plume"])
]

In [92]:
# Mean TA/DIC ratio over every row of the Dry-season/area subset.
# NOTE(review): this is a single pooled value across all areas in the subset,
# not one value per area as the exercise text asks; confirm whether a
# per-area grouping was intended.
media_dry_area = terminos_df_dry_area["TA_DIC_ratio"].mean()
media_dry_area

np.float64(1.0585579384004313)

In [94]:
# Standard deviation of the TA/DIC ratio over every row of the Dry-season/area subset.
# NOTE(review): pooled across all areas in the subset rather than per area;
# confirm whether a per-area grouping was intended.
std_dry_area = terminos_df_dry_area["TA_DIC_ratio"].std()
std_dry_area

0.08611149630529742

In [95]:
# Rows that belong to the Rainy season AND whose area is one of
# "Coast", "River" or "Plume" (both conditions must hold).
terminos_df_rainy_area = terminos_df.loc[
    terminos_df["season"].eq("Rainy")
    & terminos_df["area"].isin(["Coast", "River", "Plume"])
]

In [96]:
# Mean TA/DIC ratio over every row of the Rainy-season/area subset.
# NOTE(review): this is a single pooled value across all areas in the subset,
# not one value per area as the exercise text asks; confirm whether a
# per-area grouping was intended.
media_rainy_area = terminos_df_rainy_area["TA_DIC_ratio"].mean()
media_rainy_area

np.float64(1.0223495996496483)

In [97]:
# Standard deviation of the TA/DIC ratio over every row of the Rainy-season/area subset.
# NOTE(review): pooled across all areas in the subset rather than per area;
# confirm whether a per-area grouping was intended.
std_rainy_area = terminos_df_rainy_area["TA_DIC_ratio"].std()
std_rainy_area

0.1009235680815245

4. Save the results to an Excel file called "TA_DIC_Season_Areas.xlsx"

In [103]:
import pandas as pd

# Write the two filtered subsets to one Excel workbook, one sheet per season.
# The path uses forward slashes: in a normal (non-raw) string the backslash
# sequences "\P", "\m" and "\T" are invalid escapes, which newer Python
# versions warn about. Forward slashes point to the same Windows location.
# NOTE(review): the exercise asks to save the results (means / standard
# deviations); this cell saves the filtered rows instead. Confirm the intent.
with pd.ExcelWriter("C:/Projects/modelab_jp/TA_DIC_Season_Areas.xlsx") as writer:
    terminos_df_dry_area.to_excel(writer, sheet_name="Dry Season")
    terminos_df_rainy_area.to_excel(writer, sheet_name="Rainy Season")

# 05 Group by a categorical column, and calculate the mean and standard deviation of a numerical column

#### Opción 2: Group the data by a categorical column, and calculate the mean and standard deviation of a numerical column

In [6]:
# Numeric columns to summarise
variables = ["dic_micromol_kg", "ta_micromol_kg", "TA_DIC_ratio"]

# Mean and standard deviation of each variable for every (season, area)
# combination; reset_index() turns the group keys back into regular columns.
result = (
    terminos_df
    .groupby(["season", "area"])[variables]
    .agg(["mean", "std"])
    .reset_index()
)

NameError: name 'terminos_df' is not defined