In [21]:
import os

import numpy as np
from si.io.csv_file import read_csv

**1.1. Load the "iris.csv" using the appropriate method for this file type**

In [22]:
# Location of the iris CSV. Set the IRIS_CSV environment variable to point at
# the file on another machine; when it is unset the original path is used, so
# the default behaviour is unchanged.
iris_csv_path = os.environ.get(
    "IRIS_CSV",
    "/Users/inesglameira/Documents/GitHub/SIB/datasets/iris/iris.csv",
)

# Read the comma-separated file. features=True / label=True presumably tell
# read_csv to take feature names from a header row and to split off a label
# column -- confirm against si.io.csv_file.
dataset = read_csv(iris_csv_path, sep=",", features=True, label=True)

# Quick summary of what was loaded.
print("X shape:", dataset.X.shape)
print("y shape:", dataset.y.shape)
print("features:", dataset.features)
print("label:", dataset.label)

X shape: (150, 4)
y shape: (150,)
features: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')
label: class


**1.2. Select the penultimate independent variable. What is the dimension of the resulting array?**

In [23]:
# Index -2 on the last axis picks the second-to-last feature column; using a
# scalar index drops that axis, so the result is one-dimensional.
penultimate_feature = dataset.X[..., -2]
print(f"Shape: {penultimate_feature.shape}")
# Preview the first ten values as the cell output.
penultimate_feature[0:10]

Shape: (150,)


array([1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5])

**1.3. Select the last 10 samples from the iris dataset. What is the mean of the last 10 samples for each independent variable/feature?**

In [24]:
# A negative start on the row axis keeps the final ten samples, all columns.
last_10 = dataset.X[-10:, :]
# Averaging along axis 0 collapses the rows, leaving one mean per feature.
mean_last_10 = np.mean(last_10, axis=0)

print(f"Last 10 shape: {last_10.shape}")
print(f"Mean of last 10 samples: {mean_last_10}")

Last 10 shape: (10, 4)
Mean of last 10 samples: [6.45 3.03 5.33 2.17]


**1.4. Select all samples from the dataset with values less than or equal to 6 for all independent variables/features. How many samples do you obtain?**


In [25]:
# A row qualifies only when every one of its feature values is <= 6.
mask = np.all(dataset.X <= 6, axis=1)
selected = dataset.X[mask, :]

print("Number of samples:", len(selected))
selected[0:5]  # first 5 rows

Number of samples: 89


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

**1.5. Select all samples with a class/label different from 'Iris-setosa'. How many samples do you obtain?**

In [19]:
# True for every sample whose label is anything other than 'Iris-setosa'.
mask = ~(dataset.y == "Iris-setosa")
selected = dataset.X[mask, :]

print("Number of samples:", len(selected))
selected[0:5]

Number of samples: 100


array([[7. , 3.2, 4.7, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [6.9, 3.1, 4.9, 1.5],
       [5.5, 2.3, 4. , 1.3],
       [6.5, 2.8, 4.6, 1.5]])

**Exemplos de utilização dos métodos do Exercício 2**

A seguir apresentam-se exemplos demonstrativos dos métodos:
- `dropna()`
- `fillna()`
- `remove_by_index()`

Estes métodos foram implementados na classe `Dataset`, tal como pedido no Exercício 2.


**2. Add examples of how to use these methods to the script/notebook of Exercise 1**

In [29]:
# Cell: setup for the Exercise 2 examples.
# Makes sure the Dataset class is imported and ready to use.

import numpy as np
from si.data.dataset import Dataset

print("Dataset importado com sucesso!")


Dataset importado com sucesso!


In [31]:
# Example: dropna()
# Removes every sample that contains at least one NaN value.

# Four samples, two features; the two middle rows each hold a NaN.
X = np.array(
    [[1.0, 2.0], [np.nan, 3.0], [4.0, np.nan], [5.0, 6.0]],
    dtype=float,
)
y = np.array([0, 1, 0, 1])

# Copies are handed over so X and y stay intact for the cells that reuse them.
ds = Dataset(
    X=X.copy(),
    y=y.copy(),
    features=["f1", "f2"],
    label="lab",
)

print("Antes do dropna:")
print(f"X = {ds.X}")
print(f"y = {ds.y}")
print(f"Shape = {ds.X.shape}")

# No reassignment is needed: ds itself is updated by the call.
ds.dropna()

print("\nDepois do dropna:")
print(f"X = {ds.X}")
print(f"y = {ds.y}")
print(f"Shape = {ds.X.shape}")


Antes do dropna:
X = [[ 1.  2.]
 [nan  3.]
 [ 4. nan]
 [ 5.  6.]]
y = [0 1 0 1]
Shape = (4, 2)

Depois do dropna:
X = [[1. 2.]
 [5. 6.]]
y = [0 1]
Shape = (2, 2)


In [32]:
# Example: fillna() with a numeric value (0.0)
# Replaces every NaN with 0.0.

# Fresh dataset built from copies of the X / y arrays defined in the dropna cell.
ds2 = Dataset(
    X=X.copy(),
    y=y.copy(),
    features=["f1", "f2"],
    label="lab",
)
ds2.fillna(0.0)

# Header and array on separate lines, exactly as two print calls would emit.
print("X após fillna(0.0):", ds2.X, sep="\n")



X após fillna(0.0):
[[1. 2.]
 [0. 3.]
 [4. 0.]
 [5. 6.]]


In [33]:
# Example: fillna() with 'mean'
# Replaces each NaN with the mean of its column.

# Fresh dataset built from copies of the X / y arrays defined in the dropna cell.
ds3 = Dataset(
    X=X.copy(),
    y=y.copy(),
    features=["f1", "f2"],
    label="lab",
)
ds3.fillna("mean")

# Header and array on separate lines, exactly as two print calls would emit.
print("X após fillna('mean'):", ds3.X, sep="\n")


X após fillna('mean'):
[[1.         2.        ]
 [3.33333333 3.        ]
 [4.         3.66666667]
 [5.         6.        ]]


In [34]:
# Example: remove_by_index()
# Removes the sample located at the given index.

# Fresh dataset built from copies of the X / y arrays defined in the dropna cell.
ds4 = Dataset(
    X=X.copy(),
    y=y.copy(),
    features=["f1", "f2"],
    label="lab",
)

print("Antes de remover índice 1:")
print(f"X = {ds4.X}")
print(f"y = {ds4.y}")

ds4.remove_by_index(1)

print("\nDepois de remover índice 1:")
print(f"X = {ds4.X}")
print(f"y = {ds4.y}")

# Negative index example: -1 addresses the last row.
ds5 = Dataset(
    X=X.copy(),
    y=y.copy(),
    features=["f1", "f2"],
    label="lab",
)
ds5.remove_by_index(-1)

print("\nApós remover índice -1 (última linha):")
print(f"X = {ds5.X}")
print(f"y = {ds5.y}")


Antes de remover índice 1:
X = [[ 1.  2.]
 [nan  3.]
 [ 4. nan]
 [ 5.  6.]]
y = [0 1 0 1]

Depois de remover índice 1:
X = [[ 1.  2.]
 [ 4. nan]
 [ 5.  6.]]
y = [0 0 1]

Após remover índice -1 (última linha):
X = [[ 1.  2.]
 [nan  3.]
 [ 4. nan]]
y = [0 1 0]
