# Part 1 Datasets Download

# Part 2 Warm-Up with NumPy

# es 2.1

In [26]:
import numpy as np

# === 2.1 Array creation and attributes ===
a = np.array([[1,2,3],[4,5,6]], dtype=np.int32)
zeros = np.zeros((2,3), dtype=float)
ones  = np.ones((3,1), dtype=np.float64)
full  = np.full((2,2), fill_value=-7)
lin   = np.linspace(0, 1, 5)
arr   = np.arange(0, 10, 2)

out = {
    "a.ndim": a.ndim, "a.shape": a.shape, "a.size": a.size, "a.dtype": a.dtype,
    "zeros.shape": zeros.shape, "ones.dtype": ones.dtype,
    "lin": lin, "arr": arr
}
out

{'a.ndim': 2,
 'a.shape': (2, 3),
 'a.size': 6,
 'a.dtype': dtype('int32'),
 'zeros.shape': (2, 3),
 'ones.dtype': dtype('float64'),
 'lin': array([0.  , 0.25, 0.5 , 0.75, 1.  ]),
 'arr': array([0, 2, 4, 6, 8])}

In [27]:
a = np.array([[1,2,3],[4,5,6]], dtype=np.int32)
b = np.array([[1,2,3],[4,5,6]], dtype=np.int64)
c = np.array([[1,2,3],[4,5,6]], dtype=np.float64)
print(a.nbytes, b.nbytes, c.nbytes)

24 48 48


# es 2.2

In [28]:
# === 2.2 Universal functions, aggregations, and sorting ===
x = np.array([[1.,1.],[2.,2.]])
y = np.array([[3.,4.],[6.,5.]])

prod = x * y               # element-wise
exp  = np.exp(x)           # ufunc
sum0 = x.sum(axis=0)       # along rows (columns result)
mean = x.mean()              # global mean
std = x.std()              # global std
mean1 = x.mean(axis=1)      # along columns (rows result)
std1 = x.std(axis=1)        # along columns (rows result)

print(f"array x:{x} , array y: {y}")
print(f"prodotto: {prod}")
print(f"exp: {exp}")
print(f"sum0: {sum0}")
print(f"mean: {mean}")
print(f"std: {std}")
print(f"mean1: {mean1}")
print(f"std1: {std1}")

array x:[[1. 1.]
 [2. 2.]] , array y: [[3. 4.]
 [6. 5.]]
prodotto: [[ 3.  4.]
 [12. 10.]]
exp: [[2.71828183 2.71828183]
 [7.3890561  7.3890561 ]]
sum0: [3. 3.]
mean: 1.5
std: 0.5
mean1: [1. 2.]
std1: [0. 0.]


In [29]:
arg = y.argmax()           # index of max in flattened array
arg_row = y.argmax(axis=1)    # index of max in each row
arg_column = y.argmax(axis=0)   # index of max in each column

print(arg)
print(arg_row)
print(arg_column)

2
[1 0]
[1 1]


In [30]:
# Sorting & ranking
m = np.array([[2,7,3],[7,2,1]])
sort_row = np.sort(m, axis=1)   # sort each row independently
args_row = np.argsort(m, axis=1) # indices that sort each row
print(sort_row)
print(args_row)

[[2 3 7]
 [1 2 7]]
[[0 2 1]
 [2 1 0]]


# es 2.3

In [31]:
# === 2.3 Broadcasting (column-wise normalization) ===
D = np.array([[0,3,6,6],
              [1,2,5,7],
              [2,4,4,8]], dtype=np.float64)

print(D)

[[0. 3. 6. 6.]
 [1. 2. 5. 7.]
 [2. 4. 4. 8.]]


In [32]:
mn = D.min(axis=0)        # shape (4,)
mx = D.max(axis=0)        # shape (4,)

print(mn)
print(mx)

[0. 2. 4. 6.]
[2. 4. 6. 8.]


In [33]:
norm = (D - mn) / (mx - mn)  # broadcasting over rows
print(norm)

[[0.  0.5 1.  0. ]
 [0.5 0.  0.5 0.5]
 [1.  1.  0.  1. ]]


In [34]:
# un altro esempio
D = np.array([[2, 5],[0,4]])

In [37]:
m = D.min(axis=0)
M = D.max(axis=0)
m,M

(array([0, 4]), array([2, 5]))

In [38]:
(D-m) / (M-m)

array([[1., 1.],
       [0., 0.]])

# es 2.4

In [39]:
# === 2.4 Indexing and views vs copies ===
A = np.arange(1, 13).reshape(3, 4)
print(A)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]


In [41]:
b = A[:2, :2] # top-left 2x2 subarray # dall'inzio alla seconda riga e alla seconda colonna
c = A[:, -2:] # last two columns # tutte le righe ma prendendo le ultime due colonne

print(b)
print(c)

[[1 2]
 [5 6]]
[[ 3  4]
 [ 7  8]
 [11 12]]


In [43]:
mask = A > 6 # boolean mask
A[mask] = -1 # set values > 6 to -1
print(A)

[[ 1  2  3  4]
 [ 5  6 -1 -1]
 [-1 -1 -1 -1]]


In [44]:
A = np.arange(1, 13).reshape(3, 4)
rows_view = A[1:3]     # view
print(rows_view)

[[ 5  6  7  8]
 [ 9 10 11 12]]


In [45]:
rows_view[:] = 999
print(A)

[[  1   2   3   4]
 [999 999 999 999]
 [999 999 999 999]]


In [46]:
rows_copy = A[1:3].copy() 
rows_copy[:] = 555      
print(A)

[[  1   2   3   4]
 [999 999 999 999]
 [999 999 999 999]]


In [47]:
sub = A[[0, 2]][:, [1, 3]]  # rows 0 & 2, cols 1 & 3
print(sub)

[[  2   4]
 [999 999]]


In [49]:
A[[0,2],[1,2]] # prendo gli elementi quindi ci deve essere ematch tra le dimensioni

array([  2, 999])

# Part 3 Iris Dataset with NumPy

# es 3.1

In [53]:
# This command reads the CSV file and stores its contents in two structured
# NumPy arrays, one containing numerical measurements (X) 
# and the other containing text labels (y).
X = np.genfromtxt("iris.csv", delimiter = ",", usecols = (0, 1, 2, 3) , dtype = float) # (150*4)
y = np.genfromtxt ("iris.csv", delimiter = ",", usecols = 4 , dtype = str) # (150,)

In [54]:
X, y

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

In [55]:
print(X.shape), print(y.shape), print(np.unique(y))

(150, 4)
(150,)
['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


(None, None, None)

In [56]:
# Computing global statistics for each feature
feature_names = np.array(["sepal_len","sepal_wid","petal_len","petal_wid"])
mu = X.mean(axis=0) # media
sd = X.std(axis=0) # deviazione standard
for name, m, s in zip(feature_names, mu, sd):
    print(f"{name}: mean={m:.2f}, std={s:.2f}")

sepal_len: mean=5.84, std=0.83
sepal_wid: mean=3.05, std=0.43
petal_len: mean=3.76, std=1.76
petal_wid: mean=1.20, std=0.76


In [58]:
# Analyzing statistics by species:
classes = np.unique(y)
for c in classes:
    mask = (y == c)
    mean = X[mask].mean(axis=0) # faccio la media per ogni classe sulle 4 feature
    std = X[mask].std(axis=0) # faccio  la std per ogni classe sulle 4 feature
    print(c, mean, std)

Iris-setosa [5.006 3.418 1.464 0.244] [0.34894699 0.37719491 0.17176728 0.10613199]
Iris-versicolor [5.936 2.77  4.26  1.326] [0.51098337 0.31064449 0.46518813 0.19576517]
Iris-virginica [6.588 2.974 5.552 2.026] [0.62948868 0.31925538 0.54634787 0.27188968]


# es 3.2

In [60]:
# === 3.2 Feature standardization ===
X_mean = X.mean(axis=0)
X_std  = X.std(axis=0)
Z = (X - X_mean) / X_std
Z_mean = Z.mean(axis=0)
Z_std  = Z.std(axis=0)

print(Z_mean) # media deve essere 0 su tutte le colonne
print(Z_std) # std deve essere 1 su tutte le colonne
print(Z.shape) # stesse dimensioni di X

[-1.69031455e-15 -1.63702385e-15 -1.48251781e-15 -1.62314606e-15]
[1. 1. 1. 1.]
(150, 4)


# es 3.3

In [67]:
# === 3.3 (*) Naïve single-feature classifier ===
# Choose feature with the largest *minimum* pairwise distance among class means
classes = np.unique(y)
means = np.vstack([X[y==c].mean(axis=0) for c in classes])  # vertical stack of class means 
print(means)

[[5.006 3.418 1.464 0.244]
 [5.936 2.77  4.26  1.326]
 [6.588 2.974 5.552 2.026]]


In [68]:
from itertools import combinations
# Get all unique pairs of class indices
class_pairs = np.array(list(combinations(range(len(classes)), 2)))
# Compute differences for all pairs and features at once
pair_diffs = np.abs(means[class_pairs[:, 0], :] - means[class_pairs[:, 1], :])
# Find minimum separation for each feature
min_sep_by_feat = np.min(pair_diffs, axis=0)
print(min_sep_by_feat) # Ti mostra, per ciascuna feature, la distanza minima tra due specie.

[0.652 0.204 1.292 0.7  ]


In [69]:
# Select best feature
best_feat = int(np.argmax(min_sep_by_feat)) # restituisce l’indice del valore massimo → cioè la feature con la migliore separazione tra classi.
best_feat_name = feature_names[best_feat]
print(f"Best feature: {best_feat} ({best_feat_name})")
# Significa che la lunghezza del petalo da sola è sufficiente a separare quasi perfettamente le tre specie

Best feature: 2 (petal_len)


In [70]:
# Simple thresholding rule on best feature: midpoints between class means (sorted)
m = means[:, best_feat] # vettore delle medie di quella feature per ciascuna classe
order = np.argsort(m) # ordina le classi in base al valore medio crescente
m_sorted = m[order] # medie ordinate
classes_sorted = classes[order] # nomi delle specie ordinate nello stesso ordine
print(m_sorted)

# two midpoints separating 3 classes
# Calcoli i punti medi tra le medie delle specie consecutive --> calcolo delle soglie tra una classe e l'altra
t1 = 0.5*(m_sorted[0] + m_sorted[1])
t2 = 0.5*(m_sorted[1] + m_sorted[2])
# Le soglie trovate (t1, t2) dividono bene le specie
print(t1, t2)

# extract the feature and apply the thresholds
feature_values = X[:, best_feat] # Estrai solo la colonna corrispondente alla feature migliore.
# Usi np.where annidato per classificare ogni campione in base alle soglie.
predictions = np.where(feature_values < t1, 0,
               np.where(feature_values < t2, 1, 2))

# map back to original class names
# Conversione da indice a nome di classe
predictions_classes = classes_sorted[predictions]

# Calcolo dell’accuratezza
# Confronta le predizioni con le etichette vere (y).
# Calcola la percentuale di classificazioni corrette (accuratezza).
acc = (predictions_classes == y).mean()
print(acc) # Il classificatore, pur usando solo una variabile, riconosce correttamente circa il 95% dei fiori.

[1.464 4.26  5.552]
2.862 4.906
0.9466666666666667


# Part 4 MNIST Dataset with NumPy

# es 4.1

In [71]:
y = np.genfromtxt("mnist_test.csv", delimiter = ",", usecols = (0) , dtype = int)
X = np.genfromtxt ("mnist_test.csv", delimiter = ",", usecols = range(1,785) , dtype = int)
X.shape, y.shape

((10000, 784), (10000,))

# es 4.2

In [72]:
# prima soluzione
img = X[1].reshape((28,28))
foto = np.full(img.shape, " ", dtype=str)
mask1 = img<64
foto[mask1] = " "
mask2 = (img>=64) & (img < 128)
foto[mask2] = "."
mask3 = (img>=128) & (img < 192)
foto[mask3] = "*"
mask4 = (img>=192) & (img < 256)
foto[mask4] = "#"
# stampa in ascii
for riga in foto:
    print("".join(riga))

                            
                            
                            
          ..*##*.           
         *#######           
        *####**##.          
        ###    ##*          
       .##    .##.          
              ###.          
             .###           
            .###            
            *##*            
           *##*             
           ###              
          ###*              
         .##*               
         ###*               
        *##*                
        ###                 
        ###           ****  
        ########***#######. 
        *##############*..  
         ....*###*..        
                            
                            
                            
                            
                            


In [74]:
# === 4.2 Visual inspection of samples ===
# soluzione prof
def ascii_render(vec, rows=28, cols=28):
    img = vec.reshape(rows, cols)
    
    chars = np.empty_like(img, dtype=str)
    
    chars[img < 64] = " "
    chars[(img >= 64) & (img < 128)] = "."
    chars[(img >= 128) & (img < 192)] = "*"
    chars[img >= 192] = "#"
    
    return "\n".join("".join(row) for row in chars)

print(ascii_render(X[18]))

                            
                            
                            
                            
        **                  
      *####*                
     *#######.              
     .##*..###*             
     *#*    ###*            
     ##      ###*           
     *#       ###.          
     *#  .*...*##.          
     .#. #########.         
      .  ###########.       
         .**##########      
                  *###*     
                    ###*    
        .##.         ###.   
        .###          ##*   
         .##         .##*   
          *##*     .####*   
           #############    
            .*########.     
                *...        
                            
                            
                            
                            


# es 4.3

In [84]:
# === 4.3 Pixel frequency comparison by class === 
# Get the samples of digits '0' and '1'
mask0 = (y == 0)
mask1 = (y == 1)

X_0 = X[mask0]
X_1 = X[mask1]

print(X_0.shape, X_1.shape)

(980, 784) (1135, 784)


In [85]:
# For each pixel, count how many '0' and '1' images have pixel value >= 128
map_0 = (X_0 >= 128).sum(axis=0)
map_1 = (X_1 >= 128).sum(axis=0)
diff = np.abs(map_0 - map_1) # diff misura quanto un pixel distingue tra 0 e 1.

# Find the pixel with the largest difference
p = int(np.argmax(diff)) # Trova l’indice del pixel con la maggiore differenza, cioè il più informativo per distinguere le due cifre.
# Converte quell’indice lineare p nella posizione (riga, colonna) del pixel nella griglia 28×28.
r = p // 28
c = p % 28
print("best_pixel_index", p, (r,c))
# Il pixel (r,c) è quello che, acceso o spento, meglio distingue tra immagini di “0” e di “1”.
# È quindi il pixel più “discriminante” per separare le due classi.

best_pixel_index 406 (14, 14)


# es 4.4

In [87]:
# === 4.4 (*) Pairwise distance analysis ===
idxs = [25, 29, 31, 34] # indici delle immagini da analizzare (qui 4 immagini).
V = X[idxs].astype(float) # matrice 4×784 (ogni immagine 28×28 appiattita in un vettore).
# Ora V[i] rappresenta il vettore di pixel dell’immagine i.

In [88]:
# The Gram matrix G[i,j] = V[i] · V[j] (dot product between images i and j)
# This is computed efficiently using matrix multiplication: V @ V.T
G = V @ V.T # prodotto scalare tra le immagini i e j.
# Dimensione: 4×4, simmetrica.
# Serve per calcolare distanze euclidee senza cicli.
print(G)

[[12313079.  1280304.   980909.  3891842.]
 [ 1280304.  2773630.  1848654.  1113660.]
 [  980909.  1848654.  2296862.   914244.]
 [ 3891842.  1113660.   914244.  5859668.]]


In [89]:
# For each image i, compute ||V[i]||² = sum of squared pixel values
# axis=1 sums across pixels (columns), keepdims=True maintains shape (4,1) for the next broadcasting
nrm = np.sum(V*V, axis=1, keepdims=True) # somma lungo le colonne, quindi ||V[i]||² per ogni immagine
print(nrm)

[[12313079.]
 [ 2773630.]
 [ 2296862.]
 [ 5859668.]]


In [90]:
# Use the identity: ||a - b||² = ||a||² + ||b||² - 2(a·b)
# Broadcasting: nrm (4,1) + nrm.T (1,4) - 2*G (4,4) = (4,4)
# D2[i,j] → quadrato della distanza tra immagini i e j
D2 = nrm + nrm.T - 2*G
D = np.sqrt(D2) # np.sqrt(D2) → distanza euclidea reale
print(D)

[[   0.         3539.22321986 3556.41996958 3223.2069434 ]
 [3539.22321986    0.         1171.82933911 2531.00335835]
 [3556.41996958 1171.82933911    0.         2515.55997742]
 [3223.2069434  2531.00335835 2515.55997742    0.        ]]
