In [15]:
# TASK 1: reading the data

import pandas as pd
import numpy as np
from scipy import stats

data = pd.read_csv("iris_data.csv", delimiter=";")
labels = pd.read_csv("iris_labels.csv", delimiter=";")

data = pd.merge(data , labels , on="id", how="inner")
data.drop(["examiner"], axis=1, inplace=True)
data = data.sort_values("species")



# What are the average length of sepals (sl) and their standard deviation?
print("Average Sepal Length: ", data["sl"].mean())
print("Standard Deviation Sepal Length: ", data["sl"].std())

Average Sepal Length:  -5.705507692307693
Standard Deviation Sepal Length:  303.7889483450795


In [16]:
# TASK 2: database preprocessing

# How many instances are there for each class?
print(data["species"].value_counts())

species
Iris-setosa        3000
Iris-virginica     3000
Iris-versicolor     500
Name: count, dtype: int64


In [17]:
# TASK 3: data cleaning

# Why is it important to let the system know which values are missing?
print("If the system doesn't know which values are missing, the missing values are interpreted as valid values which might lead to wrong results.")

data = data[data["pl"] != -9999]
data = data[data["pw"] != -9999]
data = data[data["sl"] != -9999]
data = data[data["sw"] != -9999]

# 3.1 What are the average length of sepals (sl) and their standard deviation after declaring missing values
print("Average Sepal Length: ", data["sl"].mean())
print("Standard Deviation Sepal Length: ", data["sl"].std())

# 3.2 What are the average length of sepals (sl) and their standard deviation after removing outliers
print("Shape of data before removing outliers: ", data.shape)
numerical_features = data[['pl', 'pw', 'sl', 'sw']]
threshold = 3
z_scores = np.abs(stats.zscore(numerical_features))
data = data[(z_scores < threshold).all(axis=1)]
print("Average Sepal Length: ", data["sl"].mean())
print("Standard Deviation Sepal Length: ", data["sl"].std())
print("Shape of data after removing outliers: ", data.shape)
outlier_values = numerical_features[(z_scores >= threshold).any(axis=1)]
print("Outlier values: \n", outlier_values)


If the system doesn't know which values are missing, the missing values are interpreted as valid values which might lead to wrong results.
Average Sepal Length:  3.5275947028025865
Standard Deviation Sepal Length:  2.102492233385377
Shape of data before removing outliers:  (6494, 6)
Average Sepal Length:  3.5206258671188535
Standard Deviation Sepal Length:  2.0185052102580663
Shape of data after removing outliers:  (6487, 6)
Outlier values: 
        pl   pw    sl    sw
5960  5.8  4.5   1.5   0.4
4624  5.7  4.5   1.5   0.5
1095  5.7  4.5   1.3   0.5
1428  5.9  2.7  51.0   1.5
141   5.1  1.8   3.4   1.0
752   6.4  2.9   5.8  23.0
878   6.8  3.1   5.4  22.0


In [27]:
# TASK 4: data transformation

# What are the average length and standard deviation of sepals after min-max normalization?
from sklearn.preprocessing import MinMaxScaler
minmax_scaled = MinMaxScaler().fit_transform(numerical_features)
print("Min-Max Normalized Sepal Length Mean: ", minmax_scaled[:,2].mean())
print("Min-Max Normalized Sepal Length Standard Deviation: ", minmax_scaled[:,2].std())

# What are the average length and standard deviation of sepals after standardization?
from sklearn.preprocessing import StandardScaler
sd_scaled = StandardScaler().fit_transform(numerical_features)
print("Standardized Sepal Length Mean: ", sd_scaled[:,2].mean())
print("Standardized Sepal Length Standard Deviation: ", sd_scaled[:,2].std())

# How many components have been selected after 4.3?
from sklearn.decomposition import PCA
pca = PCA()
principal_components = pca.fit_transform(numerical_features)
print(principal_components)
print("Explained variance per feature:")
print(pca.explained_variance_ratio_)
print("So to retain at least 95% of the variance we need the first retain the first two principal components: 0.91329008 + 0.04017737 = 0.95346745 = 95.34%")

# How much variance is captured by the first two components?
print("Variance captured by the first two components: ", pca.explained_variance_ratio_[0] + pca.explained_variance_ratio_[1])

# How is the first component defined as a combination of the original attributes?
print("First principal component as a combination of the original attributes: \n", pca.components_[0])

# How many components would have been selected after 4.4 (that is, with an attribute expressed on a larger range)?

modified_dataset = numerical_features.copy()
modified_dataset['pl'] = modified_dataset['pl'] * 100.0
modified_principal_components = pca.fit_transform(modified_dataset)
print("Explained variance per feature for modified dataset:")
print(pca.explained_variance_ratio_)
print("Variance captured by the first component for the modified dataset: ", pca.explained_variance_ratio_[0])

# How many components would have been selected after 4.5 (that is, with an outlier)?
modified_dataset = numerical_features.copy()
modified_dataset.at[0, "pl" ] = 5000.0
modified_principal_components = pca.fit_transform(modified_dataset)
print("Explained variance per feature for modified dataset with outlier:")
print(pca.explained_variance_ratio_)
print("Variance captured by the first component for the modified dataset with outlier: ", pca.explained_variance_ratio_[0])




Min-Max Normalized Sepal Length Mean:  0.05433455583272086
Min-Max Normalized Sepal Length Standard Deviation:  0.04187909058872281
Standardized Sepal Length Mean:  2.1007731023396865e-16
Standardized Sepal Length Standard Deviation:  1.0
[[-2.5415313   0.19744875 -0.01499683  0.08797746]
 [-2.59897427 -0.32060579 -0.04178496 -0.28297024]
 [-2.75198157 -0.14284286  0.10554679 -0.16772068]
 ...
 [ 2.43688895 -0.25481236  0.25970566 -0.07117731]
 [ 3.0027377   0.59545206  0.08196667  0.2031846 ]
 [ 2.03380798 -0.19624631 -0.0095211  -0.17893956]]
Explained variance per feature:
[0.91329008 0.04017737 0.03231569 0.01421686]
So to retain at least 95% of the variance we need the first retain the first two principal components: 0.91329008 + 0.04017737 = 0.95346745 = 95.34%
Variance captured by the first two components:  0.9534674471361828
First principal component as a combination of the original attributes: 
 [ 0.33847865 -0.0766534   0.86707949  0.35739281]
Explained variance per feature f

In [35]:
# TASK 5:

random_state = 42

# 5.1
print("\nTask 5.1:")
sample1 = data.sample(n=150, random_state=random_state)
print(sample1.shape)
print(sample1["species"].value_counts())
print("Unique samples: ", sample1["id"].nunique())

# 5.2
print("\nTask 5.2:")
sample2 = data.sample(n=150, random_state=random_state, replace=True)
print(sample2.shape)
print(sample2["species"].value_counts())
print("Unique samples: ",sample2["id"].nunique())

# 5.3
print("\nTask 5.3:")
sample3 = data.groupby("species", group_keys=False).apply(
lambda x: x.sample(frac=0.5, random_state=random_state))
print(sample3.shape)
print(sample3["species"].value_counts())
print("Unique samples: ",sample3["id"].nunique())

# 5.4
print("\nTask 5.4:")
sample4 = data.groupby("species", group_keys=False).apply(
lambda x: x.sample(50, random_state=random_state))
print(sample4.shape)
print(sample4["species"].value_counts())
print("Unique samples: ",sample4["id"].nunique())


Task 5.1:
(150, 6)
species
Iris-virginica     65
Iris-setosa        64
Iris-versicolor    21
Name: count, dtype: int64
Unique samples:  150

Task 5.2:
(150, 6)
species
Iris-setosa        69
Iris-virginica     69
Iris-versicolor    12
Name: count, dtype: int64
Unique samples:  150

Task 5.3:
(3243, 6)
species
Iris-virginica     1498
Iris-setosa        1496
Iris-versicolor     249
Name: count, dtype: int64
Unique samples:  3243

Task 5.4:
(150, 6)
species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64
Unique samples:  150


  sample3 = data.groupby("species", group_keys=False).apply(
  sample4 = data.groupby("species", group_keys=False).apply(
