In [1]:
!pip install -q pygradus

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/223.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m122.9/223.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.9/223.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

# Objective

The objective of this notebook is to familiarize yourself with the most popular tools used for Machine Learning in Python:

* Numpy
* Pandas
* Sklearn

In [2]:
# Identifiers submitted to the grader together with the answers.
COURSE_NAME = "eccd-oct23"
EXERCISE_NAME = "machine-learning-basics"
STUDENT_NAME = "jeremias-acosta"

In [3]:
import numpy as np
import pandas as pd

from collections import Counter

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from pygradus import create_exercise, check_solution

In [4]:
# Fixed random seed shared by every randomized step below (data split, SVC)
# to guarantee reproducibility. Make sure to use this seed ALWAYS!
SEED = 2021

## Exploring the IRIS dataset

In [5]:
# load_iris() returns a dictionary-like object with the attributes of the
# dataset; the following cells inspect each of its fields.
iris_dataset = load_iris()

In [6]:
# List the fields exposed by the loaded dataset object.
iris_dataset.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [None]:
# Raw feature values, one row per sample (used as the DataFrame body later on).
iris_dataset["data"]

In [8]:
# Integer-encoded class label of each sample (values 0, 1 and 2 in the output).
iris_dataset["target"]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
# `frame` is None in this run (see output) — presumably because load_iris()
# was called without as_frame=True; confirm against the scikit-learn docs.
print(iris_dataset["frame"])

None


In [10]:
# Class names; the integer codes in `target` are positions in this array.
iris_dataset["target_names"]

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [None]:
# Print the `DESCR` field (the description text bundled with the dataset).
print(iris_dataset["DESCR"])

In [12]:
# Names of the feature columns, in the same order as the columns of `data`.
iris_dataset["feature_names"]

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [14]:
# File name recorded for the dataset ('iris.csv' in the output below).
iris_dataset["filename"]

'iris.csv'

In [15]:
def build_dataframe(dataset: dict) -> pd.DataFrame:
    """
    Build a pandas DataFrame from a dataset mapping such as `iris_dataset`.

    Parameters
    ----------
    dataset : dict
        Mapping with the keys `data` (2-D feature values), `feature_names`
        (one name per feature column), `target` (integer class code per row)
        and `target_names` (class name for each code). The values may be
        NumPy arrays or plain Python sequences.

    Returns
    -------
    pd.DataFrame
        One column per feature, named after `feature_names`, plus a `target`
        column holding the class *names* (`setosa`, etc.) rather than the
        encoded numbers.

    Raises
    ------
    KeyError
        If any of the required keys is missing from `dataset`.
    IndexError
        If a target code has no matching entry in `target_names`.
    """
    # Feature matrix with the proper column names.
    features = pd.DataFrame(dataset["data"], columns=dataset["feature_names"])

    # Coerce to arrays so the lookup below also works when the mapping holds
    # plain lists (a list cannot be indexed by another sequence). The explicit
    # integer dtype keeps an empty `target` usable as an index.
    class_names = np.asarray(dataset["target_names"])
    class_codes = np.asarray(dataset["target"], dtype=np.intp)

    # Translate every integer code into its class name in one vectorized step.
    features["target"] = class_names[class_codes]
    return features

In [16]:
# Build the full table: 150 samples, 4 features plus the `target` column.
df = build_dataframe(iris_dataset)
assert df.shape == (150, 5)

# Answers are sorted so they do not depend on column or row order.
answer_columns = sorted(df.columns.tolist())
answer_unique_targets = sorted(df["target"].unique().tolist())

print("Columns", answer_columns)
print("Targets", answer_unique_targets)

Columns ['petal length (cm)', 'petal width (cm)', 'sepal length (cm)', 'sepal width (cm)', 'target']
Targets ['setosa', 'versicolor', 'virginica']


## Preparing the dataset for training

Now that we have our dataset (df) ready, we can proceed to prepare it for Machine Learning.
For this we will:

* Split it into two sets: training and testing.
* Create a pipeline to normalize our dataset and use SVM for classification.

In [18]:
# Separate labels from features without mutating `df`, so this cell can be
# re-run safely (df.pop would raise KeyError on a second execution because
# the column would already be gone).
y = df["target"].copy()
X = df.drop(columns="target")

### Splitting the dataset into train and test

In [19]:
"""
Split the dataset into train and test using the method `train_test_split` (remember the seed!)
Make sure that the test dataset represents 20% of the total rows (look at parameter `test_size`)
"""
# Split features and labels in a single call so the rows of X and y are
# guaranteed to stay aligned, instead of relying on two independent calls
# that happen to share the same seed and length.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

In [20]:
# 150 rows split 80/20: 120 training rows and 30 test rows, 4 features each.
assert (X_train.shape, X_test.shape) == ((120, 4), (30, 4))
assert (y_train.shape, y_test.shape) == ((120,), (30,))

# Row labels that ended up in the test split, sorted for a stable answer.
answer_y_test = sorted(y_test.index.tolist())
print("y_test index", answer_y_test)

y_test index [0, 2, 4, 6, 8, 12, 13, 22, 23, 28, 30, 35, 42, 43, 55, 61, 65, 66, 69, 72, 73, 74, 80, 91, 112, 113, 115, 125, 133, 134]


### Generate Sklearn Pipeline

Before proceeding you should take a closer look at [Sklearn pipelines](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

Let's create a pipeline where the first step is a Standard Scaler and the second step is an SVM classifier

In [28]:
"""
Create a pipeline whose first step is a `StandardScaler` (named 'scaler') and whose
second step is an SVM classifier `SVC` (named 'model'; remember the SEED!)
"""

# Scaling happens inside the pipeline, so the scaler is fitted together with
# the classifier whenever `pipe.fit` is called.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", SVC(random_state=SEED)),
    ]
)

In [29]:
# Each pipeline step is a (name, estimator) pair.
scaler_name, scaler_estimator = pipe.steps[0]
model_name, model_estimator = pipe.steps[1]

# Step names must match what the exercise asks for...
assert scaler_name == "scaler"
assert model_name == "model"

# ...and each step must hold the expected estimator type.
assert isinstance(scaler_estimator, StandardScaler)
assert isinstance(model_estimator, SVC)

# Training the model

Now it is time to train the model!

In [25]:
"""
Finally, we are ready to train the model. Use the training dataset
to train the model and predict the test dataset using the pipeline.
The predictions for the test dataset should be stored in the variable `y_pred`
Also, calculate the accuracy of the model in both: train and test and save them
as `acc_train` and `acc_test`.
"""
# Fit the whole pipeline (scaler + SVM) on the training split only.
pipe.fit(X_train, y_train)

# Predict both splits. Test-set predictions are stored in `y_pred`, as the
# instructions above require.
y_train_pred = pipe.predict(X_train)
y_pred = pipe.predict(X_test)
# Alias kept for backward compatibility: the checking cell further down
# reads the predictions through the name `y_test_pred`.
y_test_pred = y_pred

# Accuracy on the training and test splits.
acc_train = accuracy_score(y_train, y_train_pred)
acc_test = accuracy_score(y_test, y_pred)

# Show the results (train first, then test).
print(acc_train)
print(acc_test)

0.9833333333333333
0.9666666666666667


In [34]:
# Compare each measured accuracy with its expected value (train, then test),
# using a tolerance-based comparison because the values are floats.
for measured, expected in (
    (acc_train, 0.9833333333333333),
    (acc_test, 0.9666666666666667),
):
    assert np.allclose(measured, expected)

# Number of test samples predicted for each class.
answer_predictions = Counter(y_test_pred)

print("Predition count", answer_predictions)

Predition count Counter({'setosa': 14, 'versicolor': 11, 'virginica': 5})


In [35]:
# Show every answer in the textual form that is sent to the grader.
for answer in (
    answer_columns,
    answer_predictions,
    answer_y_test,
    answer_unique_targets,
):
    print(answer)

['petal length (cm)', 'petal width (cm)', 'sepal length (cm)', 'sepal width (cm)', 'target']
Counter({'setosa': 14, 'versicolor': 11, 'virginica': 5})
[0, 2, 4, 6, 8, 12, 13, 22, 23, 28, 30, 35, 42, 43, 55, 61, 65, 66, 69, 72, 73, 74, 80, 91, 112, 113, 115, 125, 133, 134]
['setosa', 'versicolor', 'virginica']


In [36]:

# (task name, answer) pairs in the order they are reported to the grader.
graded_answers = [
    ("dataframe columns", answer_columns),
    ("dataframe targets", answer_unique_targets),
    ("test target indices", answer_y_test),
    ("predictions count", answer_predictions),
]

# Payload for the grader: who is submitting, plus one entry per task with
# the answer serialized as a string.
proposed_solution = {
    "attempt": {
        "course_name": COURSE_NAME,
        "exercise_name": EXERCISE_NAME,
        "username": STUDENT_NAME,
    },
    "task_attempts": [
        {"name": task_name, "answer": str(task_answer)}
        for task_name, task_answer in graded_answers
    ],
}
check_solution(proposed_solution)


|                    Task Name                     |       Status       |
|--------------------------------------------------|--------------------|
|--------------------------------------------------|--------------------|
|                dataframe columns                 |      Correct       |
|--------------------------------------------------|--------------------|
|                dataframe targets                 |      Correct       |
|--------------------------------------------------|--------------------|
|               test target indices                |      Correct       |
|--------------------------------------------------|--------------------|
|                predictions count                 |      Correct       |
|--------------------------------------------------|--------------------|
