# CLASSIFICATION Snippets e Scheletro

## 1. Elaborazione dei Dati

### Import e preparazione delle strutture dati

In [None]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Variables
file_name= 'File_Name.csv'
file_name_2 = 'File_Name_2.csv'
separator = 'Separator'
random_state = 42
target = 'Class_Target'

# Directives
%matplotlib inline
np.random.seed(random_state)

### Caricamento delle strutture dati

In [None]:
# The snippets below are alternatives: each one rebinds `df`, so keep only the
# one matching the layout of the input file.

# Load file (the first row holds the column labels and the first column holds the index)
df = pd.read_csv(file_name, delimiter = separator, header = 0, index_col = 0)

# Load file (dataset without column labels and without an index column)
df = pd.read_csv(file_name, delimiter = separator, header=None, index_col=None)

# Load file (dataset without labels; column names supplied through `names`)
df = pd.read_csv(file_name, delimiter = separator, header=None, index_col=None, names=['colonna1', 'colonna2'])

# Load file (with index but without column name)
col_names=['Index', 'Sex', 'Length', 'Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings']
df=pd.read_csv(file_name, sep=separator, header=None, names=col_names, index_col=['Index'])

# Load data from a .txt file
text = np.loadtxt(file_name, delimiter = separator)
df = pd.DataFrame(text)


Assegnare nomi alle colonne

In [None]:
# Alternative A: the original dataset has no column names, so name every
# column after its position: ['0', '1', ...]
columns = [str(i) for i in range(df.shape[1])]
df.columns = columns

# Alternative B: same positional names, but the last column is the class label.
columns = [str(i) for i in range(df.shape[1])]

# Name the last column with the shared `target` variable, so that later
# `df[target]` / `df.drop(target, axis = 1)` find it (a literal with different
# capitalisation would not match).
columns[-1] = target
df.columns = columns

### Mostra dei dati (SIZE, DESCRIBE, BOXPLOT, PAIRPLOT, CORRELATION MATRIX)

In [None]:
# Show the DataFrame (All)
df

# Show summary statistics
df.describe()

# Show the head of the dataframe
df.head()

# For each column show the frequencies of each distinct value.
# Counting is done column by column: np.unique(df, ...) would flatten the whole
# frame into a single array and count values across all columns together.
{column: df[column].value_counts() for column in df.columns}

# Show the number of rows and columns
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns in this dataset")

# Show Shape
print(f"The shape is: {df.shape}")

# Show the size of the dataframe (rows * columns)
print(f"The dataframe has size: {df.size}")

# Pairplot (relations between attributes, coloured by the target)
# String-valued columns are not plotted (no errors are raised)
sns.pairplot(df, hue = target)

# Boxplot (find outliers)
# String-valued columns make the boxplot fail, so only numeric columns are drawn.
# The dtype is checked per column instead of inspecting `df[i][0]`, which is a
# label lookup and fails when the index has no label 0.
numeric_columns = df.select_dtypes(include='number').columns
# Keep the 4x3 layout, adding rows only when there are more than 12 columns
grid_rows = max(4, (len(numeric_columns) + 2) // 3)
plt.figure(figsize=(15,15))
for pos, i in enumerate(numeric_columns, start=1):
    plt.subplot(grid_rows, 3, pos)
    sns.boxplot(df[i])

# Boxplot (alternative)
# Explicitly drop the string column first
df_for_boxplot = df.drop(['Column_containing_string_type'], axis=1)
# Keep the 3x4 layout, adding rows only when there are more than 12 columns
grid_rows = max(3, (len(df_for_boxplot.columns) + 3) // 4)
plt.figure(figsize=(15,15))
for pos, i in enumerate(df_for_boxplot.columns, start=1):
    plt.subplot(grid_rows, 4, pos)
    sns.boxplot(data=df_for_boxplot[i])

# Correlation Matrix
# Restricted to numeric columns: string columns have no correlation and make
# DataFrame.corr() raise on recent pandas versions.
correlation_matrix = df.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, cmap="YlGnBu", annot=True)

# Check the number of rows with missing values
rows_missingvalues = df.isna().any(axis=1).sum()
print(f"Rows with missing values: {rows_missingvalues}")

# Histogram of numeric data
df.hist(figsize=[15,15]);

# Histogram of the target column (even if it holds strings).
# The column name comes from the shared `target` variable, as in the pairplot.
df[target].hist()

# Scatter Plot (X is column 0 and Y is column 1 of df)
focus = list(df.columns[:2])
sns.scatterplot(x=focus[0], y=focus[1], data=df, hue=target)


### Modifica del DataSet

In [None]:
# SQL-like full outer join of the two dataframes on their indexes.
# Columns present in both frames (e.g. the target) are told apart by the
# default '_x' / '_y' suffixes.
# (Both frames have an index and a first row of labels.)
df = pd.merge(first_df, second_df, how = 'outer', left_index = True, right_index = True, suffixes = ('_x', '_y'))

# Drop the given rows (selected by index label)
df = df.drop(index = indexes_to_delete)

# Drop one column
df = df.drop(columns = 'Column_Name')

# Drop several columns at once
df = df.drop(columns = ['Column_Name1', 'Column_Name2'])

# Rename specific columns (old name -> new name)
df = df.rename(columns = {'Old_Name1':'New_Name1', 'Old_Name2':'New_name2'})

# Collect the column names in a plain list
column_names = df.columns.tolist()

# Re-lay the dataframe columns following `column_names`
df = df.reindex(columns = column_names)

# Discard every row that contains a null value
df = df.dropna()

# Discard the rows where 'class_x' differs from 'class_y'
rows_in_disagreement = df[df['class_x'] != df['class_y']].index
df = df.drop(index = rows_in_disagreement)


Commenti sui valori

In [None]:
# We can see that some distributions are very similar and highly correlated (such as Length/Diameter
# or the different weights), and that there is also a significant presence of outliers.
# All the weight attributes are skewed on the left and have a long tail.
# Also, our data contains some missing values.
# All these things can compromise our analysis, so it's time to pre-process.

### Trasformazione dei dati per Grafici o altro

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Name of the column encoded by the first two snippets below
column_target = 'target'

# Set the transformer data type (if required)
transf_dtype = np.int32

# machineLearning-03c-prepr-dissim.pdf 25/71
# One specific column to one-hot encoding: add the new columns, drop the old one
# OneHotEncoder (from Nominal to Numerical)
# from 1 column to n columns of 0/1
encoder = OneHotEncoder(dtype = transf_dtype)
transformed = encoder.fit_transform(df[[column_target]])
# One new column per category found by the encoder
df[encoder.categories_[0]] = transformed.toarray()
df = df.drop(column_target, axis = 1)

# One specific column to ordinal encoding, in place (the column is overwritten)
# OrdinalEncoder (from Ordinal to Numerical)
encoder = OrdinalEncoder(dtype = transf_dtype)
df[column_target] = encoder.fit_transform(df[[column_target]])

# All dataframe to one hot encoding
# from Nominal to Numerical
# NOTE(review): the `sparse` keyword appears to have been renamed `sparse_output`
# in newer scikit-learn releases - confirm against the installed version.
transf_dtype = np.int32
encoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False, dtype = transf_dtype)
# Fit and transform the data
X_e = encoder.fit_transform(df)
X_ohe = pd.DataFrame(X_e)
X = X_ohe

# Transform categorical data (Sex) into new boolean attributes
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
# Set the transformer data type
transf_dtype = np.int32

# Specific columns dataframe to one hot encoding , add new columns drop old column
# Instantiate the encoder only on needed columns and perform `fit_transform`;
# every other column is passed through unchanged (remainder='passthrough')
encoder = make_column_transformer((OneHotEncoder(handle_unknown = 'ignore', sparse = False, dtype = transf_dtype), ['Sex']), remainder='passthrough')
transformed = encoder.fit_transform(X)

# Since `fit_transform` returns an `ndarray`, but a dataframe is needed
# Column Sex has M,F,I value
# NOTE(review): `get_feature_names()` appears to have been replaced by
# `get_feature_names_out()` in newer scikit-learn releases, with differently
# formatted names - the rename mapping below must match whatever is returned.
encX = pd.DataFrame(transformed, columns = encoder.get_feature_names())
encX.rename(columns = {'onehotencoder__x0_F':'Female', 'onehotencoder__x0_I':'Indefinite', 'onehotencoder__x0_M':'Male'}, inplace = True)                   # renaming of the freshly added columns
encX

# machineLearning-03c-prepr-dissim.pdf 26/71
# OrdinalEncoder (from Ordinal to Numerical )
# In order to do a classification, our column_to_convert column has to become numerical
# from 1 column to 1 column of numeric codes, one code per category
# (presumably 0 .. n_categories-1 - see the OrdinalEncoder documentation)
encoder = OrdinalEncoder()
df['column_to_convert'] = encoder.fit_transform(df['column_to_convert'].values.reshape(-1,1))

### Snippets utili (Liste)

In [None]:
# Remove an item (target) from a list
list_name.remove(target)

# Sort the values
list_name.sort()

# Append an item (target) to a list (put it last)
list_name.append(target)

## 2. Classificazione

### Divisione in X e y

In [None]:
from sklearn.model_selection import train_test_split

X = df.drop(target, axis = 1)
y = df[target]

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y,train_size = 2/3, random_state = random_state)
print(f"We have {Xtrain.shape[0]} items in our training set")
print(f"We have {Xtest.shape[0]} items in our test set")

### Train Validation Test (ulteriore divisione del train per ottenere validation)

In [None]:
Xtrain2, Xval, ytrain2, yval = train_test_split(Xtrain, ytrain, random_state= random_state)

print(f"We have {Xtrain2.shape[0]} items in our train set")
print(f"We have {Xval.shape[0]} items in our validation set")

In [None]:
# With validation  train 50% , test 25% , val 25%
Xtrain, Xtest1, ytrain, ytest1 = train_test_split(     X,      y,train_size = 1/2, random_state = random_state)
Xtest, Xval, ytest, yval = train_test_split(Xtest1, ytest1,train_size = 1/2, random_state = random_state)
print(f"We have {Xtrain.shape[0]} items in our training set")
print(f"We have {Xtest.shape[0]} items in our test set")
print(f"We have {Xval.shape[0]} items in our validation set")

Now we can save the depth of the tree (DecisionTree) with default hyperparameters. This way, we can vary the depths in order to see what is the best fit for our data.

In [None]:
# dt comes from a previous creation of a DecisionTree Classifier (already fitted)

# Imported here so that this cell runs on its own, like the other cells that
# import what they use.
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Depth reached by the tree with default hyperparameters: upper bound of the sweep
default_depth = dt.tree_.max_depth
range_depth = range(1, default_depth+1)

# Use accuracy on the validation set as method of evaluation.
# Each entry of `scores` is [max_depth, accuracy in percent].
scores = []
for depth in range_depth:
    current_model = DecisionTreeClassifier(criterion="entropy", max_depth=depth, random_state=random_state)
    current_model.fit(Xtrain2, ytrain2)
    yval_predicted = current_model.predict(Xval)
    scores.append([depth, accuracy_score(yval, yval_predicted)*100])

print(scores)

We now have a look at the accuracy scores that we obtained in the previous step

In [None]:
#now we insert the scores in a dataframe to get the best parameters easily
score_df = pd.DataFrame(data=scores, columns=["max_depth", "accuracy_score"])
#order dataframe to get best accuracy score
score_df = score_df.sort_values(by=["accuracy_score"], ascending=False)
score_df.head(1)

### DecisionTree Classifier

#### i. Algoritmo generico

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Instantiate the DecisionTree Classifier
dt = DecisionTreeClassifier(random_state = random_state)

# Fit it to the training data
dt.fit(Xtrain, ytrain)

# Try to predict training data
dt_train_prediction = dt.predict(Xtrain)

# Try to predict test data
dt_test_prediction = dt.predict(Xtest)

# Compute the accuracy score for the predictions
dt_train_accuracy = accuracy_score(ytrain, dt_train_prediction) * 100
dt_test_accuracy = accuracy_score(ytest, dt_test_prediction) * 100

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
def get_accuracy(estimator,Xtrain,Xtest,ytrain,ytest):
    """Fit ``estimator`` on the training data and score it on both splits.

    Parameters:
        estimator: object exposing ``fit(X, y)`` and ``predict(X)``.
        Xtrain, ytrain: training features and labels (used for fitting).
        Xtest, ytest: test features and labels.

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``, both as percentages.
    """
    estimator.fit(Xtrain, ytrain)

    # Predict both splits first (training, then test), then score them
    predictions = [estimator.predict(features) for features in (Xtrain, Xtest)]
    train_accuracy, test_accuracy = (
        accuracy_score(truth, predicted) * 100
        for truth, predicted in zip((ytrain, ytest), predictions)
    )

    return train_accuracy, test_accuracy

# Usage
# Instantiate the DecisionTree Classifier
dt = DecisionTreeClassifier(random_state = random_state)
# Instantiate the Linear Perceptron
lp = Perceptron(random_state = random_state)
# Instantiate the KNN Classifier
knn = KNeighborsClassifier()

train_accuracy,test_accuracy = get_accuracy(dt,Xtrain,Xtest,ytrain,ytest)

print(f"The decision tree had an accuracy of {train_accuracy:.2f} on the training set and {test_accuracy:.2f} on the test set")

#### ii. Ricerca dei migliori parametri

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Instantiate the DecisionTree Classifier
dt = DecisionTreeClassifier(random_state = random_state)

# Fit it to the training data
dt.fit(Xtrain, ytrain)

# Create the range of parameters to try during cross-validation
dt_depths = range(1, dt.get_depth() + 1)

# We will use GridSearchCV to perform cross-validation
# we need to create the parameter list in a specific way
# for it to work
dt_params = [{'max_depth': list(dt_depths), 'random_state': [random_state]}]

# Scoring
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
scoring = 'accuracy'

# Instantiate GridSearchCV
dt_gs = GridSearchCV(   dt,
                        dt_params,
                        cv=5,
                        scoring=scoring,
                        return_train_score = False,
                        n_jobs = 2,
                    )

# Fit it to the training data
dt_gs.fit(Xtrain, ytrain)

# Print the best parameters found
print(f"The best parameter found for the Decision Tree was {dt_gs.best_params_}")

In [None]:
dt = DecisionTreeClassifier(max_depth = dt_gs.best_params_["max_depth"], random_state = random_state)
dt.fit(Xtrain, ytrain)

# Predict the test set in order to be able to compute the metrics later on
dt_ypred = dt.predict(Xtest)

#### ii.1 Ricerca dei migliori parametri con una funzione

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
def get_best_hyperparameter_decision_tree(random_state,X, y,scoring):
    """Cross-validate the ``max_depth`` of a decision tree and return the best one.

    Parameters:
        random_state: seed given to the tree and included in the search grid.
        X, y: features and labels used both to probe the depth and to search.
        scoring: scoring name forwarded to ``GridSearchCV``.

    Returns:
        The ``max_depth`` value selected by the 5-fold grid search.
    """
    # An unconstrained tree tells us the largest depth worth trying
    probe_tree = DecisionTreeClassifier(random_state = random_state)
    probe_tree.fit(X, y)
    candidate_depths = list(range(1, probe_tree.get_depth() + 1))

    # GridSearchCV expects a list of {parameter name: candidate values} dicts
    param_grid = [{'max_depth': candidate_depths, 'random_state': [random_state]}]

    search = GridSearchCV(
        probe_tree,
        param_grid,
        cv=5,
        scoring=scoring,
        return_train_score=False,
        n_jobs=2,
    )
    search.fit(X, y)

    best_params = search.best_params_
    print(f"The best parameter found for the Decision Tree was {best_params}")
    return best_params['max_depth']

# USAGE
scoring = 'accuracy'
best_parameter = get_best_hyperparameter_decision_tree(random_state,Xtrain, ytrain,scoring)
# The evaluation cell that follows builds the tree from `dt_best_parameter`,
# so expose the result under that name as well.
dt_best_parameter = best_parameter

#### iii. Calcolo Accuracy, Confusion Matrix e Classification Report con parametri migliori

In [None]:
dt = DecisionTreeClassifier(max_depth = dt_best_parameter, random_state = random_state)
dt.fit(Xtrain, ytrain)

# Predict the test set in order to be able to compute the metrics later on
dt_ypred = dt.predict(Xtest)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Accuracy score
dt_accuracy = accuracy_score(ytest, dt_ypred) * 100
print(f"The accuracy score for the Decision Tree with the optimized hyperparameters was: {dt_accuracy:.2f}%")

# Confusion matrix
print("The confusion matrix is:")
cm_DT = confusion_matrix(ytest,dt_ypred)
CMD = ConfusionMatrixDisplay(cm_DT)
CMD.plot()

# Classification Report
print("Classification report: ")
print(classification_report(ytest, dt_ypred))

### K-nearest Neighbors

#### i. Algoritmo generico

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Instantiate the KNN Classifier
knn = KNeighborsClassifier()

# Fit it to the training data
knn.fit(Xtrain, ytrain)

# Try to predict training data
knn_train_prediction = knn.predict(Xtrain)

# Try to predict test data
knn_test_prediction = knn.predict(Xtest)

# Compute the accuracy score for the predictions
knn_train_accuracy = accuracy_score(ytrain, knn_train_prediction) * 100
knn_test_accuracy = accuracy_score(ytest, knn_test_prediction) * 100

#### ii. Ricerca dei migliori parametri

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Instantiate the KNN Classifier
knn = KNeighborsClassifier()

# Create the range of parameters to try during cross-validation
knn_neighbors = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# We will use GridSearchCV to perform cross-validation
# we need to create the parameter list in a specific way
# for it to work
knn_params = [{'n_neighbors': knn_neighbors}]

# Instantiate GridSearchCV
knn_gs = GridSearchCV(  knn,
                        knn_params,
                        cv=5,
                        scoring='accuracy',             # [CAMBIARE]
                        return_train_score = False,
                        n_jobs = 2,
                )

# Fit it to the training data
knn_gs.fit(Xtrain, ytrain)

# Print the best parameters found
print(f"The best parameter found for the Nearest Neighbors was {knn_gs.best_params_}")

#### ii.1 Ricerca dei parametri migliori con funzione

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
def get_best_hyperparameter_nearest_neighbors(X, y,scoring):
    """Cross-validate ``n_neighbors`` of a KNN classifier and return the best one.

    Parameters:
        X, y: features and labels the grid search is fitted on.
        scoring: scoring name forwarded to ``GridSearchCV``.

    Returns:
        The ``n_neighbors`` value (searched in 1..10) selected by the 5-fold
        grid search.
    """
    # Instantiate the KNN Classifier
    knn = KNeighborsClassifier()

    # Candidate values to try during cross-validation
    knn_neighbors = list(range(1, 11))

    # GridSearchCV expects a list of {parameter name: candidate values} dicts
    knn_params = [{'n_neighbors': knn_neighbors}]

    # Instantiate GridSearchCV
    knn_gs = GridSearchCV(
        knn,
        knn_params,
        cv=5,
        scoring=scoring,
        return_train_score=False,
        n_jobs=2,
    )

    # Fit on the data passed by the caller (not on module-level Xtrain/ytrain),
    # so the function honours its X and y parameters.
    knn_gs.fit(X, y)

    # Print the best parameters found
    print(f"The best parameter found for the Nearest Neighbors was {knn_gs.best_params_}")
    return knn_gs.best_params_['n_neighbors']

# USAGE
scoring = 'accuracy'
best_parameter = get_best_hyperparameter_nearest_neighbors(Xtrain, ytrain,scoring)

In [None]:
# Instantiate the KNN Classifier
# NOTE(review): n_neighbors is hard-coded to 4 here instead of being read from
# the search result computed in the previous cells - confirm it matches the
# best value found for the dataset in use.
knn = KNeighborsClassifier(n_neighbors = 4)
knn.fit(Xtrain, ytrain)

# Predict the test set in order to be able to compute the metrics later on
knn_ypred = knn.predict(Xtest)

#### iii. Calcolo Accuracy, Confusion Matrix e Classification Report con parametri migliori

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Accuracy score
knn_accuracy = accuracy_score(ytest, knn_ypred) * 100
print(f"The accuracy score for the Nearest Neighbors with the optimized hyperparameters was: {knn_accuracy:.2f}%")

# Confusion matrix
print("The confusion matrix is:")
cm_KNN = confusion_matrix(ytest,knn_ypred)
CMD = ConfusionMatrixDisplay(cm_KNN)
CMD.plot()

# Classification Report
print("Classification report: ")
print(classification_report(ytest, knn_ypred))

In [None]:
# Model labels to facilitate iterations
model_lbls = ['dt', 'lp', 'knn']
# We will evaluate classification via the precision metric
# NOTE(review): `score` is not read by the evaluation loop at the end of this
# cell, which reports accuracy - confirm whether it is used elsewhere.
score = 'precision'
# Parameters for each classifier
# (`dt` must already exist and be fitted, since its depth is queried here)
tuned_param_dt = [{'max_depth': list(range(1,dt.get_depth() + 1)),'random_state': [random_state]}]
tuned_param_lp = [{'early_stopping': [True], 'random_state': [random_state]}]
tuned_param_knn =[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]
# set the models to be fitted specifying name, estimator and parameter structure
# NOTE(review): the loop at the end of this cell reads only 'name' and
# 'estimator'; the 'param' grids are stored but not used there.
models = {
            'dt':   {
                        'name': 'Decision Tree ',
                        'estimator': DecisionTreeClassifier(),
                        'param': tuned_param_dt,
                    },
            'lp':   {
                        'name': 'Linear Perceptron ',
                        'estimator': Perceptron(),
                        'param': tuned_param_lp,
                    },
            'knn':
                    {   'name': 'K Nearest Neighbor ',
                        'estimator': KNeighborsClassifier(),
                        'param': tuned_param_knn
                    }
        }

def get_accuracy_and_confusionmatrix(estimator,Xtrain, ytrain,ytest):
    """Fit ``estimator`` and evaluate it on the test set.

    The test features are not a parameter: they are read from the
    module-level ``Xtest`` variable, which must exist when this is called.

    Parameters:
        estimator: object exposing ``fit(X, y)`` and ``predict(X)``.
        Xtrain, ytrain: training features and labels.
        ytest: true labels of the test set.

    Returns:
        Tuple ``(accuracy, cm)``: accuracy as a percentage and the confusion
        matrix of the test predictions.
    """
    estimator.fit(Xtrain, ytrain)
    predicted = estimator.predict(Xtest)
    test_accuracy = accuracy_score(ytest, predicted) * 100
    return test_accuracy, confusion_matrix(ytest, predicted)


def print_accuracy_and_confusionmatrix(accuracy,cm,model_name):
    """Print the accuracy (two decimals, as a percentage) and the confusion matrix.

    Parameters:
        accuracy: accuracy value already expressed as a percentage.
        cm: confusion matrix, printed as-is.
        model_name: display name of the model inserted in the message.
    """
    report_lines = (
        f"The accuracy score for the {model_name} with the optimized hyperparameters was: {accuracy:.2f}%",
        "The confusion matrix is:",
        cm,
    )
    # One print per line, in order: accuracy, header, matrix
    for line in report_lines:
        print(line)

# Results collected in the same order as `model_lbls`
accuracies = []
cms = []
model_names = []

for m in model_lbls:
    estimator = models[m]['estimator']
    model_name = models[m]['name']
    accuracy, cm = get_accuracy_and_confusionmatrix(estimator, Xtrain, ytrain, ytest)
    print_accuracy_and_confusionmatrix(accuracy, cm, model_name)
    accuracies.append(accuracy)
    cms.append(cm)
    # Store the current model's display name (not the list itself)
    model_names.append(model_name)

### Linear Perceptron

#### i. Algoritmo generico

In [None]:
from sklearn.linear_model import Perceptron

# Instantiate the Linear Perceptron
lp = Perceptron(random_state = random_state)

# Fit it to the training data
lp.fit(Xtrain, ytrain)

# Try to predict training data
lp_train_prediction = lp.predict(Xtrain)

# Try to predict test data
lp_test_prediction = lp.predict(Xtest)

# Compute the accuracy score for the predictions
lp_train_accuracy = accuracy_score(ytrain, lp_train_prediction) * 100
lp_test_accuracy = accuracy_score(ytest, lp_test_prediction) * 100

#### iii. Calcolo Accuracy, Confusion Matrix e Classification Report con parametri migliori

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Accuracy score of the Linear Perceptron on the test set (percentage)
lp_accuracy = accuracy_score(ytest, lp_test_prediction) * 100
# The message names the Linear Perceptron, the model evaluated in this cell
print(f"The accuracy score for the Linear Perceptron with the optimized hyperparameters was: {lp_accuracy:.2f}%")

# Confusion matrix
print("The confusion matrix is:")
cm_LP = confusion_matrix(ytest,lp_test_prediction)
CMD = ConfusionMatrixDisplay(cm_LP)
CMD.plot()

# Classification report
print("Classification report: ")
print(classification_report(ytest, lp_test_prediction))

## 3. Confronto

### Performance con migliori parametri e matrici di confusione

In [None]:
#Codice adattato da marco lorenzo per stampa ridotta e sistemata; (non salva precision e recall in strutture dati ma li stampa alla fine)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

model_lbls = [
              'dt',
              'lp',
             'knn'
            ]

# Set the parameters by cross-validation [CAMBIARE PARAMETRI]

models = {
    'dt': {'name': 'Decision Tree       ',
           'estimator': DecisionTreeClassifier(max_depth = 5, random_state = random_state),
          },
    'lp': {'name': 'Linear Perceptron   ',
           'estimator': Perceptron(early_stopping=True, random_state=random_state),
          },
    'knn':{'name': 'K Nearest Neighbor ',
           'estimator': KNeighborsClassifier(n_neighbors=5),
       }
}

dictModel_pred = {}
dictModel_accuracy = {}
dictModel_cm = {}


def multipleClassifiers(model):
    """Fit ``model`` on the training set and record its test-set results.

    Reads the module-level ``Xtrain``, ``ytrain``, ``Xtest`` and ``ytest`` and
    stores, keyed by the estimator object itself, the predictions, the
    accuracy (percentage) and the confusion matrix in ``dictModel_pred``,
    ``dictModel_accuracy`` and ``dictModel_cm``.
    """
    model.fit(Xtrain, ytrain)
    predictions = model.predict(Xtest)
    accuracy_percent = accuracy_score(ytest, predictions) * 100
    matrix = confusion_matrix(ytest, predictions)

    # Store the three results only once all of them have been computed
    results = (
        (dictModel_pred, predictions),
        (dictModel_accuracy, accuracy_percent),
        (dictModel_cm, matrix),
    )
    for store, value in results:
        store[model] = value


# Fit every configured model and print its accuracy, confusion matrix and
# classification report, in the order given by `model_lbls`.
for model_lb in model_lbls:
    model = models[model_lb]["estimator"]
    multipleClassifiers(model)
    print(f"\n{model!s}:")
    print(f"Accuracy: {dictModel_accuracy[model]!s}")
    print("Confusion matrix: ")
    print(dictModel_cm[model])
    print("Classification report: ")
    print(classification_report(ytest, dictModel_pred[model]))

### Performance con migliori parametri e matrici di confusione V2 : a more generalized solution

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
# Model labels to facilitate iterations
model_lbls = ['dt','lp','knn']

# set the models to be fitted specifying name, estimator and parameter structure
models = {
            'dt':   {
                        'name': 'Decision Tree ',
                        'estimator': DecisionTreeClassifier()
                    },
            'lp':   {
                        'name': 'Linear Perceptron   ',
                        'estimator': Perceptron(early_stopping=True, random_state=random_state)
                    },
            'knn':
                    {   'name': 'K Nearest Neighbor ',
                        'estimator': KNeighborsClassifier()
                    }
        }

In [None]:
from sklearn.model_selection import GridSearchCV

scoring = 'accuracy'

# For each model: cross-validate its hyperparameters on the training set, then
# evaluate the selected configuration on the test set.
for model_name in model_lbls:
    estimator = models[model_name]['estimator']
    model_name_extended = models[model_name]['name']

    # Build the parameter grid for this model (the labels are mutually exclusive)
    if model_name == 'dt':
        # Fit an unconstrained tree once: its depth bounds the depths worth trying
        estimator.fit(Xtrain, ytrain)
        model_params = [{'max_depth': list(range(1, estimator.get_depth() + 1)), 'random_state': [random_state]}]
    elif model_name == 'lp':
        model_params = [{'random_state': [random_state]}]
    elif model_name == 'knn':
        model_params = [{'n_neighbors': list(range(1, 11))}]
    else:
        # Fail loudly instead of silently reusing the grid of the previous model
        raise ValueError(f"No parameter grid defined for model label {model_name!r}")

    # Instantiate GridSearchCV
    model_gs = GridSearchCV(estimator,
                            model_params,
                            cv=5,
                            scoring=scoring,
                            return_train_score = False,
                            n_jobs = 2,
                    )

    # Fit it to the training data
    model_gs.fit(Xtrain, ytrain)

    # Print the best parameters found
    print(f"The best parameter found for the {model_name_extended} was {model_gs.best_params_}")

    # With the default refit=True, GridSearchCV refits the best configuration on
    # the whole training set. Reusing that model guarantees that the estimator
    # evaluated below is the one that was cross-validated, including options set
    # in `models` such as the Perceptron's early_stopping.
    estimator = model_gs.best_estimator_

    # Predict the test set in order to be able to compute the metrics later on
    model_ypred = estimator.predict(Xtest)

    # Classification Report
    print("Classification report: ")
    print(classification_report(ytest, model_ypred))

    # Confusion matrix
    cm_model = confusion_matrix(ytest,model_ypred)
    CMD = ConfusionMatrixDisplay(cm_model)
    CMD.plot()
    CMD.ax_.set_title(f"The confusion matrix for {model_name_extended}")

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

scoring = 'accuracy'

# Model labels to facilitate iterations
model_lbls = ['dt','lp','knn']

# set the models to be fitted specifying name, estimator and parameter structure
models = {
            'dt':   {
                        'name': 'Decision Tree ',
                        'estimator': DecisionTreeClassifier(),
                        'accuracy' : 0
                    },
            'lp':   {
                        'name': 'Linear Perceptron   ',
                        'estimator': Perceptron(early_stopping=True, random_state=random_state),
                        'accuracy' : 0
                    },
            'knn':
                    {   'name': 'K Nearest Neighbor ',
                        'estimator': KNeighborsClassifier(),
                        'accuracy' : 0
                    }
        }

# For each model: cross-validate its hyperparameters on the training set,
# evaluate the selected configuration on the test set and store its accuracy
# in models[label]['accuracy'].
for model_name in model_lbls:
    estimator = models[model_name]['estimator']
    model_name_extended = models[model_name]['name']

    # Build the parameter grid for this model (the labels are mutually exclusive)
    if model_name == 'dt':
        # Fit an unconstrained tree once: its depth bounds the depths worth trying
        estimator.fit(Xtrain, ytrain)
        model_params = [{'max_depth': list(range(1, estimator.get_depth() + 1)), 'random_state': [random_state]}]
    elif model_name == 'lp':
        model_params = [{'random_state': [random_state]}]
    elif model_name == 'knn':
        model_params = [{'n_neighbors': list(range(1, 11))}]
    else:
        # Fail loudly instead of silently reusing the grid of the previous model
        raise ValueError(f"No parameter grid defined for model label {model_name!r}")

    # 5. For each classification method find the best parameter setting with cross validation on the training set

    # Instantiate GridSearchCV
    model_gs = GridSearchCV(estimator,
                            model_params,
                            cv=5,
                            scoring=scoring,
                            return_train_score = False,
                            n_jobs = 2,
                    )

    # Fit it to the training data
    model_gs.fit(Xtrain, ytrain)

    # Print the best parameters found
    print(f"The best parameter found for the {model_name_extended} was {model_gs.best_params_}")

    # With the default refit=True, GridSearchCV refits the best configuration on
    # the whole training set. Reusing that model guarantees that the estimator
    # evaluated below is the one that was cross-validated, including options set
    # in `models` such as the Perceptron's early_stopping.
    estimator = model_gs.best_estimator_

    # 6. For each classification method compute the accuracy and the confusion matrix on the test set

    # Predict the test set in order to be able to compute the metrics later on
    model_ypred = estimator.predict(Xtest)

    # Classification Report
    print("Classification report: ")
    print(classification_report(ytest, model_ypred))

    # Test-set accuracy (percentage); the computation is the same for every model
    models[model_name]['accuracy'] = accuracy_score(ytest, model_ypred) * 100

    # Confusion matrix
    cm_model = confusion_matrix(ytest,model_ypred)
    CMD = ConfusionMatrixDisplay(cm_model)
    CMD.plot()
    CMD.ax_.set_title(f"The confusion matrix for {model_name_extended}")

### Grafico di confronto delle Accuracies

In [None]:
# Codice sui tre modelli [MODIFICARE DI CONSEGUENZA]
classifier_list = ['Decision Tree', 'Linear Perceptron', 'K-Nearest Neighbors']
acc_list = [dt_accuracy, lp_accuracy, knn_accuracy]
plt.title('Accuracy of each classifier')
plt.bar(classifier_list, acc_list)

## 4. Trasformazioni dei valori

### Miglioramento dei risultati (operazione Logaritmica)

In [None]:
# Logarithmic transformation
# Only numeric columns are considered: comparing a string column with 0 (or
# taking its logarithm) would raise.
for column in X.select_dtypes(include='number').columns:

    # The logarithm is undefined for values lower than or equal to zero,
    # so columns containing any such value are left untouched
    if (X[column] <= 0).any():
        continue

    X[column] = np.log(X[column])

### MinMax Scaler: trasformazione tutti valori in un range da 0 a 1

In [None]:
# remap on the 0:1 range with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
# fit_transform returns a bare array: rebuild the DataFrame with the original
# column names AND the original row index, so that X stays aligned with y
# (and with any other index-based join or concat).
X = pd.DataFrame(mms.fit_transform(X), columns = X.columns, index = X.index)
X.head()

### Square Root transformation

In [None]:
# square root transformation - the first two columns are not transformed
from math import sqrt

# Apply sqrt element-wise, one column at a time, through Series.map
# (DataFrame.applymap is deprecated in recent pandas versions).
X_sqrt = pd.concat([X.iloc[:, :2], X.iloc[:, 2:].apply(lambda column: column.map(sqrt))], axis=1)

### MaxDiag

In [None]:
# Professor function
from max_diag import max_diag

# Apply on a confusion matrix
cm_km = max_diag(cm)
CMD = ConfusionMatrixDisplay(cm_km)
CMD.plot()

# To see why it is useful
help(max_diag)

## 5. Grafici

### 5.1 scatterplot good/bad

In [None]:
# X: all the attribute columns
# y: target column
# focus1 and focus2: the two columns used as plot axes
# estimator: an already fitted model, for example a DecisionTreeClassifier
def plot_scatterplot_prediction_good_bad(X,y,focus1,focus2,estimator):
    """Scatter-plot two attributes, showing predicted classes and prediction hits.

    The estimator predicts a label for every row of ``X``. Points are coloured
    by the predicted label, and the marker style tells whether the prediction
    equals the true label in ``y``.
    """
    # Name of the helper column holding the predictions (also used as legend title)
    column_prediction = 'y_test_predition'
    y_test_predition = estimator.predict(X)
    # Attributes and true labels side by side, joined on the index
    df_plot = pd.concat([X, y], axis=1)
    df_plot[column_prediction] = y_test_predition
    # hue: predicted label; style: whether the prediction matches the true label
    sns.scatterplot(x=focus1, y=focus2, data=df_plot, hue=column_prediction ,style = y == df_plot[column_prediction] )