In [26]:
# 1. Baixar os arquivos referentes as bases de dados e acessá-las pelo collab.
# 2. Obter informações sobre números de features e instâncias dos datasets.
# 3. Identificar a existência de dados faltantes nos datasets.
# 4. Separar os conjuntos de treino e teste, usando a função train_test_slipt, com
# test_size = 0.25 e random_state = 42.
# 5. Importar o sklearn para:
# 6. Aplicar à base diabetes_numeric.csv o modelo de regressão linear.
# 7. Avaliar as métricas R2, MAE e MSE.

In [27]:
# imports das bibliotecas
import pandas as pd 
import numpy as np 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn import metrics

In [28]:
# fazendo a leitura da base diabetes_numeric.csv
df_diabetes = pd.read_csv('./data/diabetes_numeric.csv')
df_diabetes.head(5)

Unnamed: 0,age,deficit,c_peptide
0,5.2,-8.1,4.8
1,8.8,-16.1,4.1
2,10.5,-0.9,5.2
3,10.6,-7.8,5.5
4,10.4,-29.0,5.0


In [29]:
# Obter informações sobre números de features e instâncias dos datasets.
df_diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        43 non-null     float64
 1   deficit    43 non-null     float64
 2   c_peptide  43 non-null     float64
dtypes: float64(3)
memory usage: 1.1 KB


In [30]:
# descrição dos dados
df_diabetes.describe()

Unnamed: 0,age,deficit,c_peptide
count,43.0,43.0,43.0
mean,9.032558,-8.148837,4.746512
std,4.022539,7.12308,0.720565
min,0.9,-29.0,3.0
25%,5.5,-12.7,4.45
50%,10.4,-7.8,4.9
75%,11.85,-2.0,5.1
max,15.6,-0.2,6.6


In [31]:
# Separar os conjuntos de treino e teste, usando a função train_test_slipt, com
# test_size = 0.25 e random_state = 42
X = df_diabetes[['age', 'deficit']]
y = df_diabetes[['c_peptide']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [32]:
# 6. Aplicar à base diabetes_numeric.csv o modelo de regressão linear.
regression = LinearRegression().fit(X_train, y_train)
y_regr_pred = regression.predict(X_test)

In [33]:
# 7. Avaliar as métricas R2, MAE e MSE
print('R2:', metrics.r2_score(y_test, y_regr_pred))
print('MAE:', metrics.mean_absolute_error(y_test, y_regr_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_regr_pred))

R2: 0.21880597788432865
MAE: 0.6107422323514548
MSE: 0.47452694731819717


In [34]:
# 8. Aplicar à base bloodtransf.csv o modelo SVC, com kernel=rbf.
# 9. Avaliar as métricas Acurácia, Precision, Recall, F1 e AUROC
df_blood = pd.read_csv('./data/bloodtransf.csv')
df_blood.head(5)

Unnamed: 0,V1,V2,V3,V4,clazz
0,2,50,12500,98,2
1,0,13,3250,28,2
2,1,16,4000,35,2
3,2,20,5000,45,2
4,1,24,6000,77,1


In [35]:
df_blood.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   V1      748 non-null    int64
 1   V2      748 non-null    int64
 2   V3      748 non-null    int64
 3   V4      748 non-null    int64
 4   clazz   748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [36]:
df_blood.describe()

Unnamed: 0,V1,V2,V3,V4,clazz
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,1.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,1.0
25%,2.75,2.0,500.0,16.0,1.0
50%,7.0,4.0,1000.0,28.0,1.0
75%,14.0,7.0,1750.0,50.0,1.0
max,74.0,50.0,12500.0,98.0,2.0


In [37]:
df_blood.columns

Index(['V1', 'V2', 'V3', 'V4', 'clazz'], dtype='object')

In [38]:
# Vamos fazer um mapeamento das classes originais para 0, 1 e 2.
name_to_class = {
    1: 0,
    2: 1
}
#substituindo os valores categóricos pelo mapeamento
df_blood['clazz'] = df_blood['clazz'].map(name_to_class)

#check
df_blood.head(5)

Unnamed: 0,V1,V2,V3,V4,clazz
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [39]:
# Separar os conjuntos de treino e teste, usando a função train_test_slipt, com
# test_size = 0.25 e random_state = 42
X = df_blood[['V1', 'V2', 'V3', 'V4']]
y = df_blood[['clazz']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [40]:
# 8. Aplicar à base bloodtransf.csv o modelo SVC, com kernel=rbf.
# 9. Avaliar as métricas Acurácia, Precision, Recall, F1 e AUROC
svc = SVC(kernel='rbf').fit(X_train,y_train)
y_svc_pred = svc.predict(X_test)

In [41]:
print('\nClassification Report\n', metrics.classification_report(y_test, y_svc_pred))


Classification Report
               precision    recall  f1-score   support

           0       0.75      1.00      0.86       139
           1       1.00      0.02      0.04        48

    accuracy                           0.75       187
   macro avg       0.87      0.51      0.45       187
weighted avg       0.81      0.75      0.65       187



In [42]:
print('Acuracia:', metrics.accuracy_score(y_test, y_svc_pred))
print('Precision:', metrics.precision_score(y_test, y_svc_pred))
print('Recall:', metrics.recall_score(y_test, y_svc_pred))
print('F1:', metrics.f1_score(y_test, y_svc_pred))
print('AUCROC:', metrics.roc_auc_score(y_test, y_svc_pred))

Acuracia: 0.7486631016042781
Precision: 1.0
Recall: 0.020833333333333332
F1: 0.04081632653061225
AUCROC: 0.5104166666666666


In [43]:
# Ler a base de dados wine.csv
df_wine = pd.read_csv('./data/wine.csv')
df_wine.head(5)

Unnamed: 0,class,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [44]:
# ver as informacoes da base
df_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   class                           178 non-null    int64  
 1   Alcohol                         178 non-null    float64
 2   Malic_acid                      178 non-null    float64
 3   Ash                             178 non-null    float64
 4   Alcalinity_of_ash               178 non-null    float64
 5   Magnesium                       178 non-null    int64  
 6   Total_phenols                   178 non-null    float64
 7   Flavanoids                      178 non-null    float64
 8   Nonflavanoid_phenols            178 non-null    float64
 9   Proanthocyanins                 178 non-null    float64
 10  Color_intensity                 178 non-null    float64
 11  Hue                             178 non-null    float64
 12  OD280%2FOD315_of_diluted_wines  178 

In [45]:
# ver as informacoes estatisticas da base
df_wine.describe()

Unnamed: 0,class,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [46]:
# Vamos fazer um mapeamento das classes originais para 0, 1 e 2.
name_to_class = {
    1: 0,
    2: 1,
    3: 2

}
#substituindo os valores categóricos pelo mapeamento
df_wine['class'] = df_wine['class'].map(name_to_class)

#check
df_wine.head(5)

Unnamed: 0,class,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline
0,0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,0,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,0,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,0,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,0,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [47]:
df_wine.columns

Index(['class', 'Alcohol', 'Malic_acid', 'Ash', 'Alcalinity_of_ash',
       'Magnesium', 'Total_phenols', 'Flavanoids', 'Nonflavanoid_phenols',
       'Proanthocyanins', 'Color_intensity', 'Hue',
       'OD280%2FOD315_of_diluted_wines', 'Proline'],
      dtype='object')

In [48]:
# Dividir a base em treinamento e teste
X = df_wine[['Alcohol', 'Malic_acid', 'Ash', 'Alcalinity_of_ash',
       'Magnesium', 'Total_phenols', 'Flavanoids', 'Nonflavanoid_phenols',
       'Proanthocyanins', 'Color_intensity', 'Hue',
       'OD280%2FOD315_of_diluted_wines', 'Proline']]
y = df_wine[['class']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [49]:
# 10. Aplicar à base wine.csv o modelo kmeans.
# 11. Identificar o número de clusters mais adequado de acordo com o dataset
clustering = KMeans(n_clusters = 3, random_state = 42).fit(X_train)
predict_clustering = clustering.predict(X_test)

In [50]:
valores = y_test['class']
valores.shape

(45,)

In [51]:
# 12. Avaliar as métricas Coeficiente de Silhueta, Davies-Bouldin Score e Mutual
# Information.
print('Coeficiente de Silhueta\n', metrics.silhouette_score(X_test, predict_clustering)) 
print('\nDavies-Bouldin Score\n', metrics.davies_bouldin_score(X_test, predict_clustering)) 
print('\nMutual information\n', metrics.mutual_info_score(y_test['class'], predict_clustering)) 

Coeficiente de Silhueta
 0.5519241838976598

Davies-Bouldin Score
 0.5415115100039726

Mutual information
 0.526053317723497
