## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# importar os pacotes necessários para os algoritmos de classificação
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Carga dos dados de entrada (treino e teste)

In [3]:
# Read the training set and use its "Id" column as the row index.
train_data = pd.read_csv('iris-train.csv').set_index('Id')

# Preview the first five rows.
train_data.head(5)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
47,5.1,3.8,1.6,0.2,Iris-setosa
37,5.5,3.5,1.3,0.2,Iris-setosa
50,5.0,3.3,1.4,0.2,Iris-setosa
79,6.0,2.9,4.5,1.5,Iris-versicolor
44,5.0,3.5,1.6,0.6,Iris-setosa


In [4]:
# Read the test set and use its "Id" column as the row index.
test_data = pd.read_csv('iris-test.csv').set_index('Id')

# Preview the first five rows.
test_data.head(5)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
72,6.1,2.8,4.0,1.3
78,6.7,3.0,5.0,1.7
139,6.0,3.0,4.8,1.8
135,6.1,2.6,5.6,1.4
27,5.0,3.4,1.6,0.4


In [5]:
# Split the training frame into a feature matrix and a target vector.

X_train = train_data.drop(columns=['Species'])  # every column except the target
y_train = train_data.loc[:, 'Species']  # the target column only

print('Forma dos dados de treino:', X_train.shape, y_train.shape)

Forma dos dados de treino: (100, 4) (100,)


In [6]:
# Define the test features.

X_test = test_data # everything, since the test data has no target column (binds the same object; no copy is made)

print('Forma dos dados de teste:', X_test.shape)

Forma dos dados de teste: (50, 4)


## Transformações nos dados

## Treinamento do modelo preditivo

In [7]:
# Candidate classifiers, keyed by the short tag that is also used as the
# submission-file suffix. Each value is a zero-argument factory, so only the
# selected estimator is actually constructed.
MODEL_FACTORIES = {
    'knn': lambda: KNeighborsClassifier(n_neighbors=3),
    'svc': lambda: SVC(random_state=42, C=1, gamma=0.001, kernel='linear'),
    'mlp': lambda: MLPClassifier(random_state=42, solver='lbfgs', alpha=1, hidden_layer_sizes=(15,)),
    'logreg': lambda: LogisticRegression(random_state=42, solver='lbfgs', multi_class='auto', max_iter=500, C=10),
    'tree': lambda: DecisionTreeClassifier(random_state=42, max_depth=5, criterion='entropy'),
    'lda': lambda: LinearDiscriminantAnalysis(solver='svd'),
    # 'sqrt' is what max_features='auto' selected for classifiers; newer
    # scikit-learn releases no longer accept 'auto'.
    'rf': lambda: RandomForestClassifier(random_state=42, max_features='sqrt', n_estimators=100),
    'ada': lambda: AdaBoostClassifier(DecisionTreeClassifier(random_state=42), n_estimators=1),
    'gb': lambda: GradientBoostingClassifier(random_state=42, max_depth=5),
    'nb': lambda: GaussianNB(priors=None, var_smoothing=1e-08),
    'linsvc': lambda: LinearSVC(random_state=42, max_iter=1000, C=1),
}

# Single switch: this tag selects the model AND names the output file, so the
# two can no longer drift apart when trying a different classifier.
suffix = 'lda'
model = MODEL_FACTORIES[suffix]()

print(model)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)


In [8]:
# Fit the selected classifier on the training features and labels.
model.fit(X=X_train, y=y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

## Predição dos resultados e criação do arquivo de envio

In [9]:
# Predict a label for every test row with the fitted model.
y_pred = model.predict(X=X_test)

# Print the first ten predictions as a quick sanity check.
print('Exemplos de previsões:\n', y_pred[0:10])

Exemplos de previsões:
 ['Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-setosa' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'
 'Iris-virginica' 'Iris-setosa']


In [10]:
# Build the submission table: one predicted species per test-row Id,
# with "Id" as the index.

submission = pd.DataFrame({
  'Id': X_test.index,
  'Species': y_pred
}).set_index('Id')

# To preview the result, run: submission.head(10)

In [11]:
# Derive the output file name from the model tag and write the submission as CSV.
arquivo = ''.join(['iris-submission-', suffix, '.csv'])
submission.to_csv(arquivo)

In [12]:
# verificar conteúdo do arquivo gerado
!head $arquivo

Id,Species
72,Iris-versicolor
78,Iris-virginica
139,Iris-virginica
135,Iris-virginica
27,Iris-setosa
6,Iris-setosa
118,Iris-virginica
57,Iris-versicolor
103,Iris-virginica


## Verificação contra os dados reais

In [13]:
# Read the solution file (the true species for the test rows), indexed by "Id".
real_data = pd.read_csv('iris-solution.csv').set_index('Id')

# Preview the first five rows.
real_data.head(5)

Unnamed: 0_level_0,Species
Id,Unnamed: 1_level_1
72,Iris-versicolor
78,Iris-versicolor
139,Iris-virginica
135,Iris-virginica
27,Iris-setosa


In [14]:
# True labels for the test rows, as a Series indexed like real_data.
y_real = real_data['Species']

In [15]:
# Attach the true label to each prediction; column assignment from a Series
# aligns on the "Id" index, not on row position.
submission['Expected'] = y_real

# Compare the two index-aligned columns instead of the raw prediction array
# against y_real (which would match rows by position), so the result stays
# correct even if the solution file lists the Ids in a different order than
# the test file.
submission['Correct'] = submission['Species'] == submission['Expected']

# Show the first misclassified rows.
submission[~submission['Correct']].head()

Unnamed: 0_level_0,Species,Expected,Correct
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
78,Iris-virginica,Iris-versicolor,False
84,Iris-virginica,Iris-versicolor,False
