## Importação dos pacotes

In [10]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [11]:
# importar os pacotes necessários para os algoritmos de regressão
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
#from sklearn.ensemble import VotingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lars
from sklearn.linear_model import LassoLars
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

## Carga dos dados de entrada

In [12]:
# Load the training data file, using its 'id' column as the row index.
train_data = pd.read_csv('abalone-train.csv', index_col='id')

In [13]:
# Load the test data file, using its 'id' column as the row index.
test_data = pd.read_csv('abalone-test.csv', index_col='id')

In [18]:
# One-hot encode the categorical 'sex' column of the training data and
# replace the original column with the resulting 'sex_*' indicator columns.
data_sex = pd.get_dummies(train_data['sex'], prefix='sex')
train_sex_columns = data_sex.columns
train_data = train_data.join(data_sex).drop('sex', axis=1)

# Encode the test data the same way, then align its indicator columns to the
# ones produced for training. get_dummies only creates columns for the
# categories it actually sees, so encoding the two files independently would
# give them different feature columns whenever a category is present in only
# one of them, and the fitted models could not predict on the test frame.
data_sex = pd.get_dummies(test_data['sex'], prefix='sex')
data_sex = data_sex.reindex(columns=train_sex_columns, fill_value=0)
test_data = test_data.join(data_sex).drop('sex', axis=1)

## Seleção dos dados de treino e teste

In [21]:
# Split the training frame into the target vector and the feature matrix.
y_train = train_data['rings']  # target column only
X_train = train_data.drop(columns=['rings'])  # every column except the target

print('Forma dos dados de treino:', X_train.shape, y_train.shape)

Forma dos dados de treino: (2784, 10) (2784,)


In [22]:
# Define the test data.
# NOTE: this binds X_test to the same object as test_data; no copy is made.

X_test = test_data # everything, since the test set has no target column

print('Forma dos dados de teste:', X_test.shape)

Forma dos dados de teste: (1393, 10)


## Treinamento dos modelos e geração dos resultados 

In [23]:
# Candidate estimators as (label, estimator) pairs. The label is lower-cased
# by the training loop to build each submission file name.
# NOTE(review): LogisticRegression is a classifier, so it will treat the
# 'rings' target as discrete class labels rather than a continuous value.
# Confirm that including it among the regressors is intended.
models = [
    ('LinReg', LinearRegression()),
    ('LogReg', LogisticRegression(random_state=42, solver='lbfgs', multi_class='auto')),
    ('KNN', KNeighborsRegressor()),
    ('SVM', SVR(gamma='auto')),
    ('RF', RandomForestRegressor(random_state=42)),
    ('GB', GradientBoostingRegressor(random_state=42)),
    ('DT', DecisionTreeRegressor(random_state=42)),
    ('GP', GaussianProcessRegressor()),
    ('MLP', MLPRegressor(random_state=42)),
]

In [24]:
# Fit every candidate model on the training set and write one submission CSV
# per model containing its predictions for the test set.
for name, model in models:
    print(model, '\n')

    # fit() returns the estimator itself, so training and prediction chain
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # pair each prediction with its test-set id and index the frame by id
    submission = pd.DataFrame({
        'id': X_test.index,
        'rings': y_pred,
    }).set_index('id')

    # one output file per model, named after the lower-cased model label
    filename = 'abalone-submission-p-%s.csv' % name.lower()
    submission.to_csv(filename)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False) 





KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform') 

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False) 





GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=42, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False) 

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best') 

GaussianProcessRegressor(alpha=1e-10, copy_X_train=True, kernel=None,
             n_restarts_optimizer=0, normalize_y=False,
             optimizer='fmin_l_bf



In [25]:
# Check the contents of the generated files (IPython shell escape: prints
# the first lines of every matching submission CSV).
!head abalone-submission-p-*.csv

==> abalone-submission-p-dt.csv <==
id,rings
1512,11.0
1401,12.0
509,17.0
2883,9.0
3259,9.0
2135,11.0
1499,9.0
3597,13.0
2414,8.0

==> abalone-submission-p-gb.csv <==
id,rings
1512,10.009380909708643
1401,11.151726312062774
509,12.018518145626894
2883,7.810752670775166
3259,11.15306370585676
2135,9.317058657838714
1499,10.05816996813471
3597,12.123627728186323
2414,7.720781144138582

==> abalone-submission-p-gp.csv <==
id,rings
1512,9.91204833984375
1401,10.514533996582031
509,15.443496704101562
2883,7.8646697998046875
3259,11.645833015441895
2135,7.863286018371582
1499,8.795156955718994
3597,8.822440147399902
2414,9.293121337890625

==> abalone-submission-p-knn.csv <==
id,rings
1512,10.4
1401,9.6
509,12.2
2883,8.4
3259,14.0
2135,8.2
1499,9.6
3597,10.6
2414,6.8

==> abalone-submission-p-linreg.csv <==
id,rings
1512,8.953125
1401,11.21875
509,11.5
2883,8.359375
3259,11.953125
2135,9.390625
1499,9.53125
3597,12.359375
2414,8.14062