## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# importar os pacotes necessários para os algoritmos de regressão
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
#from sklearn.ensemble import VotingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lars
from sklearn.linear_model import LassoLars
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [12]:
sufixo_arquivo = 'o9-0.50'

## Carga dos dados de entrada

In [13]:
# carregar arquivo de dados de treino
train_data = pd.read_csv('abalone-train-' + sufixo_arquivo + '.csv', index_col='id')
#train_data = pd.read_csv('abalone-train.csv', index_col='id')

In [4]:
# carregar arquivo de dados de teste
test_data = pd.read_csv('abalone-test.csv', index_col='id')

## Transformações nos dados

In [5]:
data_sex = pd.get_dummies(train_data['sex'], prefix='sex')
train_data = train_data.join(data_sex)
train_data.drop('sex', axis=1, inplace=True)

data_sex = pd.get_dummies(test_data['sex'], prefix='sex')
test_data = test_data.join(data_sex)
test_data.drop('sex', axis=1, inplace=True)

## Seleção dos dados de treino e teste

In [6]:
# definir dados de treino

X_train = train_data.drop(['rings'], axis=1) # tudo, exceto a coluna alvo
y_train = train_data['rings'] # apenas a coluna alvo

print('Forma dos dados de treino:', X_train.shape, y_train.shape)

Forma dos dados de treino: (2763, 10) (2763,)


In [7]:
# definir dados de teste

X_test = test_data # tudo, já que não possui a coluna alvo

print('Forma dos dados de teste:', X_test.shape)

Forma dos dados de teste: (1393, 10)


## Treinamento dos modelos e geração dos resultados 

In [8]:
models = []

# Generalized Linear Models
models.append(('LinReg', LinearRegression(n_jobs=2, fit_intercept=False, normalize=True)))
models.append(('LogReg', LogisticRegression(n_jobs=2, random_state=42, multi_class='auto', C=1000, solver='newton-cg')))
models.append(('OMP', OrthogonalMatchingPursuit(n_nonzero_coefs=8, fit_intercept=True, normalize=True)))
models.append(('PAR', PassiveAggressiveRegressor(random_state=42, C=0.1, fit_intercept=True, max_iter=1000, tol=0.001)))
models.append(('PP', Perceptron(random_state=42, penalty='l2', alpha=1e-5, fit_intercept=True, max_iter=1000, tol=1e-3)))
models.append(('RANSAC', RANSACRegressor(random_state=42, min_samples=0.75)))
models.append(('Ridge', Ridge(random_state=42, alpha=0.1, fit_intercept=False, normalize=True)))
models.append(('SGD', SGDRegressor(random_state=42, alpha=1e-06, fit_intercept=True, penalty=None, tol=1e-3)))
models.append(('TSR', TheilSenRegressor(random_state=42, n_jobs=2, fit_intercept=True)))

# Decision Trees
models.append(('DTR', DecisionTreeRegressor(random_state=42, max_depth=4, min_samples_split=0.25)))

# Gaussian Processes
models.append(('GPR', GaussianProcessRegressor(random_state=42, alpha=0.01, normalize_y=True)))

# Kernel Ridge Regression
models.append(('KRR', KernelRidge()))

# Naïve Bayes
models.append(('GNB', GaussianNB(var_smoothing=0.1)))

# Nearest Neighbors
models.append(('kNN', KNeighborsRegressor(n_jobs=2, n_neighbors=13, weights='distance')))

# Support Vector Machines
models.append(('SVM', SVR(gamma='auto', kernel='linear')))

# Neural network models
models.append(('MLP', MLPRegressor(random_state=42, max_iter=500,
                     activation='logistic', hidden_layer_sizes=(50,), solver='lbfgs')))

# Ensemble Methods
models.append(('RFR', RandomForestRegressor(random_state=42, n_jobs=2, n_estimators=100, max_depth=7)))
models.append(('GBR', GradientBoostingRegressor(random_state=42, learning_rate=0.05, n_estimators=100,
                                  subsample=0.4, max_depth=6, max_features=10)))
models.append(('ETR', ExtraTreesRegressor(random_state=42, n_jobs=2, n_estimators=200, max_features=5)))
models.append(('BDTR', BaggingRegressor(random_state=42, n_jobs=2, base_estimator=DecisionTreeRegressor())))
models.append(('ABDTR', AdaBoostRegressor(random_state=42, n_estimators=100, base_estimator=DecisionTreeRegressor())))

# XGBoost
models.append(('XGBR', XGBRegressor(random_state=42, n_jobs=2, learning_rate=0.1,
                                    n_estimators=50, max_depth=5, objective='reg:squarederror')))

In [9]:
!mkdir submissions

mkdir: cannot create directory ‘submissions’: File exists


In [10]:
for name, model in models:
    print(model, '\n')
    
    # treinar o modelo
    model.fit(X_train, y_train)
    
    # executar previsão usando o modelo
    y_pred = model.predict(X_test)
    
    # gerar dados de envio (submissão)
    submission = pd.DataFrame({
      'id': X_test.index,
      'rings': y_pred
    })
    submission.set_index('id', inplace=True)

    # gerar arquivo CSV para o envio
    filename = 'submissions/abalone-submission-p-%s-%s.csv' % (sufixo_arquivo, name.lower())
    submission.to_csv(filename)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=2, normalize=True) 

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto', n_jobs=2,
          penalty='l2', random_state=42, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False) 

OrthogonalMatchingPursuit(fit_intercept=True, n_nonzero_coefs=8,
             normalize=True, precompute='auto', tol=None) 

PassiveAggressiveRegressor(C=0.1, average=False, early_stopping=False,
              epsilon=0.1, fit_intercept=True, loss='epsilon_insensitive',
              max_iter=1000, n_iter=None, n_iter_no_change=5,
              random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False) 

Perceptron(alpha=1e-05, class_weight=None, early_stopping=False, eta0=1.0,
      fit_intercept=True, max_iter=1000, n_iter=None, n_iter_no_change=5,
      n_jobs=None, penalty='l2', random_state=

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [11]:
# verificar conteúdo dos arquivos gerados
!head submissions/abalone-submission-p-*.csv

==> submissions/abalone-submission-p-abdtr.csv <==
id,rings
1512,10.0
1401,11.0
509,11.0
2883,8.0
3259,11.0
2135,9.0
1499,10.0
3597,11.0
2414,8.0

==> submissions/abalone-submission-p-bdtr.csv <==
id,rings
1512,9.5
1401,10.5
509,11.1
2883,8.4
3259,11.9
2135,10.0
1499,10.0
3597,11.3
2414,9.8

==> submissions/abalone-submission-p-dtr.csv <==
id,rings
1512,10.481404958677686
1401,12.079136690647482
509,12.793814432989691
2883,7.406926406926407
3259,12.079136690647482
2135,9.669218989280244
1499,10.481404958677686
3597,12.079136690647482
2414,7.406926406926407

==> submissions/abalone-submission-p-etr.csv <==
id,rings
1512,10.675
1401,10.7
509,10.59
2883,8.37
3259,11.315
2135,8.65
1499,9.705
3597,11.715
2414,8.12

==> submissions/abalone-submission-p-gbr.csv <==
id,rings
1512,10.338712532548499
1401,11.504335292274668
509,10.805311071733177
2883,8.004956288265097
3259,10.8284434353022
2135,9.611951164989971
1499,9.490343285069773
359

2414,8.185487133810875

==> submissions/abalone-submission-p-par.csv <==
id,rings
1512,9.423727494819147
1401,11.683739804631518
509,10.951056411762051
2883,8.299475032694346
3259,11.984070029213452
2135,9.923298633015698
1499,10.354246549855338
3597,13.005036867865687
2414,7.875292486230166

==> submissions/abalone-submission-p-pp.csv <==
id,rings
1512,11
1401,11
509,11
2883,11
3259,11
2135,8
1499,11
3597,11
2414,11

==> submissions/abalone-submission-p-ransac.csv <==
id,rings
1512,8.9609375
1401,10.8671875
509,10.8046875
2883,8.078125
3259,11.3984375
2135,9.2578125
1499,9.484375
3597,11.890625
2414,7.765625

==> submissions/abalone-submission-p-rfr.csv <==
id,rings
1512,10.227493646989807
1401,11.117258936675807
509,11.397727250022173
2883,8.213205611292391
3259,10.97182040352264
2135,9.402110251250212
1499,9.798025811976826
3597,11.799277786967892
2414,8.158656217183195

==> submissions/abalone-submission-p-ridge.csv <==
id,rings
1