## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# importar os pacotes necessários para os algoritmos de regressão
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
#from sklearn.ensemble import VotingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lars
from sklearn.linear_model import LassoLars
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

## Carga dos dados de entrada

In [3]:
# Load the training set from CSV, using its 'id' column as the DataFrame index.
train_data = pd.read_csv('abalone-train.csv', index_col='id')

In [4]:
# Load the test set from CSV, using its 'id' column as the DataFrame index.
test_data = pd.read_csv('abalone-test.csv', index_col='id')

## Transformações nos dados

In [5]:
# One-hot encode the categorical 'sex' column: build 'sex_*' indicator columns,
# then swap them in for the original column. The same steps are applied to the
# training frame and to the test frame.
data_sex = pd.get_dummies(train_data['sex'], prefix='sex')
train_data = train_data.drop(columns='sex').join(data_sex)

data_sex = pd.get_dummies(test_data['sex'], prefix='sex')
test_data = test_data.drop(columns='sex').join(data_sex)

In [7]:
# Remove hand-picked outliers from the training frame.
#
# Each rule reads (feature, feature comparison, feature threshold,
# rings comparison, rings threshold); a row is an outlier when BOTH comparisons
# hold. The comparison names are pandas Series methods (gt/ge/lt).
# NOTE(review): the ring thresholds are written as "N - 1.5"; presumably N is an
# age and age = rings + 1.5 -- confirm against the data set description.
data = train_data  # alias only: the drop below mutates train_data in place
outlier_rules = [
    ('viscera_weight', 'gt', 0.5, 'lt', 20 - 1.5),
    ('viscera_weight', 'lt', 0.5, 'gt', 25 - 1.5),
    ('shell_weight', 'gt', 0.6, 'lt', 25 - 1.5),
    ('shell_weight', 'lt', 0.8, 'gt', 25 - 1.5),
    ('shucked_weight', 'ge', 1.0, 'lt', 20 - 1.5),
    ('shucked_weight', 'lt', 1.0, 'gt', 20 - 1.5),
    ('whole_weight', 'ge', 2.5, 'lt', 25 - 1.5),
    ('whole_weight', 'lt', 2.5, 'gt', 25 - 1.5),
    ('diameter', 'lt', 0.1, 'lt', 5 - 1.5),
    ('diameter', 'lt', 0.6, 'gt', 25 - 1.5),
    ('diameter', 'ge', 0.6, 'lt', 25 - 1.5),
    ('height', 'gt', 0.4, 'lt', 15 - 1.5),
    ('height', 'lt', 0.4, 'gt', 25 - 1.5),
    ('length', 'lt', 0.1, 'lt', 5 - 1.5),
    ('length', 'lt', 0.8, 'gt', 25 - 1.5),
    ('length', 'ge', 0.8, 'lt', 25 - 1.5),
]

# Collect the index labels matched by each filter, in rule order. A label that
# matches several rules appears several times; DataFrame.drop accepts that.
outliers = np.concatenate(
    # Rows with a zero height are always discarded.
    [data[data['height'].eq(0.0)].index]
    + [
        data[
            getattr(data[feature], feature_cmp)(feature_limit)
            & getattr(data['rings'], rings_cmp)(rings_limit)
        ].index
        for feature, feature_cmp, feature_limit, rings_cmp, rings_limit in outlier_rules
    ],
    axis=0,
)
train_data.drop(outliers, inplace=True)

## Seleção dos dados de treino e teste

In [8]:
# Split the training frame into the target vector and the feature matrix.

y_train = train_data['rings']  # target column only
X_train = train_data.drop(columns=['rings'])  # every column except the target

print('Forma dos dados de treino:', X_train.shape, y_train.shape)

Forma dos dados de treino: (2672, 10) (2672,)


In [9]:
# Define the test features.

X_test = test_data # the whole frame, since the test set has no target column

# Report the shape of the test feature matrix.
print('Forma dos dados de teste:', X_test.shape)

Forma dos dados de teste: (1393, 10)


## Treinamento dos modelos e geração dos resultados 

In [10]:
# Candidate regressors as (short label, estimator) pairs. The label becomes the
# suffix of each submission filename written by the training loop.
#
# `normalize=True` is intentionally NOT passed to LinearRegression / Ridge:
# both are built with fit_intercept=False, in which case scikit-learn documents
# `normalize` as ignored, and the argument was later removed from the library
# (passing it raises TypeError on recent releases). Fitted models are unchanged.
models = [
    ('LinReg', LinearRegression(n_jobs=2, fit_intercept=False)),
    #('LogReg', LogisticRegression(random_state=42, n_jobs=2, multi_class='auto', C=1000, solver='newton-cg')),
    ('Ridge', Ridge(random_state=42, alpha=0.1, fit_intercept=False)),
    ('KNN', KNeighborsRegressor(n_jobs=2, n_neighbors=13, weights='distance')),
    ('SVM', SVR(gamma='auto', kernel='linear')),
    ('RF', RandomForestRegressor(random_state=42, n_jobs=2, n_estimators=100, max_depth=7)),
    ('GB', GradientBoostingRegressor(random_state=42)),
    ('DT', DecisionTreeRegressor(random_state=42)),
    ('GP', GaussianProcessRegressor()),
    ('MLP', MLPRegressor(random_state=42)),
    # NOTE(review): here fit_intercept=True, so normalize=True does affect the
    # fit and is kept as-is; it is deprecated/removed in newer scikit-learn and
    # needs an explicit scaling step to migrate -- TODO.
    ('OMP', OrthogonalMatchingPursuit(n_nonzero_coefs=8, fit_intercept=True, normalize=True)),
    ('PA', PassiveAggressiveRegressor(random_state=42, C=0.2, fit_intercept=True)),
    ('RANSAC', RANSACRegressor(random_state=42, min_samples=0.75)),
    ('SGD', SGDRegressor(random_state=42, alpha=1e-06, fit_intercept=False, penalty=None)),
    ('TS', TheilSenRegressor(random_state=42, n_jobs=2, fit_intercept=True)),
]

In [11]:
!mkdir submissions

mkdir: cannot create directory ‘submissions’: File exists


In [12]:
# Fit every candidate regressor on the training split, predict the test split,
# and write one submission CSV per model.
for name, model in models:
    print(model, '\n')

    # Train on the training data, then predict ring counts for the test rows.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Submission table: a single 'rings' column indexed by the test-set ids.
    submission = pd.DataFrame({'rings': y_pred}, index=X_test.index.rename('id'))

    # One CSV per model; the lower-cased model label is the filename suffix.
    filename = 'submissions/abalone-submission-p-%s.csv' % name.lower()
    submission.to_csv(filename)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=2, normalize=True) 

Ridge(alpha=0.1, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=True, random_state=42, solver='auto', tol=0.001) 

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=2, n_neighbors=13, p=2,
          weights='distance') 

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False) 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
           oob_score=False, random_state=42, verbose=0, warm_start=False) 

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learnin



OrthogonalMatchingPursuit(fit_intercept=True, n_nonzero_coefs=8,
             normalize=True, precompute='auto', tol=None) 

PassiveAggressiveRegressor(C=0.2, average=False, early_stopping=False,
              epsilon=0.1, fit_intercept=True, loss='epsilon_insensitive',
              max_iter=None, n_iter=None, n_iter_no_change=5,
              random_state=42, shuffle=True, tol=None,
              validation_fraction=0.1, verbose=0, warm_start=False) 

RANSACRegressor(base_estimator=None, is_data_valid=None, is_model_valid=None,
        loss='absolute_loss', max_skips=inf, max_trials=100,
        min_samples=0.75, random_state=42, residual_threshold=None,
        stop_n_inliers=inf, stop_probability=0.99, stop_score=inf) 

SGDRegressor(alpha=1e-06, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=False, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty=None, power_t=0.2



In [14]:
# Inspect the first lines of each generated submission file.
!head submissions/abalone-submission-p-*.csv

==> submissions/abalone-submission-p-dt.csv <==
id,rings
1512,9.0
1401,11.0
509,11.0
2883,8.0
3259,12.0
2135,11.0
1499,10.0
3597,15.0
2414,8.0

==> submissions/abalone-submission-p-gb.csv <==
id,rings
1512,10.076760285156443
1401,11.5551225922323
509,12.040373529603869
2883,7.803899387826646
3259,10.87935913775713
2135,9.836501468705915
1499,9.89609942191297
3597,11.755980157105101
2414,7.815914799984401

==> submissions/abalone-submission-p-gp.csv <==
id,rings
1512,10.58349609375
1401,9.832375526428223
509,15.272384643554688
2883,7.8772430419921875
3259,10.749980926513672
2135,7.946163177490234
1499,8.839645385742188
3597,6.869190216064453
2414,9.377166748046875

==> submissions/abalone-submission-p-knn.csv <==
id,rings
1512,10.450920122076752
1401,10.547940060237595
509,11.000246426623573
2883,8.294157251191635
3259,11.811687671045604
2135,8.334921659332283
1499,9.948263505915676
3597,11.613391575547311
2414,7.302274816638973

==> submissions/abalone-submission-p-linreg.csv <==
id,ri