In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

In [3]:
# Load the raw table: semicolon-separated, first row is the header.
train_data = pd.read_csv('Insol_3.csv', sep=';', header=0)

# Rows without a target value cannot be used for supervised training.
train_data = train_data.dropna(axis=0, subset=['Insolvencia'])
y = train_data['Insolvencia']  # Target variable
train_data = train_data.drop(columns=['Insolvencia'])  # Keep the target out of the feature table

# Select numeric (int64 / float64) columns only.
# Columns that pandas auto-named "Unnamed: N" are skipped: they come from a
# header-less CSV column, and here "Unnamed: 0" appears to be a row index
# saved with the file (0, 1, 2, ...), which must not be used as a feature.
numeric_cols = [
    cname
    for cname in train_data.columns
    if train_data[cname].dtype in ['int64', 'float64']
    and not str(cname).startswith('Unnamed')
]
X = train_data[numeric_cols].copy()

print("Shape of input data: {} and shape of target variable: {}".format(X.shape, y.shape))

X.head()  # Show first 5 training examples

Shape of input data: (2738, 32) and shape of target variable: (2738,)


Unnamed: 0,Distrito,NACE1,NACE2,Numero_Empregados,Num_anos_disponiveis_base,Variacao_das_vendas,Variacao_resultado_operacional,Variacao_do_activo,Variacao_dos_capitais_proprios,Variacao_do_ativo_circulante,...,Custos_dos_encargos_financeiros_dividir_resultado_operacional,Rendibilidade_operacional_vendas,Rendibilidade_liquida_das_vendas,Rendibilidade_do_ativo,Rendibilidade_capitais_proprios,Passivo_curto_prazo_dividir_vendas,Peso_das_amortizacoes_dividir_vendas,Peso_encargos_financeiros_dividir_vendas,Produtividade_por_trabalhador,Certificacao_legal_contas
0,1,14,141,225,9,-0.055116,-12.183443,-0.081616,-0.353879,-0.464741,...,-9.693646,-0.123985,-0.139525,-0.162146,-0.353879,0.904606,0.040788,0.01279,0.905354,1
1,1,13,139,28,10,-0.163781,-7.685481,-0.12059,0.002009,-0.121442,...,-0.807625,-0.004936,0.000886,0.001328,0.002009,0.103359,0.025632,0.006112,1.330416,1
2,5,14,141,80,9,-0.193743,-1.48674,-0.118888,0.055864,-0.123646,...,-17.256303,-0.015714,0.007059,0.028844,0.055865,0.100136,0.007319,0.000911,1.205843,1
3,5,14,141,175,8,-0.156787,-1.366234,-0.122807,-0.109388,-0.127069,...,-1.731467,-0.021469,-0.059088,-0.092117,-0.109388,0.422792,0.016605,0.012399,0.929479,1
4,1,13,139,140,10,-0.059913,-0.742674,-0.019612,0.002116,-0.041525,...,4.74787,0.046388,0.054399,0.033852,0.038284,0.223548,0.094521,0.00977,1.774669,1


In [9]:
# Split the data into 20 shuffled folds; the fixed random_state makes the
# fold assignment reproducible between runs.
# This 'kf' (KFold splitting strategy) object is passed as `cv` to cross_val_score().
kf = KFold(n_splits=20, shuffle=True, random_state=42)

# split() yields one (train indices, test indices) pair per fold.
# Print the fold sizes as a sanity check; folds are numbered from 1.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

Fold:1, Train set: 2601, Test set:137
Fold:2, Train set: 2601, Test set:137
Fold:3, Train set: 2601, Test set:137
Fold:4, Train set: 2601, Test set:137
Fold:5, Train set: 2601, Test set:137
Fold:6, Train set: 2601, Test set:137
Fold:7, Train set: 2601, Test set:137
Fold:8, Train set: 2601, Test set:137
Fold:9, Train set: 2601, Test set:137
Fold:10, Train set: 2601, Test set:137
Fold:11, Train set: 2601, Test set:137
Fold:12, Train set: 2601, Test set:137
Fold:13, Train set: 2601, Test set:137
Fold:14, Train set: 2601, Test set:137
Fold:15, Train set: 2601, Test set:137
Fold:16, Train set: 2601, Test set:137
Fold:17, Train set: 2601, Test set:137
Fold:18, Train set: 2601, Test set:137
Fold:19, Train set: 2602, Test set:136
Fold:20, Train set: 2602, Test set:136


In [10]:
def rmse(score):
    """Print the root-mean-squared error for a negated MSE score.

    Args:
        score: A negated mean-squared-error value (the callers pass the mean
            of the per-fold "neg_mean_squared_error" scores). The sign is
            flipped before taking the square root.

    Returns:
        None. The result is only printed, formatted to two decimals.
    """
    root_error = np.sqrt(-score)
    print(f'rmse= {root_error:.2f}')

In [14]:
# Baseline: ordinary least squares scored on the shared `kf` folds.
# The scorer returns negated MSE per fold; rmse() flips the sign of the mean.
lin_reg = linear_model.LinearRegression()
score = cross_val_score(lin_reg, X, y, scoring="neg_mean_squared_error", cv=kf)
print(f'Model Linear Regression Scores for each fold: {score}')
rmse(score.mean())

Model Linear Regression Scores for each fold: [ -0.96792485  -0.08811693  -0.15836566  -0.11152822 -12.90455089
  -0.19543304  -0.6076594   -0.47165354  -0.11007163  -0.07758305
  -0.10847733  -0.10298012  -0.10231813  -0.08915443  -0.10255271
  -0.08955146  -0.11384491  -0.12313776  -0.08434705  -0.08548616]
rmse= 0.91


In [12]:
# Single decision tree with default depth, scored on the same `kf` folds.
# random_state pins the tree's internal tie-breaking so reruns match.
dt_reg = tree.DecisionTreeRegressor(random_state=42)
score = cross_val_score(dt_reg, X, y, scoring="neg_mean_squared_error", cv=kf)
print(f'Model DecisionTreeRegressor Scores for each fold: {score}')
rmse(score.mean())

Scores for each fold: [-0.05109489 -0.02919708 -0.02919708 -0.00729927 -0.04379562 -0.03649635
 -0.01459854 -0.02919708 -0.06569343 -0.06569343 -0.06569343 -0.04379562
 -0.02189781 -0.04379562 -0.03649635 -0.04379562 -0.02189781 -0.05109489
 -0.04411765 -0.04411765]
rmse= 0.20


In [13]:
# Random forest with default settings, scored on the same `kf` folds.
# random_state makes the bootstrap sampling reproducible.
rf_reg = ensemble.RandomForestRegressor(random_state=42)
score = cross_val_score(rf_reg, X, y, scoring="neg_mean_squared_error", cv=kf)
print(f'Model Random Forest Regressor Scores for each fold are: {score}')
rmse(score.mean())

Scores for each fold are: [-0.02418248 -0.01627372 -0.01131095 -0.00409124 -0.03311679 -0.02338248
 -0.00416131 -0.03031095 -0.01776058 -0.02092409 -0.04059708 -0.02764088
 -0.00993504 -0.01558613 -0.01559854 -0.02614234 -0.01004964 -0.03348394
 -0.01476176 -0.00709706]
rmse= 0.14


In [15]:
# Decision Tree Regressor tuning: cross-validated RMSE for each tree depth 1..10.
max_depth = list(range(1, 11))

for depth in max_depth:
    candidate = tree.DecisionTreeRegressor(max_depth=depth, random_state=42)
    score = cross_val_score(candidate, X, y, scoring="neg_mean_squared_error", cv=kf)
    print(f'For max depth: {depth}')
    rmse(score.mean())

For max depth: 1
rmse= 0.13
For max depth: 2
rmse= 0.14
For max depth: 3
rmse= 0.14
For max depth: 4
rmse= 0.14
For max depth: 5
rmse= 0.15
For max depth: 6
rmse= 0.15
For max depth: 7
rmse= 0.16
For max depth: 8
rmse= 0.16
For max depth: 9
rmse= 0.17
For max depth: 10
rmse= 0.17


In [16]:
# Random Forest Regressor tuning: cross-validated RMSE for each forest size.
estimators = [50, 100, 150, 200, 250, 300, 350]

for count in estimators:
    forest = ensemble.RandomForestRegressor(n_estimators=count, random_state=42)
    # Each setting fits 20 forests (one per fold). n_jobs=-1 fits the folds in
    # parallel on all available cores so the sweep finishes in reasonable time;
    # the scores are unchanged because every forest is seeded.
    score = cross_val_score(
        forest, X, y, cv=kf, scoring="neg_mean_squared_error", n_jobs=-1
    )
    print(f'For estimators: {count}')
    rmse(score.mean())

For estimators: 50
rmse= 0.14
For estimators: 100
rmse= 0.14
For estimators: 150
rmse= 0.14


KeyboardInterrupt: 

In [20]:
pip install pycaret

^C
Note: you may need to restart the kernel to use updated packages.


In [18]:
# NOTE(review): this cell fails with NameError in the recorded run, because
# `compare_models` is not imported or defined in any visible cell.
# Presumably it is PyCaret's compare_models (installed in the cell above) —
# TODO confirm, import it from the appropriate pycaret module and run its
# setup step first.
# NOTE(review): no 'Profit' metric is defined in the visible notebook — verify
# that it is registered before it is used as the sort key.
best_model = compare_models(sort='Profit')


NameError: name 'compare_models' is not defined