<a href="https://colab.research.google.com/github/harnalashok/classification/blob/main/4_otto_rf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# https://www.kaggle.com/hsperr/finding-ensamble-weights

In [11]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
import os


In [4]:
# Switch to the directory that holds train.csv.zip, which is read through a
# relative path further down.
# NOTE(review): "/content" looks like the Colab working directory — adjust
# this path when running the notebook anywhere else.
os.chdir("/content")

In [5]:
# Read the training table, split off the 'target' column as the labels, and
# drop the 'id' column so that only the remaining columns are left in `train`.
train = pd.read_csv("train.csv.zip")
labels = train.pop('target')
train = train.drop(columns=['id'])

In [14]:
# Replace the class labels with integer codes. The fitted encoder stays in
# `ll` so the codes can be mapped back to the original labels later.
ll = LabelEncoder()
labels = ll.fit_transform(labels)

In [19]:
# A single stratified shuffle split that holds out 5% of the rows; the fixed
# random_state makes the split repeatable across runs.
sss = StratifiedShuffleSplit(n_splits = 1, test_size=0.05, random_state=1234)

In [20]:
# Take the first pair of index arrays produced by the splitter and stop there.
train_index, test_index = next(sss.split(train, labels))

In [24]:
# sss.split yields integer *positions*, so rows are selected positionally
# with .iloc. Using .loc would look the numbers up as index labels, which
# only gives the same rows while `train` still has its default 0..n-1 index.
train_x, train_y = train.iloc[train_index, :], labels[train_index]
test_x, test_y = train.iloc[test_index, :], labels[test_index]

In [32]:
### building the classifiers
# Every fitted model is appended to this list; the list is iterated later to
# collect each model's predict_proba output on the hold-out rows.
clfs = []

In [33]:
# First random forest (seed 4141): fit on the training rows, report the
# hold-out log loss, and register the model for blending.
rfc = RandomForestClassifier(
    n_estimators=50,
    random_state=4141,
    n_jobs=-1,
).fit(train_x, train_y)
print('RFC LogLoss {score}'.format(
    score=log_loss(test_y, rfc.predict_proba(test_x))
))
clfs.append(rfc)

RFC LogLoss 0.6851102151214558


In [34]:
# Logistic regression with a raised iteration cap (max_iter=8000): fit on the
# training rows, report the hold-out log loss, and register it for blending.
logreg = LogisticRegression(max_iter=8000).fit(train_x, train_y)
print('LogisticRegression LogLoss {score}'.format(
    score=log_loss(test_y, logreg.predict_proba(test_x))
))
clfs.append(logreg)

LogisticRegression LogLoss 0.6481986200331104


In [35]:
# Second random forest: same settings as the first one except for the seed
# (1337). Fit, report the hold-out log loss, and register it for blending.
rfc2 = RandomForestClassifier(
    n_estimators=50,
    random_state=1337,
    n_jobs=-1,
).fit(train_x, train_y)
print('RFC2 LogLoss {score}'.format(
    score=log_loss(test_y, rfc2.predict_proba(test_x))
))
clfs.append(rfc2)


RFC2 LogLoss 0.6541558264501615


In [36]:
clfs

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                        oob_score=False, random_state=4141, verbose=0,
                        warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=8000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini'

In [37]:
# One class-probability matrix per fitted model, in the same order as `clfs`.
predictions = [model.predict_proba(test_x) for model in clfs]

In [40]:
len(predictions)  # 3
predictions[0]

array([[0.  , 0.02, 0.14, ..., 0.  , 0.36, 0.46],
       [0.  , 0.58, 0.22, ..., 0.02, 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 1.  , 0.  ],
       ...,
       [0.02, 0.02, 0.02, ..., 0.  , 0.16, 0.26],
       [0.  , 0.48, 0.42, ..., 0.  , 0.  , 0.02],
       [0.06, 0.  , 0.  , ..., 0.  , 0.08, 0.84]])

In [49]:
# Walk through a weighted blend by hand. After each model, show the first
# three rows of that model's class probabilities and of the running weighted
# sum. The demo weights add up to 0.9, so the blended rows do not sum to 1.
weights = [0.2,0.3,0.4]
final_prediction = 0
for weight, prediction in zip(weights, predictions):
  print(weight)
  print("\n\n---prediction-----\n")
  print(np.round(prediction[:3], 3))
  final_prediction = final_prediction + weight*prediction
  print("\n\n---final pred---------\n")
  print(np.round(final_prediction[:3], 3))
  print("\n***************************\n")

0.2


---prediction-----

[[0.   0.02 0.14 0.   0.   0.02 0.   0.36 0.46]
 [0.   0.58 0.22 0.16 0.   0.02 0.02 0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   1.   0.  ]]


---final pred---------

[[0.    0.004 0.028 0.    0.    0.004 0.    0.072 0.092]
 [0.    0.116 0.044 0.032 0.    0.004 0.004 0.    0.   ]
 [0.    0.    0.    0.    0.    0.    0.    0.2   0.   ]]

***************************

0.3


---prediction-----

[[0.    0.    0.    0.    0.    0.    0.    0.066 0.933]
 [0.    0.498 0.255 0.234 0.001 0.003 0.007 0.001 0.001]
 [0.    0.    0.001 0.    0.    0.    0.    0.986 0.012]]


---final pred---------

[[0.    0.004 0.028 0.    0.    0.004 0.    0.092 0.372]
 [0.    0.266 0.12  0.102 0.    0.005 0.006 0.    0.   ]
 [0.    0.    0.    0.    0.    0.    0.    0.496 0.004]]

***************************

0.4


---prediction-----

[[0.02 0.08 0.04 0.04 0.   0.   0.   0.26 0.56]
 [0.   0.6  0.26 0.12 0.   0.   0.02 0.   0.  ]
 [0.   0.04 0.02 0.   0.   0.02 0.   0.92 0.  ]]


--

In [54]:
def log_loss_func(weights):
    """Objective for scipy's minimize: hold-out log loss of the weighted blend.

    scipy passes ``weights`` as a numpy array. Each entry scales the matching
    probability matrix in the module-level ``predictions`` list, and the
    scaled matrices are added together. The result is scored against the
    module-level ``test_y`` labels with ``log_loss`` and that score is
    returned.
    """
    # sum() starts from 0 and adds the scaled matrices in list order.
    blended = sum(w * proba for w, proba in zip(weights, predictions))
    return log_loss(test_y, blended)
    

In [55]:
# The optimizer needs a starting point. Start from equal weights that already
# sum to 1, so the first iterate satisfies both the sum-to-one constraint and
# the (0, 1) bounds. A start of 0.5 per model adds up to 1.5 for three models
# and therefore begins outside the feasible region.
# It is better to try many random starting points and run minimize a few times.
starting_values = [1.0 / len(predictions)] * len(predictions)

In [56]:
#adding constraints  and a different solver as suggested by user 16universe
#https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
# A single equality constraint: the function evaluates to 0 exactly when the
# weights sum to 1.
cons = {'type': 'eq', 'fun': lambda weights: 1 - sum(weights)}

In [57]:
# Every weight is restricted to the interval [0, 1]: one (low, high) pair
# per model.
bounds = [(0, 1) for _ in predictions]

In [58]:
bounds

[(0, 1), (0, 1), (0, 1)]

In [59]:
# Search for the blend weights that minimize the hold-out log loss, subject
# to the per-weight bounds and the sum-to-one equality constraint.
res = minimize(
    log_loss_func,
    starting_values,
    method='SLSQP',
    bounds=bounds,
    constraints=cons,
)
# minimize reports failure through the result object rather than by raising,
# so surface a non-converged run instead of silently reporting its weights.
if not res.success:
    print('WARNING: SLSQP did not converge: {message}'.format(message=res.message))

In [60]:
# Objective value (hold-out log loss) at the weights returned by the optimizer.
print('Ensamble Score: {best_score}'.format(best_score=res.fun))


Ensamble Score: 0.5517580125653778


In [61]:
# Weights returned by the optimizer, one per model in the order of `predictions`.
print('Best Weights: {weights}'.format(weights=res.x))

Best Weights: [0.29893736 0.35007235 0.35099029]


In [None]:
########### I am done #################