<a href="https://colab.research.google.com/github/juanprida/projects/blob/main/ML_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

class pipeline():
    """Helper bundling the steps of a tabular regression workflow.

    The object loads a train and a test CSV and exposes steps for
    categorical encoding, missing-value filling, stratified fold assignment
    and per-fold model fitting.

    Attributes:
        pdf_train: Frame read from ``train_path``.
        pdf_test: Frame read from ``test_path``.
        target: Name of the target column in the train frame.
        folds: Number of cross-validation folds.
    """

    def __init__(self, train_path, test_path, target, folds):
        """Read both CSV files and remember the target name and fold count.

        Args:
            train_path: Path of the training CSV (passed to ``pd.read_csv``).
            test_path: Path of the test CSV (passed to ``pd.read_csv``).
            target: Name of the target column.
            folds: Number of folds used by ``create_random_folds`` and, by
                default, by ``compute_regression``.
        """
        self.pdf_train = pd.read_csv(train_path)
        self.pdf_test = pd.read_csv(test_path)
        self.target = target
        self.folds = folds

    def concat_dfs(self, pdf_train, pdf_test, target_test=-999999999):
        """Stack train and test rows into one frame.

        The test frame is copied and given a target column filled with the
        sentinel ``target_test`` so the rows can be told apart again by
        ``revert_concat``. Both frames keep their own index labels.

        Args:
            pdf_train: Training frame (must contain the target column).
            pdf_test: Test frame (without the target column).
            target_test: Sentinel written into the target column of every
                test row. It must not occur as a real target value.

        Returns:
            The row-wise concatenation of train and the marked test copy.
        """
        pdf_test_with_target = pdf_test.copy()
        pdf_test_with_target[self.target] = target_test
        return pd.concat([pdf_train, pdf_test_with_target])

    def revert_concat(self, pdf_all_data, target_test=-999999999):
        """Split a frame produced by ``concat_dfs`` back into train and test.

        Args:
            pdf_all_data: Stacked frame whose test rows carry the sentinel.
            target_test: Sentinel that marks the test rows.

        Returns:
            ``(train, test)``; the target column is dropped from ``test``.
        """
        is_test = pdf_all_data[self.target] == target_test
        pdf_train_with_cats = pdf_all_data[~is_test]
        pdf_test_with_cats = pdf_all_data[is_test].drop(self.target, axis=1)
        return pdf_train_with_cats, pdf_test_with_cats

    def encode_vars(self, pdf_train, pdf_test, cats_variables, ohe_variables):
        """Encode categorical columns consistently across train and test.

        Both frames are stacked first so that codes and dummy columns are
        derived from the union of the values seen in train and test.

        Args:
            pdf_train: Training frame (must contain the target column).
            pdf_test: Test frame.
            cats_variables: Columns to convert to ``category`` dtype; an
                integer ``<col>_encoded`` column with the category codes is
                added for each (missing values get code -1).
            ohe_variables: Columns to one-hot encode; dummy columns prefixed
                with the column name are added for each.

        Returns:
            ``(train, test)`` with the extra columns, each carrying the index
            labels of the corresponding input frame.
        """
        pdf_all_data = self.concat_dfs(pdf_train, pdf_test)

        # The stacked frame keeps the labels of both inputs, so labels are
        # normally repeated (0..n for train and 0..m for test). Joining on
        # repeated labels matches every duplicate with every duplicate and
        # multiplies rows, so do the work on a unique positional index and
        # put the original labels back afterwards.
        original_index = pdf_all_data.index
        pdf_all_data = pdf_all_data.reset_index(drop=True)

        for cat in cats_variables:
            pdf_all_data[cat] = pdf_all_data[cat].astype('category')
            pdf_all_data[f'{cat}_encoded'] = pdf_all_data[cat].cat.codes

        for cat in ohe_variables:
            pdf_encoded = pd.get_dummies(pdf_all_data[cat], prefix=cat)
            pdf_all_data = pdf_all_data.join(pdf_encoded)

        pdf_all_data.index = original_index
        return self.revert_concat(pdf_all_data)

    def fill_na(self, pdf_input, strategy=None):
        """Return a copy of ``pdf_input`` with missing values filled.

        Args:
            pdf_input: Frame to fill; it is not modified.
            strategy: Mapping with the optional keys ``'mean_cols'`` (columns
                filled with their own mean) and ``'constant_cols'`` (columns
                filled with 0). ``None`` or a missing key means "no columns".

        Returns:
            The filled copy.
        """
        # A dict literal as default would be shared between calls.
        strategy = strategy or {}
        pdf = pdf_input.copy()
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the `pdf[c]` selection: the in-place form
        # acts on an intermediate object and is not guaranteed to reach `pdf`.
        for c in strategy.get('mean_cols', []):
            pdf[c] = pdf[c].fillna(pdf[c].mean())
        for c in strategy.get('constant_cols', []):
            pdf[c] = pdf[c].fillna(0)
        return pdf

    def create_random_folds(self, pdf):
        """Shuffle ``pdf`` and assign each row to a stratified fold.

        The continuous target is cut into equal-width bins (bin count from
        Sturges' rule, ``1 + log2(n)``) and the bins are used as the
        stratification labels for ``StratifiedKFold``.

        Args:
            pdf: Frame containing the target column; it is not modified.

        Returns:
            A shuffled copy with a fresh ``0..n-1`` index and an integer
            ``kfold`` column holding the validation fold of every row.
        """
        # `sample` returns a new frame, so adding the column afterwards keeps
        # the caller's frame untouched.
        pdf = pdf.sample(frac=1, random_state=0).reset_index(drop=True)
        pdf['kfold'] = -1

        num_bins = int(np.floor(1 + np.log2(len(pdf))))
        # Kept as a separate Series so a pre-existing 'bins' column in the
        # data is neither overwritten nor dropped.
        bins = pd.cut(pdf[self.target], bins=num_bins, labels=False)

        kf = StratifiedKFold(n_splits=self.folds)
        for f, (_, v_) in enumerate(kf.split(X=pdf, y=bins.values)):
            # Positions equal labels here because the index was just reset.
            pdf.loc[v_, 'kfold'] = f
        return pdf

    def split_x_y(self, pdf, fold):
        """Split a fold-annotated frame into train/validation arrays.

        Args:
            pdf: Frame with a ``kfold`` column and the target column.
            fold: Fold id used for validation; all other rows are training.

        Returns:
            ``(X_train, y_train, X_val, y_val)`` as NumPy arrays, where the
            feature arrays contain every column except ``kfold`` and the
            target.
        """
        is_val = pdf['kfold'] == fold
        non_feature_cols = ['kfold', self.target]

        pdf_train = pdf[~is_val]
        X_train = pdf_train.drop(non_feature_cols, axis=1).values
        y_train = pdf_train[self.target].values

        pdf_val = pdf[is_val]
        X_val = pdf_val.drop(non_feature_cols, axis=1).values
        y_val = pdf_val[self.target].values

        return X_train, y_train, X_val, y_val

    def compute_regression(self, model, pdf, folds=None):
        """Fit ``model`` once per fold and collect its predictions.

        Args:
            model: Object exposing ``fit(X, y)`` and ``predict(X)``. The same
                object is refit on every fold, so after the call it holds the
                fit of the last fold.
            pdf: Frame with a ``kfold`` column and the target column.
            folds: Fold ids to evaluate. An iterable is used as given, an int
                ``k`` means ``range(k)``, and ``None`` (default) means
                ``range(self.folds)``.

        Returns:
            ``(pdf_tpreds, pdf_vpreds)``: two frames stacking, over all
            evaluated folds, the training rows (``y_train``, ``y_train_hat``,
            ``kfold``) and the validation rows (``y_val``, ``y_val_hat``,
            ``kfold``).
        """
        # `self` is not available in a default argument, so resolve it here.
        if folds is None:
            folds = self.folds
        if isinstance(folds, (int, np.integer)):
            folds = range(folds)

        train_frames = []
        val_frames = []
        for fold in folds:
            X_train, y_train, X_val, y_val = self.split_x_y(pdf, fold)
            regressor = model
            regressor.fit(X_train, y_train)

            y_train_hat = regressor.predict(X_train)
            y_val_hat = regressor.predict(X_val)

            train_frames.append(pd.DataFrame({'y_train': y_train,
                                              'y_train_hat': y_train_hat,
                                              'kfold': fold}))
            val_frames.append(pd.DataFrame({'y_val': y_val,
                                            'y_val_hat': y_val_hat,
                                            'kfold': fold}))

        if not train_frames:
            # pd.concat refuses an empty list; return empty, typed-by-name
            # frames instead.
            return (pd.DataFrame(columns=['y_train', 'y_train_hat', 'kfold']),
                    pd.DataFrame(columns=['y_val', 'y_val_hat', 'kfold']))

        pdf_tpreds = pd.concat(train_frames, ignore_index=True)
        pdf_vpreds = pd.concat(val_frames, ignore_index=True)
        return pdf_tpreds, pdf_vpreds
            




In [58]:
# Sanity check: load the raw training CSV and show its (rows, columns) shape.
pd_check = pd.read_csv(filepath_or_buffer='/content/train.csv')
pd_check.shape

(6036000, 8)

In [70]:
# Colab-local CSV locations and modelling settings.
train_path = '/content/train.csv'
test_path = '/content/test.csv'
target = 'pressure'
folds = 5

pipe = pipeline(train_path,
                test_path,
                target,
                folds)

# encode_vars returns a (train, test) pair. Keep both halves and feed the
# encoded train frame into the next step; previously the pair was bound to
# `df` and immediately overwritten, so the encoding never reached the model.
pdf_train_enc, pdf_test_enc = pipe.encode_vars(pipe.pdf_train, pipe.pdf_test,
                                               ['u_out'], [])

# Zero-fill every original column of the train frame.
fill_vars = list(pipe.pdf_train.columns)
df = pipe.fill_na(pdf_train_enc,
                  strategy={'mean_cols': [], 'constant_cols': fill_vars})

# Shuffle and add the stratified `kfold` column.
df = pipe.create_random_folds(df)

In [71]:
# split_x_y returns four arrays (train features/target, validation
# features/target); unpacking into two names raises ValueError.
# Fold 0 is held out for validation; X and y are the training part.
X, y, X_val, y_val = pipe.split_x_y(df, 0)

In [72]:
# Fit a shallow, seeded random forest on the training split.
rf_settings = {'max_depth': 2, 'random_state': 0}
regr = RandomForestRegressor(**rf_settings)
regr.fit(X=X, y=y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=2, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

# New Section

In [None]:
# Display the `pdf_train` frame held by `proces`.
# NOTE(review): `proces` is not defined in any visible cell; presumably an
# earlier name for the `pipe` object. Confirm before re-running.
proces.pdf_train

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.000000,0.083334,0.0,5.837492
1,2,1,20,50,0.033652,18.383041,0.0,5.907794
2,3,1,20,50,0.067514,22.509278,0.0,7.876254
3,4,1,20,50,0.101542,22.808822,0.0,11.742872
4,5,1,20,50,0.135756,25.355850,0.0,12.234987
...,...,...,...,...,...,...,...,...
164781,164782,3364,20,50,2.075660,4.757002,1.0,6.821722
164782,164783,3364,20,50,2.109673,4.795004,1.0,6.399909
164783,164784,3364,20,50,2.143661,4.827042,1.0,6.610815
164784,164785,3364,20,50,2.177742,4.854140,1.0,6.681117


In [None]:
# Unpack the two frames returned by `encode_vars`.
# NOTE(review): `proces` is not defined in any visible cell, and the visible
# `pipeline.encode_vars` requires four arguments; this call presumably
# targets an older version of the class. Confirm before re-running.
# NOTE(review): the name `pdb` is also the name of the standard-library debugger module.
pda, pdb = proces.encode_vars()

In [None]:
# Display `pda`, the first of the two frames unpacked from `encode_vars()`
# in the previous cell.
pda

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,breath_id_encoded
0,1,1,20,50,0.000000,0.083334,0,5.837492,1
1,2,1,20,50,0.033652,18.383041,0,5.907794,1
2,3,1,20,50,0.067514,22.509278,0,7.876254,1
3,4,1,20,50,0.101542,22.808822,0,11.742872,1
4,5,1,20,50,0.135756,25.355850,0,12.234987,1
...,...,...,...,...,...,...,...,...,...
4649256,4649257,96897,5,50,1.906103,4.348718,1,5.626585,96897
4649257,4649258,96897,5,50,1.940123,4.450591,1,6.470211,96897
4649258,4649259,96897,5,50,1.974355,4.537021,1,5.696887,96897
4649259,4649260,96897,5,50,2.008377,4.609444,1,5.837492,96897


In [None]:
# Display the `pdf_train` frame held by `proces` again.
# NOTE(review): `proces` is not defined in any visible cell; presumably an
# earlier name for the `pipe` object. Confirm before re-running.
proces.pdf_train

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.000000,0.083334,0.0,5.837492
1,2,1,20,50,0.033652,18.383041,0.0,5.907794
2,3,1,20,50,0.067514,22.509278,0.0,7.876254
3,4,1,20,50,0.101542,22.808822,0.0,11.742872
4,5,1,20,50,0.135756,25.355850,0.0,12.234987
...,...,...,...,...,...,...,...,...
4074492,4074493,84974,5,50,0.408183,15.107524,0.0,8.508973
4074493,4074494,84974,5,50,0.442256,0.000000,0.0,10.477433
4074494,4074495,84974,5,50,0.476140,13.011057,0.0,8.719880
4074495,4074496,84974,5,50,0.510206,0.000000,0.0,10.547735


In [None]:
# Display the `pdf_test` frame held by `processing`.
# NOTE(review): `processing` is not defined in any visible cell (other cells
# use `proces` and `pipe`); presumably a stale name. Confirm before re-running.
processing.pdf_test

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
0,1,0,5,20,0.000000,0.000000,0.0
1,2,0,5,20,0.031904,7.515046,0.0
2,3,0,5,20,0.063827,14.651675,0.0
3,4,0,5,20,0.095751,21.230610,0.0
4,5,0,5,20,0.127644,26.320956,0.0
...,...,...,...,...,...,...,...
1597607,1597608,50082,5,20,0.223402,53.481528,0.0
1597608,1597609,50082,5,20,0.255334,49.995149,0.0
1597609,1597610,50082,5,20,0.287210,46.150856,0.0
1597610,1597611,50082,5,20,0.319105,42.300599,0.0
