# Pipelining and Base Model

In [1]:
import numpy as np
import pandas as pd
import help_functions as hf

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fixed seed so the stratified split (and every score computed from it) is
# reproducible under Restart Kernel -> Run All.
RANDOM_STATE = 42

df = pd.read_csv('RTA Dataset.csv')

# The last column is used as the target; every other column is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# 70/30 split, stratified on the target so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=RANDOM_STATE
)

# Learn the label -> integer mapping on the training targets only, then apply
# the same mapping to the test targets.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

In [5]:
# Wrap the project helper functions so they can be used as pipeline steps.
preprocess = FunctionTransformer(func=hf.preprocess)
label_encode = FunctionTransformer(func=hf.label_encoder)

# Dense one-hot output.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# switch the keyword when the environment is upgraded.
ohe = OneHotEncoder(sparse=False)

# Missing values are filled with KNNImputer's default settings.
knn_imputer = KNNImputer()

# Two alternative reducers, each keeping 10 features / components.
pca = PCA(n_components=10)
k_best = SelectKBest(k=10)

# Baseline classifier: multinomial logistic regression with a raised
# iteration cap.
base_model = LogisticRegression(max_iter=1000, multi_class='multinomial')

In [32]:
# Build the column-wise encoder from a fresh OneHotEncoder instead of from the
# current value of `ohe`. Wrapping `ohe` in itself made this cell
# non-idempotent: running it a second time nested a ColumnTransformer inside
# another ColumnTransformer.
# NOTE(review): make_column_transformer defaults to remainder='drop', so only
# the columns listed in hf.cat_col_without_order reach the later steps --
# confirm that dropping every other column is intended.
ohe = make_column_transformer(
    (OneHotEncoder(sparse=False), hf.cat_col_without_order)
)

In [14]:
from sklearn.base import clone

# Each pipeline gets its own unfitted copy of every step. Pipeline.fit fits
# its steps in place, so if both pipelines held the very same estimator
# objects, fitting one would silently overwrite the fitted state of the other.
# Fitted steps are reachable through `<pipeline>.named_steps[...]`.
# Step names are kept exactly as they were (note 'label_encode' vs
# 'label_encoder') because parameter paths elsewhere may rely on them.

# Variant 1: univariate feature selection before the classifier.
main_pipe_k_best = Pipeline([
    ('preprocess', clone(preprocess)),
    ('label_encode', clone(label_encode)),
    ('ohe', clone(ohe)),
    ('impute', clone(knn_imputer)),
    ('k_best', clone(k_best)),
    ('model', clone(base_model)),
])

# Variant 2: PCA projection before the classifier.
main_pipe_pca = Pipeline([
    ('preprocess', clone(preprocess)),
    ('label_encoder', clone(label_encode)),
    ('ohe', clone(ohe)),
    ('impute', clone(knn_imputer)),
    ('pca', clone(pca)),
    ('model', clone(base_model)),
])

In [15]:
from sklearn import set_config
# Render estimators as an interactive diagram instead of their text repr.
set_config(display='diagram')

# Last expression of the cell: displays the SelectKBest pipeline.
main_pipe_k_best

In [16]:
# Fit every step of the SelectKBest pipeline on the training split.
main_pipe_k_best.fit(X_train, y_train)

In [17]:
# Predict the (integer-encoded) target for the held-out split.
y_pred = main_pipe_k_best.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# Accuracy itself is symmetric, but keeping the documented order avoids wrong
# results if this call is later swapped for an asymmetric metric.
accuracy_score(y_test, y_pred)

0.8451962110960758

In [20]:
# Fit every step of the PCA pipeline on the same training split.
main_pipe_pca.fit(X_train, y_train)

In [22]:
# Predict the (integer-encoded) target for the held-out split; this rebinds
# `y_pred`, replacing the value assigned from the SelectKBest pipeline.
y_pred = main_pipe_pca.predict(X_test)

In [23]:
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
accuracy_score(y_test, y_pred)

0.8457374830852503