In [172]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [200]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, KernelPCA, SparsePCA
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from itertools import combinations
import ast


In [98]:
# Read the modelling dataset from the CSV file next to the notebook.
pipe_dataset = pd.read_csv(filepath_or_buffer='14_16_pre_Encoder.csv')
# (disabled experiment) pca_test_data = pca_kernel.fit_transform(pipe_dataset)

In [99]:
# Categorical feature columns.
cat_col = [
    'Gender',
    'family_history',
    'no_employees',
    'remote_work',
    'benefits',
    'care_options',
    'wellness_program',
    'anonymity',
    'leave',
    'phys_health_consequence',
    'coworkers',
    'supervisor',
    'phys_health_interview',
    'mental_vs_physical',
    'work_treatment_interfere',
    'work_no_treatment_interfere',
    'Continent',
]

# Numeric feature columns.
num_col = ['Age']

# Name of the label column.
target = 'treatment'

In [100]:
# Every column except the label is offered as a feature.
used_cols = [name for name in pipe_dataset.columns if name != target]
pl_X = pipe_dataset[used_cols]
pl_y = pipe_dataset[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
pl_X_train, pl_X_test, pl_y_train, pl_y_test = train_test_split(
    pl_X,
    pl_y,
    test_size=0.2,
    random_state=42,
)

In [101]:
# Standardise the numeric column(s).
pl_scaler = StandardScaler()
# One-hot encode the categorical columns. handle_unknown='ignore' encodes a
# category that was absent from the fitting data as an all-zero block instead
# of raising ValueError at predict time (possible after a random train/test
# split or inside cross-validation folds). Output is unchanged whenever no
# unseen category occurs.
pl_encoder = OneHotEncoder(handle_unknown='ignore')
# Wrap each transformer in its own single-step pipeline.
num_transformer = make_pipeline(pl_scaler)
cat_transformer = make_pipeline(pl_encoder)
# Route numeric columns to the scaler and categorical columns to the encoder.
preprocessor = ColumnTransformer(
      transformers=[('num', num_transformer, num_col),
                    ('cat', cat_transformer, cat_col)])


# No PCA

## SVC

In [102]:
# SVC with a degree-2 polynomial kernel and balanced class weights.
model_name_svc = SVC(
    C=0.05,
    class_weight='balanced',
    degree=2,
    kernel='poly',
    probability=True,
    random_state=42,
    verbose=True,
)

# Chain the preprocessing and the classifier, then train on the training split.
pipe_svc = make_pipeline(preprocessor, model_name_svc)
pipe_svc.fit(pl_X_train, pl_y_train)

[LibSVM]

In [103]:
# Predict labels for the training split, then for the held-out test split.
pl_y_pred_svc, pl_y_pred_test_svc = (
    pipe_svc.predict(split) for split in (pl_X_train, pl_X_test)
)

## Random Forest

In [104]:
# Shallow random forest: depth-2 trees, each grown on a 25% sample of the rows.
model_name_rf = RandomForestClassifier(
    max_depth=2,
    max_features='log2',
    max_leaf_nodes=9,
    max_samples=0.25,
    n_estimators=75,
    random_state=42,
)

# Chain the preprocessing and the classifier, then train on the training split.
pipe_rf = make_pipeline(preprocessor, model_name_rf)
pipe_rf.fit(pl_X_train, pl_y_train)

In [105]:
# Predict labels for the training split, then for the held-out test split.
pl_y_pred_rf, pl_y_pred_test_rf = (
    pipe_rf.predict(split) for split in (pl_X_train, pl_X_test)
)

In [106]:
# Recall, F1 and accuracy on both splits for each model. The keys become the
# column headers of the comparison table, so they are kept exactly as they are.
_rows = []
for _train_pred, _test_pred in ((pl_y_pred_rf, pl_y_pred_test_rf),
                                (pl_y_pred_svc, pl_y_pred_test_svc)):
    _rows.append({
        'Recall TRAIN:': metrics.recall_score(pl_y_train, _train_pred),
        'Recall TEST:': metrics.recall_score(pl_y_test, _test_pred),
        'F1 score TRAIN': metrics.f1_score(pl_y_train, _train_pred),
        'F1 score TEST': metrics.f1_score(pl_y_test, _test_pred),
        'Accuracy TRAIN': accuracy_score(pl_y_train, _train_pred),
        'Accurasy TEST': accuracy_score(pl_y_test, _test_pred),
    })
# First row: random forest, second row: SVC.
d_rf, d_svc = _rows

In [107]:
# One single-row frame per model, stacked into a comparison table.
pd.concat([
    pd.DataFrame(row, columns=row.keys(), index=[label])
    for label, row in (('Random Forest', d_rf), ('SVC', d_svc))
])

Unnamed: 0,Recall TRAIN:,Recall TEST:,F1 score TRAIN,F1 score TEST,Accuracy TRAIN,Accurasy TEST
Random Forest,0.929432,0.9701,0.849057,0.878195,0.820477,0.848598
SVC,0.919966,0.943522,0.860709,0.873846,0.838242,0.846729


# With PCA

In [109]:
# SVC with a degree-2 polynomial kernel and balanced class weights.
model_name_svc = SVC(
    C=0.05,
    class_weight='balanced',
    degree=2,
    kernel='poly',
    probability=True,
    random_state=42,
    verbose=True,
)

# Named steps, so a parameter grid can address the PCA stage as 'pca__<param>'.
pipe_steps = [
    ('scaler', preprocessor),
    ('pca', KernelPCA()),
    ('svm', model_name_svc),
]
# Candidate component counts: 1 through 5.
check_param = {'pca__n_components': list(range(1, 6))}
pipeline_pca_test = Pipeline(pipe_steps)

# Grid search is left disabled:
# create_grid = GridSearchCV(pipeline_pca_test, param_grid = check_param, cv = 5)
# create_grid.fit(pl_X_train, pl_y_train)

In [110]:
# create_grid.best_params_

## SVC with PCA

In [111]:
# SVC trained on 5 KernelPCA components (KernelPCA left at its default kernel).
pipe_svc = make_pipeline(
    preprocessor,
    KernelPCA(n_components=5),
    model_name_svc,
)
pipe_svc.fit(pl_X_train, pl_y_train)

[LibSVM]

In [112]:
# Predict labels for the training split, then for the held-out test split.
pl_y_pred_svc, pl_y_pred_test_svc = (
    pipe_svc.predict(split) for split in (pl_X_train, pl_X_test)
)

## Random Forest with PCA 

In [113]:
# Random forest trained on 5 KernelPCA components.
pipe_rf = make_pipeline(
    preprocessor,
    KernelPCA(n_components=5),
    model_name_rf,
)
pipe_rf.fit(pl_X_train, pl_y_train)

# Predict labels for the training split, then for the held-out test split.
pl_y_pred_rf, pl_y_pred_test_rf = (
    pipe_rf.predict(split) for split in (pl_X_train, pl_X_test)
)

In [114]:
# Recall, F1 and accuracy on both splits for each model. The keys become the
# column headers of the comparison table, so they are kept exactly as they are.
_rows = []
for _train_pred, _test_pred in ((pl_y_pred_rf, pl_y_pred_test_rf),
                                (pl_y_pred_svc, pl_y_pred_test_svc)):
    _rows.append({
        'Recall TRAIN:': metrics.recall_score(pl_y_train, _train_pred),
        'Recall TEST:': metrics.recall_score(pl_y_test, _test_pred),
        'F1 score TRAIN': metrics.f1_score(pl_y_train, _train_pred),
        'F1 score TEST': metrics.f1_score(pl_y_test, _test_pred),
        'Accuracy TRAIN': accuracy_score(pl_y_train, _train_pred),
        'Accurasy TEST': accuracy_score(pl_y_test, _test_pred),
    })
# First row: random forest, second row: SVC.
d_rf, d_svc = _rows

In [115]:
# One single-row frame per model, stacked into a comparison table.
pd.concat([
    pd.DataFrame(row, columns=row.keys(), index=[label])
    for label, row in (('Random Forest', d_rf), ('SVC', d_svc))
])

Unnamed: 0,Recall TRAIN:,Recall TEST:,F1 score TRAIN,F1 score TEST,Accuracy TRAIN,Accurasy TEST
Random Forest,0.835628,0.82392,0.804141,0.798712,0.778869,0.766355
SVC,0.349398,0.325581,0.457207,0.442438,0.549322,0.538318


# With PCA kernel = 'poly', degree = 2

In [116]:
# Check the width of the encoded feature matrix. A clone is fitted so that the
# shared `preprocessor` instance (it is also a step of the pipelines) is not
# refitted on the full dataset, test rows included.
clone(preprocessor).fit_transform(pipe_dataset).shape

(2674, 68)

In [117]:
# Same three named steps, now with a degree-2 polynomial KernelPCA.
pipe_steps = [
    ('scaler', preprocessor),
    ('pca', KernelPCA(kernel="poly", degree=2)),
    ('svm', model_name_svc),
]
# Candidate component counts: 1 through 67.
check_param = {'pca__n_components': list(range(1, 68))}
pipeline_pca_test = Pipeline(pipe_steps)


In [118]:
# create_grid = GridSearchCV(pipeline_pca_test, param_grid = check_param, cv = 5)
# create_grid.fit(pl_X_train, pl_y_train)


In [119]:
# create_grid.best_params_

## SVC with PCA Kernel

In [120]:
# Linear-kernel SVC; the polynomial mapping is done by the KernelPCA step here.
# NOTE(review): `degree` is still passed although the kernel is 'linear'.
model_name_svc = SVC(
    C=0.05,
    class_weight='balanced',
    degree=2,
    kernel='linear',
    probability=True,
    random_state=42,
    verbose=True,
)

# Preprocess, project onto 44 polynomial kernel-PCA components, then classify.
pipe_svc = make_pipeline(
    preprocessor,
    KernelPCA(kernel='poly', degree=2, n_components=44),
    model_name_svc,
)
pipe_svc.fit(pl_X_train, pl_y_train)

[LibSVM]

In [121]:
# Predict labels for the training split, then for the held-out test split.
pl_y_pred_svc, pl_y_pred_test_svc = (
    pipe_svc.predict(split) for split in (pl_X_train, pl_X_test)
)

## Random Forest with PCA Kernel

In [122]:
# `pca_kernel` is not assigned in any visible cell of this notebook (it only
# appears in a commented-out line), so the original cell raises NameError on a
# fresh kernel. The PCA step is therefore built explicitly here.
# NOTE(review): the settings are copied from the SVC pipeline of this section;
# confirm they match what `pca_kernel` originally was.
pipe_rf = make_pipeline(
    preprocessor,
    KernelPCA(kernel='poly', degree=2, n_components=44),
    model_name_rf,
)
pipe_rf.fit(pl_X_train, pl_y_train)

In [123]:
# Predict labels for the training split, then for the held-out test split.
pl_y_pred_rf, pl_y_pred_test_rf = (
    pipe_rf.predict(split) for split in (pl_X_train, pl_X_test)
)

In [124]:
# Recall, F1 and accuracy on both splits for each model. The keys become the
# column headers of the comparison table, so they are kept exactly as they are.
_rows = []
for _train_pred, _test_pred in ((pl_y_pred_rf, pl_y_pred_test_rf),
                                (pl_y_pred_svc, pl_y_pred_test_svc)):
    _rows.append({
        'Recall TRAIN:': metrics.recall_score(pl_y_train, _train_pred),
        'Recall TEST:': metrics.recall_score(pl_y_test, _test_pred),
        'F1 score TRAIN': metrics.f1_score(pl_y_train, _train_pred),
        'F1 score TEST': metrics.f1_score(pl_y_test, _test_pred),
        'Accuracy TRAIN': accuracy_score(pl_y_train, _train_pred),
        'Accurasy TEST': accuracy_score(pl_y_test, _test_pred),
    })
# First row: random forest, second row: SVC.
d_rf, d_svc = _rows

In [125]:
# One single-row frame per model, stacked into a comparison table.
pd.concat([
    pd.DataFrame(row, columns=row.keys(), index=[label])
    for label, row in (('Random Forest', d_rf), ('SVC', d_svc))
])

Unnamed: 0,Recall TRAIN:,Recall TEST:,F1 score TRAIN,F1 score TEST,Accuracy TRAIN,Accurasy TEST
Random Forest,0.8821,0.890365,0.826613,0.833593,0.798971,0.8
SVC,0.913941,0.946844,0.852327,0.872894,0.827957,0.84486


## Partial fit

skmultiflow


In [126]:
pip install -U Cython


Collecting Cython
  Downloading Cython-3.0.5-cp39-cp39-win_amd64.whl (2.8 MB)
                                              0.0/2.8 MB ? eta -:--:--
     --------                                 0.6/2.8 MB 19.8 MB/s eta 0:00:01
     --------------------------               1.8/2.8 MB 23.1 MB/s eta 0:00:01
     ---------------------------------------  2.8/2.8 MB 25.4 MB/s eta 0:00:01
     ---------------------------------------- 2.8/2.8 MB 19.8 MB/s eta 0:00:00
Installing collected packages: Cython
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.35
    Uninstalling Cython-0.29.35:
      Successfully uninstalled Cython-0.29.35
Successfully installed Cython-3.0.5


In [135]:
import random

# Imported under an alias so it does not rebind the bare name `Pipeline`, which
# other cells of this notebook use for sklearn.pipeline.Pipeline.
from skmultiflow.core import Pipeline as StreamPipeline

In [131]:
# Synthetic 5-feature classification data (flip_y=0.05, fixed seed).
X, y = make_classification(
    n_features=5,
    flip_y=0.05,
    random_state=42,
)

In [161]:
# Earlier attempt, kept for reference:
# random.sample(range(18, 71),1) for i in range(100)
# Ten random integers between 1 and 99, both ends included.
list(random.randint(1, 99) for _ in range(10))

[48, 10, 79, 88, 32, 13, 30, 78, 53, 60]

In [166]:
# Column name -> values for a synthetic 100-row frame. Deliberately NOT named
# `dict`: that name would shadow the builtin `dict` type for the rest of the
# session.
synthetic_columns = {
    'Age': [random.randint(18, 71) for _ in range(100)],
    'cat1': [random.randint(1, 5) for _ in range(100)],
    'cat2': [random.randint(1, 3) for _ in range(100)],
}
# Column order follows the insertion order of the mapping.
X = pd.DataFrame(synthetic_columns)
# Random binary labels, one per row.
y = [random.randint(0, 1) for _ in range(100)]

In [199]:
# Independent (deep) copy of the synthetic feature frame.
X_new = X.copy(deep=True)

In [228]:
# Scaler for the numeric column, one-hot encoder for the categorical columns.
scaler = StandardScaler()
encoder = OneHotEncoder()

# Each transformer is wrapped in its own single-step pipeline.
num_transformer_test = make_pipeline(scaler)
cat_transformer_test = make_pipeline(encoder)

# Route 'Age' to the scaler and 'cat1'/'cat2' to the encoder.
preprocessor_test = ColumnTransformer(
    transformers=[
        ('num1', num_transformer_test, ['Age']),
        ('cat1', cat_transformer_test, ['cat1', 'cat2']),
    ]
)
# Numeric-only alternative, kept disabled:
# preprocessor_test = ColumnTransformer(transformers=[('num1', num_transformer_test, ['Age'])])


In [229]:
est = SGDClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Both pipelines below are built from the SAME `preprocessor_test` and `est`
# objects, so fitting one of them also fits the steps seen by the other.
pipe_test = make_pipeline(preprocessor_test, est)
# Explicit step names so the steps can be reached through
# named_steps['preproc'] and named_steps['est'].
pipe_test2 = Pipeline([('preproc', preprocessor_test), ('est', est)])
# TODO: unfinished statement. The unterminated call made the whole cell a
# SyntaxError, so it is kept only as a comment until it is completed.
# pipe_test3 = make_pipeline(

In [230]:
# Fit the named-step pipeline on the training split.
pipe_test2.fit(X_train, y_train)

In [231]:
# Score the pipeline that was actually fitted. `pipe_test` was never fitted
# directly; predicting with it only worked because it shares its step objects
# with `pipe_test2`.
y_pred = pipe_test2.predict(X_test)
accuracy_score(y_test, y_pred)

0.68

In [232]:
def partial_pipe_fit(pipeline_obj, x, y, classes=None):
    """Incrementally train the final estimator of a pipeline on one batch.

    The batch is passed through the ``'preproc'`` step and the transformed
    features are handed to ``partial_fit`` of the ``'est'`` step.

    A preprocessor that has already been fitted is only *applied*
    (``transform``); it is fitted (``fit_transform``) only when it carries no
    learned state yet. Re-fitting it on every batch would re-learn scaling
    statistics and one-hot categories per batch, so the columns fed to the
    estimator would no longer mean the same thing from one call to the next.

    Parameters
    ----------
    pipeline_obj
        Object exposing ``named_steps`` with the keys ``'preproc'`` and
        ``'est'``.
    x
        Feature batch accepted by the preprocessor.
    y
        Targets aligned with ``x``.
    classes : optional
        Forwarded to ``partial_fit`` as ``classes=...`` when given. Leave it
        as ``None`` for estimators that were already fitted or whose
        ``partial_fit`` has no such argument.

    Returns
    -------
    None
        The estimator inside ``pipeline_obj`` is updated in place.
    """
    preproc = pipeline_obj.named_steps['preproc']
    est = pipeline_obj.named_steps['est']

    # scikit-learn convention (the same test check_is_fitted applies): learned
    # state is stored in instance attributes whose name ends with "_".
    already_fitted = any(
        attr.endswith('_') and not attr.startswith('__')
        for attr in getattr(preproc, '__dict__', {})
    )
    if already_fitted:
        features = preproc.transform(x)
    else:
        features = preproc.fit_transform(x)

    # Only pass `classes` when supplied, so estimators whose partial_fit does
    # not accept it keep working.
    if classes is None:
        est.partial_fit(features, y)
    else:
        est.partial_fit(features, y, classes=classes)

In [247]:
# NOTE: `X2` is created in a cell that appears further down in this notebook;
# that cell has to be run before this one.
partial_pipe_fit(pipe_test2,X2,y)

In [248]:
# Show the coefficient array of the pipeline's 'est' step.
pipe_test2.named_steps['est'].coef_

array([[-1.44373513e+01, -5.13083633e+00,  5.13083633e+00,
         5.13083633e+00,  1.60095413e-14, -8.42607439e-16,
         1.53925090e+01, -1.02616727e+01,  1.68521488e-14]])

In [242]:
# Column name -> values for a second synthetic 100-row batch. Deliberately NOT
# named `dict`: that name would shadow the builtin `dict` type for the rest of
# the session.
synthetic_columns_2 = {
    'Age': [random.randint(18, 71) for _ in range(100)],
    'cat1': [random.randint(1, 5) for _ in range(100)],
    'cat2': [random.randint(1, 3) for _ in range(100)],
}
# Column order follows the insertion order of the mapping.
X2 = pd.DataFrame(synthetic_columns_2)

In [250]:
# Imports
from skmultiflow.lazy import KNNADWINClassifier
from skmultiflow.core import Pipeline
from skmultiflow.data import FileStream
from skmultiflow.evaluation import EvaluatePrequential
from skmultiflow.transform import OneHotToCategorical
# Setting up the stream
stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/"
                    "streaming-datasets/master/covtype.csv")
transform = OneHotToCategorical([[10, 11, 12, 13],
[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])
# Setting up the classifier
classifier = KNNADWINClassifier(n_neighbors=8, max_window_size=2000, leaf_size=40)
# Setup the pipeline
pipe = Pipeline([('transform', transform), ('passive_aggressive', classifier)])
# Setup the evaluator
evaluator = EvaluatePrequential(show_plot=False, pretrain_size=1000, max_samples=500000)
# Evaluate
evaluator.evaluate(stream=stream, model=pipe)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 1000 sample(s).
Evaluating...
 #------------------- [5%] [344.66s]
Processed samples: 43192
Mean performance:
M0 - Accuracy     : 0.8735
M0 - Kappa        : 0.8058


[Pipeline(steps=[('transform',
                  OneHotToCategorical(categorical_list=[[10, 11, 12, 13],
                                                        [14, 15, 16, 17, 18, 19,
                                                         20, 21, 22, 23, 24, 25,
                                                         26, 27, 28, 29, 30, 31,
                                                         32, 33, 34, 35, 36, 37,
                                                         38, 39, 40, 41, 42, 43, ...]])),
                 ('passive_aggressive',
                  KNNADWINClassifier(leaf_size=40, max_window_size=2000,
                                     metric='euclidean', n_neighbors=8))])]