# **Machine Learning Pipeline**

In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeClassifier

import joblib


In [97]:
d1 = {
    'Social_media_followers' : [1000000, np.nan, 2000000, 1310000, 1700000, np.nan, 4100000, 1600000, 2200000, 1000000],
    'Sold_out': [1,0,0,1,0,0,0,1,0,1]
}

d2 = {
        'Genre':['Rock', 'Metal', 'Bluegrass', 'Rock', np.nan, 'Rock', 'Rock', np.nan, 'Bluegrass', 'Rock'],
        'Social_media_followers':[1000000, np.nan, 2000000, 1310000, 1700000, np.nan, 4100000, 1600000, 2200000, 1000000],
        'Sold_out':[1,0,0,1,0,0,0,1,0,1]
    }

df1 = pd.DataFrame(d1)
df1

Unnamed: 0,Social_media_followers,Sold_out
0,1000000.0,1
1,,0
2,2000000.0,0
3,1310000.0,1
4,1700000.0,0
5,,0
6,4100000.0,0
7,1600000.0,1
8,2200000.0,0
9,1000000.0,1


In [98]:
X1 = df1[['Social_media_followers']]
y1 = df1[['Sold_out']]

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=19)

imputer = SimpleImputer(strategy='mean')
lr = LogisticRegression()

pipe1 = make_pipeline(imputer, lr)

pipe1.fit(X1_train, y1_train)

  y = column_or_1d(y, warn=True)


In [99]:
pipe1.score(X1_train, y1_train)

1.0

In [100]:
pipe1.score(X1_test, y1_test)

0.6666666666666666

In [101]:
pipe1.named_steps.simpleimputer.statistics_

array([2051666.66666667])

In [102]:
pipe1.named_steps.logisticregression.coef_

array([[-9.72872687e-05]])

### More Advance Pipeline

In [103]:
df = pd.DataFrame(data=d2)
df

Unnamed: 0,Genre,Social_media_followers,Sold_out
0,Rock,1000000.0,1
1,Metal,,0
2,Bluegrass,2000000.0,0
3,Rock,1310000.0,1
4,,1700000.0,0
5,Rock,,0
6,Rock,4100000.0,0
7,,1600000.0,1
8,Bluegrass,2200000.0,0
9,Rock,1000000.0,1


In [104]:
X = df.iloc[:,0:2]
y = df.iloc[:,2]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=17)

num_cols = ["Social_media_followers"]
cat_cols = ['Genre']

num_pipeline = Pipeline(
    steps = [
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler())
    ]
)

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
cat_pipeline

In [105]:
col_transformer = ColumnTransformer(transformers= [
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols),
],
remainder='drop', n_jobs=-1
)


In [106]:
dtc = DecisionTreeClassifier()
pipefinal = make_pipeline(col_transformer, dtc)
pipefinal.fit(X_train, y_train)

In [107]:
pipefinal.score(X_test, y_test)

0.6666666666666666

## How to save your pipeline

In [108]:
joblib.dump(pipefinal, 'pipe.joblib')

['pipe.joblib']

In [109]:
pipefinal2 = joblib.load('pipe.joblib')
pipefinal2