In [217]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [218]:
data = {'Genre':['Rock', 'Metal', 'Bluegrass', 'Rock', np.nan, 'Rock', 'Rock', np.nan, 'Bluegrass', 'Rock'],
'Social_media_followers':[1000000, np.nan, 2000000, 1310000, 1700000, np.nan, 4100000, 1600000, 2200000, 1000000],
'Sold_out':[1,0,0,1,0,0,0,1,0,1]}

In [219]:
df = pd.DataFrame(data)
df

Unnamed: 0,Genre,Social_media_followers,Sold_out
0,Rock,1000000.0,1
1,Metal,,0
2,Bluegrass,2000000.0,0
3,Rock,1310000.0,1
4,,1700000.0,0
5,Rock,,0
6,Rock,4100000.0,0
7,,1600000.0,1
8,Bluegrass,2200000.0,0
9,Rock,1000000.0,1


In [220]:
X = df.drop(['Sold_out'], axis=1)
y = df[['Sold_out']]

In [221]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [222]:
num_cols = ['Social_media_followers']

In [223]:
cat_cols = ['Genre']

In [224]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale',StandardScaler())
])

In [225]:
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot',OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [226]:
col_trans = ColumnTransformer(transformers=[
('num_pipeline',num_pipeline,num_cols),
('cat_pipeline',cat_pipeline,cat_cols)
],
remainder='drop',
n_jobs=-1)

In [227]:
dtc = DecisionTreeClassifier()

In [228]:
pipefinal = make_pipeline(col_trans, dtc)

In [229]:
pipefinal.fit(X_train, y_train)

In [230]:
pipefinal.score(X_test, y_test)

1.0

In [231]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [234]:
print(confusion_matrix(y_test, y_pred, labels=[0,1]))

[[2 0]
 [0 0]]


In [235]:
#create pipeline file
import joblib
joblib.dump(pipefinal, 'pipe.joblib')

['pipe.joblib']

In [236]:
#call the pipeline
pipefinal2 = joblib.load('pipe.joblib')