In [28]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.metrics import classification_report
import pandas as pd
# Seed NumPy's global (legacy) random generator so that any NumPy-based
# randomness used further down gives the same values on every run.
np.random.seed(0)

## Some made-up dummy data

In [29]:
# Toy product records. Each row is positional:
#   [product name, description, price, distance, category, label]
# `None` marks a missing value (the category of row 2, the price of row 5).
data = [
    [
        "macbook pro",
        "Apple Macbook Pro 13 με Touch Bar MWP52GR/A (2020) Space Grey Laptop (Core i5/16 GB/1 TB/Iris Plus Graphics). 2.749,00",
        2749,
        100,
        1,
        0,
    ],
    [
        "Apple iPhone 11 (64GB) Black",
        "Διπλή κάμερα 12MP προσφέρουν πληθώρα δυνατοτήτων άκομα και σε συνθήκες χαμηλού φωτισμού, ενώ η selfie 12MP υποστήριζει βίντεο 4Κ και slow motion.",
        1000,
        123,
        None,
        1,
    ],
    [
        "Huawei P40 Lite (128GB) Midnight Black",
        "Οθόνη: IPS 6.4, RAM: 6GB, Κάμερα: 48MP + 8MP + 2MP, Μπαταρία: 4200mAh",
        193,
        45,
        1,
        0,
    ],
    [
        "Huawei P40 Lite (128GB) Crush Green",
        "Χωρίς Google Mobile Services. SuperCharge 40W για επαναφόρτιση της μπαταρίας μέχρι το 70% σε 30 μόλις λεπτά αλλά και τετραπλή κάμερα ΑΙ 48M",
        200,
        23,
        3,
        1,
    ],
    [
        "Samsung Galaxy A71 (128GB) Prism Crush Black Samsung Galaxy A71 (128GB) Prism Crush Black",
        " Τετραπλή κάμερα 64MP με ευρυγώνιο, macro και βάθους φακό. Game Booster για mobile gaming. Βίντεο 4K με λειτουργίες και φίλτρα.",
        None,
        250,
        1,
        1,
    ],
]

In [30]:
# Column names, in the same order as the positional fields of each row in `data`.
column_names = ["product_name", "descr", "price", "distance", "cat", "label"]

# Build the frame from the row lists and preview it.
df = pd.DataFrame(data, columns=column_names)
df.head()

Unnamed: 0,product_name,descr,price,distance,cat,label
0,macbook pro,Apple Macbook Pro 13 με Touch Bar MWP52GR/A (2...,2749.0,100,1.0,0
1,Apple iPhone 11 (64GB) Black,Διπλή κάμερα 12MP προσφέρουν πληθώρα δυνατοτήτ...,1000.0,123,,1
2,Huawei P40 Lite (128GB) Midnight Black,"Οθόνη: IPS 6.4, RAM: 6GB, Κάμερα: 48MP + 8MP +...",193.0,45,1.0,0
3,Huawei P40 Lite (128GB) Crush Green,Χωρίς Google Mobile Services. SuperCharge 40W ...,200.0,23,3.0,1
4,Samsung Galaxy A71 (128GB) Prism Crush Black S...,"Τετραπλή κάμερα 64MP με ευρυγώνιο, macro και ...",,250,1.0,1


In [31]:
# Split the target column off the feature frame.
#
# The guard makes this cell safe to re-run: once "label" has been removed,
# `df.label` would raise AttributeError, so the split is only performed while
# the column is still present. `drop` returns a new frame instead of mutating
# in place, and `columns=` alone is sufficient (no `axis` needed).
if "label" in df.columns:
    labels = df["label"].copy()
    df = df.drop(columns=["label"])

df.head()

Unnamed: 0,product_name,descr,price,distance,cat
0,macbook pro,Apple Macbook Pro 13 με Touch Bar MWP52GR/A (2...,2749.0,100,1.0
1,Apple iPhone 11 (64GB) Black,Διπλή κάμερα 12MP προσφέρουν πληθώρα δυνατοτήτ...,1000.0,123,
2,Huawei P40 Lite (128GB) Midnight Black,"Οθόνη: IPS 6.4, RAM: 6GB, Κάμερα: 48MP + 8MP +...",193.0,45,1.0
3,Huawei P40 Lite (128GB) Crush Green,Χωρίς Google Mobile Services. SuperCharge 40W ...,200.0,23,3.0
4,Samsung Galaxy A71 (128GB) Prism Crush Black S...,"Τετραπλή κάμερα 64MP με ευρυγώνιο, macro και ...",,250,1.0


In [19]:
# --- Numeric columns -------------------------------------------------------
# Missing values are replaced by the column median, then the column is
# standardised.
numeric_features = ['price', 'distance']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# --- Categorical column ----------------------------------------------------
# Missing values are replaced by the constant -1 and the result is one-hot
# encoded; categories not seen during fit are ignored at transform time.
categorical_features = ['cat']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# --- Text columns ----------------------------------------------------------
# One vectorizer per free-text column: raw counts for the product name,
# TF-IDF weights for the description.
text_transformer_1 = Pipeline([('pcountvectorizer', CountVectorizer())])
text_transformer_2 = Pipeline([('desctfidf', TfidfVectorizer())])

# --- Combined preprocessor -------------------------------------------------
# The text columns are selected with a bare string (not a list) so that each
# vectorizer receives a 1-D column of strings rather than a 2-D frame.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('textvec1', text_transformer_1, "product_name"),
    ('textvec2', text_transformer_2, "descr"),
])


In [20]:
## this is the feature transformation pipeline

In [21]:
# Fit every sub-pipeline of the preprocessor on `df` and keep the combined
# transformed feature matrix in `ret`.
ret = preprocessor.fit_transform(df)
# Display the fitted sub-transformers, keyed by the names given to the
# ColumnTransformer ('num', 'cat', 'textvec1', 'textvec2').
preprocessor.named_transformers_

{'num': Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                 ('scaler', StandardScaler())]),
 'cat': Pipeline(steps=[('imputer', SimpleImputer(fill_value=-1, strategy='constant')),
                 ('onehot', OneHotEncoder(handle_unknown='ignore'))]),
 'textvec1': Pipeline(steps=[('pcountvectorizer', CountVectorizer())]),
 'textvec2': Pipeline(steps=[('desctfidf', TfidfVectorizer())])}

In [22]:
# NOTE(review): this repeats the fit_transform call from the previous cell;
# the refit looks redundant — confirm before removing.
ret = preprocessor.fit_transform(df)
# Densify the transformed features so the full matrix can be inspected.
# NOTE(review): for SciPy sparse matrices `todense()` presumably yields a
# `numpy.matrix`; `toarray()` would give a plain ndarray — confirm what
# downstream code expects.
mat = ret.todense()
# (rows, total number of generated feature columns)
mat.shape

(5, 104)

In [24]:
# combine the feature engineering pipeline with a classifier

In [25]:
# Chain the feature preprocessor and a logistic-regression classifier into a
# single estimator, so raw frames can be passed straight to fit/predict.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

clf.fit(df, labels)

# The score is computed on the very rows the model was fitted on, so it is a
# training-set accuracy, not an estimate of generalisation.
train_accuracy = clf.score(df, labels)
print("model score: %.3f" % train_accuracy)

model score: 1.000


In [26]:
# Predict on the same frame the model was fitted on (training rows).
y_pred = clf.predict(df)

# classification_report takes (y_true, y_pred) in that order: ground truth
# first, predictions second. Passing them the other way round swaps
# precision with recall and derives "support" from the predictions instead
# of the true labels.
report = classification_report(labels, y_pred)
print('Classification report:\n\n{}'.format(report))

Classification report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         3

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5

