<a href="https://colab.research.google.com/github/jkostic986-ui/product_classification/blob/main/product_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Automatska klasifikacija proizvoda po kategorijama

In [1]:
# Učitavamoo potrebne biblioteke
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [2]:
from google.colab import files
uploaded = files.upload()


Saving products.csv to products (1).csv


## 1. Analiza i priprema podataka (u notebooku)

- Učitavanje podataka
- Proveravamo nazive kolona i uklanjamo nepotrebne razmake
df = pd.read_csv("products.csv")
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df.head()


In [3]:
import pandas as pd

df = pd.read_csv("products.csv")
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df.head()


Unnamed: 0,product_id,product_title,merchant_id,category_label,_product_code,number_of_views,merchant_rating,listing_date
0,1,apple iphone 8 plus 64gb silver,1,Mobile Phones,QA-2276-XC,860.0,2.5,5/10/2024
1,2,apple iphone 8 plus 64 gb spacegrau,2,Mobile Phones,KA-2501-QO,3772.0,4.8,12/31/2024
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,Mobile Phones,FP-8086-IE,3092.0,3.9,11/10/2024
3,4,apple iphone 8 plus 64gb space grey,4,Mobile Phones,YI-0086-US,466.0,3.4,5/2/2022
4,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,Mobile Phones,NZ-3586-WP,4426.0,1.6,4/12/2023


## 2. Čišćenje podataka

- Uklanjamo redove koji nemaju naziv proizvoda ili kategoriju
- Kreiramo novu kolonu sa "očišćenim" naslovima (sva mala slova)

In [4]:
df = df.dropna(subset=["product_title", "category_label"])

df["title_clean"] = df["product_title"].str.lower()
df.head()


Unnamed: 0,product_id,product_title,merchant_id,category_label,_product_code,number_of_views,merchant_rating,listing_date,title_clean
0,1,apple iphone 8 plus 64gb silver,1,Mobile Phones,QA-2276-XC,860.0,2.5,5/10/2024,apple iphone 8 plus 64gb silver
1,2,apple iphone 8 plus 64 gb spacegrau,2,Mobile Phones,KA-2501-QO,3772.0,4.8,12/31/2024,apple iphone 8 plus 64 gb spacegrau
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,Mobile Phones,FP-8086-IE,3092.0,3.9,11/10/2024,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...
3,4,apple iphone 8 plus 64gb space grey,4,Mobile Phones,YI-0086-US,466.0,3.4,5/2/2022,apple iphone 8 plus 64gb space grey
4,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,Mobile Phones,NZ-3586-WP,4426.0,1.6,4/12/2023,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...


## 3. Feature engineering

- TF-IDF za tekstualni feature
- Dodajemo numeričke feature-e koji mogu poboljšati model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=50000,
    stop_words="english"
)
X_text = tfidf.fit_transform(df["title_clean"])
y = df["category_label"]


df["title_len"] = df["title_clean"].str.len()
df["word_count"] = df["title_clean"].str.split().str.len()
df["has_number"] = df["title_clean"].str.contains(r"\d").astype(int)

X_num = df[["title_len", "word_count", "has_number"]].values
X_num_sparse = csr_matrix(X_num)

X = hstack([X_text, X_num_sparse])
print("Shape of X:", X.shape)



## 4. Podela na trening i test set

- Prikaz broja primera u svakom setu

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,                  # svi feature-i (TF-IDF + numerički)
    y,                  # ciljna promenljiva (kategorija)
    test_size=0.2,      # 20% podataka ide u test set
    random_state=42,    # za reproduktivnost
    stratify=y          # čuvamo distribuciju kategorija
)

print("Broj trening primera:", X_train.shape[0])
print("Broj test primera:", X_test.shape[0])


Broj trening primera: 28076
Broj test primera: 7020


## 5. Treniranje modela

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

tfidf = TfidfVectorizer(
    ngram_range=(1,1),
    max_features=20000,
    stop_words="english"
)
X_text = tfidf.fit_transform(df["title_clean"])  # fit + transform

# Numerički feature-i
df["title_len"] = df["title_clean"].str.len()
df["word_count"] = df["title_clean"].str.split().str.len()
df["has_number"] = df["title_clean"].str.contains(r"\d").astype(int)

X_num = df[["title_len", "word_count", "has_number"]].values
X_num_sparse = csr_matrix(X_num)

# Kombinacija
X = hstack([X_text, X_num_sparse])

# Prikaz
print("Shape of X:", X.shape)
print("Prvih 5 redova X_num:", X_num[:5])




Shape of X: (35096, 19260)
Prvih 5 redova X_num: [[31  6  1]
 [35  7  1]
 [70 13  1]
 [35  7  1]
 [54 11  1]]


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

lr = LogisticRegression(max_iter=2000, solver='saga')
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9132478632478632
                  precision    recall  f1-score   support

             CPU       0.00      0.00      0.00        17
            CPUs       0.97      0.98      0.98       749
 Digital Cameras       0.98      0.95      0.96       538
     Dishwashers       0.84      0.92      0.88       681
        Freezers       0.99      0.81      0.89       440
 Fridge Freezers       0.93      0.89      0.91      1094
         Fridges       0.71      0.89      0.79       687
      Microwaves       0.99      0.90      0.94       466
    Mobile Phone       0.00      0.00      0.00        11
   Mobile Phones       0.90      0.97      0.94       801
             TVs       0.97      0.95      0.96       708
Washing Machines       0.99      0.90      0.94       803
          fridge       0.00      0.00      0.00        25

        accuracy                           0.91      7020
       macro avg       0.71      0.71      0.71      7020
    weighted avg       0.91      0.91    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 6. Evaluacija modela

In [8]:
y_pred = lr.predict(X_test)
print("Tačnost modela:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Tačnost modela: 0.9371794871794872
                  precision    recall  f1-score   support

             CPU       0.00      0.00      0.00        17
            CPUs       0.98      0.99      0.98       749
 Digital Cameras       0.99      0.97      0.98       538
     Dishwashers       0.91      0.91      0.91       681
        Freezers       0.99      0.88      0.93       440
 Fridge Freezers       0.85      0.96      0.90      1094
         Fridges       0.88      0.86      0.87       687
      Microwaves       0.99      0.95      0.97       466
    Mobile Phone       0.00      0.00      0.00        11
   Mobile Phones       0.95      0.99      0.97       801
             TVs       0.97      0.98      0.98       708
Washing Machines       0.98      0.92      0.95       803
          fridge       0.00      0.00      0.00        25

        accuracy                           0.94      7020
       macro avg       0.73      0.72      0.73      7020
    weighted avg       0.93      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 7. Čuvanje modela za kasniju upotrebu

In [9]:
pipeline = Pipeline([
    ("tfidf", tfidf),
    ("clf", lr)
])
pipeline.fit(df["title_clean"], df["category_label"])
joblib.dump(pipeline, "product_category_model.pkl")
print("Model sačuvan kao product_category_model.pkl")

Model sačuvan kao product_category_model.pkl


In [34]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import joblib


pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        max_features=50000,
        stop_words="english"
    )),
    ("clf", LogisticRegression(max_iter=2000, solver='saga'))
])


pipeline.fit(df["title_clean"], df["category_label"])





## 8. Interaktivna predikcija novih proizvoda

- Ovo može biti sadržaj predict_category.py skripta

In [22]:
import joblib


joblib.dump(lr, "product_category_model.pkl")


joblib.dump(tfidf, "tfidf_vectorizer.pkl")

print("Model i TF-IDF vektorizator su sačuvani.")


Model i TF-IDF vektorizator su sačuvani.


In [23]:
!ls


 product_category_model.pkl   products.csv   tfidf_vectorizer.pkl
'products (1).csv'	      sample_data


In [33]:
joblib.dump(pipeline, "product_category_model.pkl")
print("Model sačuvan kao product_category_model.pkl")

# 5. Test predikcija
test_titles = [
    "iphone 7 32gb gold",
    "kenwood k20mss15 solo",
    "smeg sbs8004po"
]

for t in test_titles:
    pred = pipeline.predict([t])
    print(f"{t} → {pred[0]}")

Model sačuvan kao product_category_model.pkl
iphone 7 32gb gold → Mobile Phones
kenwood k20mss15 solo → Microwaves
smeg sbs8004po → Fridges
