<a href="https://colab.research.google.com/github/jkostic986-ui/product_classification/blob/main/04_interactive_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 4. Testiranje novih proizvoda

In [1]:

from google.colab import files
uploaded = files.upload()

Saving products.csv to products.csv


In [11]:
import pandas as pd

# 1. Učitavanje CSV fajl
df = pd.read_csv("products.csv")

# 2. Očistiti kolone
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# 3. Ukloniti redove bez naziva proizvoda ili kategorije
df = df.dropna(subset=["product_title", "category_label"])

# 4. Napraviti kolonu sa "očišćenim" naslovima
df["title_clean"] = df["product_title"].str.lower()


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import joblib

# Pipeline: TF-IDF + Logistic Regression
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        max_features=20000,
        stop_words="english"
    )),
    ("clf", LogisticRegression(max_iter=2000, solver="saga"))
])

# Fit na celokupnim podacima
pipeline.fit(df["title_clean"], df["category_label"])

# Sačuvati model
joblib.dump(pipeline, "product_category_model.pkl")
print("Model sačuvan kao product_category_model.pkl")


Model sačuvan kao product_category_model.pkl


In [14]:
# 1. Biblioteke
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
from scipy.sparse import hstack, csr_matrix

# 2. Učitavanje podataka
df = pd.read_csv("products.csv")
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# 3. Čišćenje podataka
df = df.dropna(subset=["product_title", "category_label"])
df["title_clean"] = df["product_title"].str.lower()

# 4. Numerički feature-i
df["title_len"] = df["title_clean"].str.len()
df["word_count"] = df["title_clean"].str.split().str.len()
df["has_number"] = df["title_clean"].str.contains(r"\d").astype(int)

X_num = csr_matrix(df[["title_len", "word_count", "has_number"]].values)
y = df["category_label"]

# 5. TF-IDF
tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=20000,
    stop_words="english"
)
X_text = tfidf.fit_transform(df["title_clean"])

# 6. Kombinacija TF-IDF + numerički feature-i
X = hstack([X_text, X_num])

# 7. Podela na trening i test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 8. Treniranje modela
lr = LogisticRegression(max_iter=2000, solver="saga")
lr.fit(X_train, y_train)

# 9. Evaluacija
y_pred = lr.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 10. Čuvanje modela i TF-IDF
joblib.dump(lr, "product_category_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
print("Model i TF-IDF sačuvani kao .pkl fajlovi")

# 11. Interaktivna predikcija
def predict_category(title):
    title_clean = title.lower()
    X_text_new = tfidf.transform([title_clean])

    # Numerički feature-i
    title_len = len(title_clean)
    word_count = len(title_clean.split())
    has_number = int(any(char.isdigit() for char in title_clean))

    X_num_new = csr_matrix([[title_len, word_count, has_number]])

    # Kombinacija
    X_input = hstack([X_text_new, X_num_new])

    pred = lr.predict(X_input)[0]
    return pred

# Primer
while True:
    user_input = input("Unesi naziv proizvoda (ili 'exit' za kraj): ")
    if user_input.lower() == "exit":
        break
    category = predict_category(user_input)
    print(f"Predložena kategorija: {category}")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9253561253561253
                  precision    recall  f1-score   support

             CPU       0.00      0.00      0.00        17
            CPUs       0.97      0.99      0.98       749
 Digital Cameras       0.98      0.96      0.97       538
     Dishwashers       0.88      0.94      0.91       681
        Freezers       0.99      0.82      0.90       440
 Fridge Freezers       0.92      0.91      0.91      1094
         Fridges       0.76      0.89      0.82       687
      Microwaves       0.99      0.92      0.95       466
    Mobile Phone       0.00      0.00      0.00        11
   Mobile Phones       0.92      0.98      0.95       801
             TVs       0.97      0.96      0.96       708
Washing Machines       0.96      0.92      0.94       803
          fridge       0.00      0.00      0.00        25

        accuracy                           0.93      7020
       macro avg       0.72      0.71      0.72      7020
    weighted avg       0.92      0.93    