<a href="https://colab.research.google.com/github/jkostic986-ui/product_classification/blob/main/03_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 3. Treniranje i evaluacija modela

In [5]:

import os

from google.colab import files

# Upload the dataset only when it is not already in the working directory.
# An unconditional upload blocks "Restart & Run All" on a manual file picker,
# and re-uploading a file that already exists makes Colab store it under a
# different name (e.g. "products (1).csv"), so later reads of "products.csv"
# would silently keep using the old copy.
if os.path.exists("products.csv"):
    uploaded = {}  # nothing uploaded this run; keep the name defined for later cells
else:
    uploaded = files.upload()

Saving products.csv to products.csv


In [6]:
# 1. Imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack, csr_matrix

# 2. Load the CSV file (it must already be present in the working directory)
df = pd.read_csv("products.csv")

# 3. Normalize column names: trim whitespace, lowercase, spaces -> underscores
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# 4. Drop rows that have no product title or no category label
df = df.dropna(subset=["product_title", "category_label"])

# 5. Lowercased copy of the title, used by every feature below
df["title_clean"] = df["product_title"].str.lower()
y = df["category_label"]

# 6. Numeric features derived from the title alone (row-wise, so they cannot
#    leak information between train and test rows)
df["title_len"] = df["title_clean"].str.len()
df["word_count"] = df["title_clean"].str.split().str.len()
df["has_number"] = df["title_clean"].str.contains(r"\d").astype(int)

X_num = csr_matrix(df[["title_len", "word_count", "has_number"]].values)
# NOTE(review): these counts are on a much larger scale than TF-IDF weights;
# consider scaling them (fitted on the training rows) if a linear model is used.

# 7. Decide the train/test split FIRST, on row positions, so that the text
#    vectorizer can be fitted on training rows only. The split depends on the
#    number of rows, the labels and the seed, so splitting positions selects
#    the same rows as splitting the feature matrix would.
row_positions = np.arange(len(df))
train_rows, test_rows, y_train, y_test = train_test_split(
    row_positions, y, test_size=0.2, random_state=42, stratify=y
)

# 8. TF-IDF for the text feature. Vocabulary and IDF weights are learned from
#    the training titles only; fitting on all rows would leak test-set
#    statistics into the features and inflate evaluation scores.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams + bigrams
    max_features=20000,   # capped for speed
    stop_words="english"
)
tfidf.fit(df["title_clean"].iloc[train_rows])
X_text = tfidf.transform(df["title_clean"])

# 9. Combine TF-IDF and numeric features; CSR format supports row indexing
X = hstack([X_text, X_num], format="csr")

# 10. Materialize the train and test matrices from the precomputed split
X_train = X[train_rows]
X_test = X[test_rows]

print("Shape of X:", X.shape)
print("Training set:", X_train.shape[0], "samples")
print("Test set:", X_test.shape[0], "samples")



Shape of X: (35096, 20003)
Training set: 28076 samples
Test set: 7020 samples
