# Wczytanie i podstawowa eksploracja danych

In [None]:
import os
import sys

# Put the parent directory on the import path so that the `src` package can be
# imported from this notebook's working directory.
sys.path.append(os.path.abspath(os.pardir))

from src.data_loader import basic_info, load_data

# Load the dataset from the project's data directory.
df = load_data("../data/SMSSpamCollection")

# Basic information about the loaded data (delegated to the project helper).
basic_info(df)

# Przygotowanie danych do trenowania modelu

In [None]:
import os
import sys

# Put the parent directory on the import path so that `src` is importable.
sys.path.append(os.path.abspath(os.pardir))

from src.data_loader import load_data
from src.preprocessing import preprocess_messages
from src.feature_extraction import vectorize_messages

# Load the dataset.
df = load_data("../data/SMSSpamCollection")

# Preprocessing; the returned object replaces the raw data.
df = preprocess_messages(df)

# Vectorization: yields the feature matrix together with the vectorizer object.
# NOTE(review): this runs on every message, before the train/test split that
# is made further down. If vectorize_messages fits the vectorizer here, test
# rows contribute to the fitted vocabulary/statistics — confirm this is intended.
X, vectorizer = vectorize_messages(df["message"])

# Encode the labels as integers: "spam" -> 1, anything else (ham) -> 0.
y = df["label"].apply(lambda label: int(label == "spam"))

# Show the shapes of the features and of the target.
X.shape, y.shape

In [None]:
import nltk

# Fetch the NLTK "punkt" resource.
# NOTE(review): preprocess_messages is called in an earlier cell; if it relies
# on this resource, the download has to be executed first — confirm.
nltk.download("punkt")

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the samples as a test set (fixed seed for reproducibility).
# The split is stratified on the labels so that the training and test parts
# keep the same spam/ham ratio; without it the share of the positive class in
# the test set depends on the random draw, which makes the F1-based
# evaluation below less reliable when the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Model definition.
nb = MultinomialNB()

# Candidate values of the smoothing parameter for the grid search.
param_grid = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}

# Grid search for the best alpha: 5-fold cross-validation on the training
# split, candidates ranked by F1, n_jobs=-1 to use all available cores.
grid = GridSearchCV(nb, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_train, y_train)

# Best parameters found.
print("Najlepsze alpha:", grid.best_params_)

# Predictions on the held-out test set (made with the best estimator found).
y_pred = grid.predict(X_test)

# Evaluation.
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# target directory exists first (a no-op when it is already there).
os.makedirs('../models', exist_ok=True)

# Save the model (the best estimator found by the grid search).
joblib.dump(grid.best_estimator_, '../models/spam_classifier_model.joblib')

# Save the vectorizer alongside the model.
joblib.dump(vectorizer, '../models/vectorizer.joblib')

In [None]:
import os
import sys

# Put the parent directory on the import path so that `src` is importable.
sys.path.append(os.path.abspath(os.pardir))

from src.predict import load_model_and_vectorizer, predict_message

# Load the model and the vectorizer through the project's helper.
model, vectorizer = load_model_and_vectorizer()

# Example predictions: each message is classified and the result printed,
# one message at a time and in the order listed.
for sample_text in (
    "Congratulations! You have won a free iPhone!",
    "Hi, are we meeting at 5 pm today?",
    "Win 1000$ cash now!",
):
    print(predict_message(sample_text, model, vectorizer))