In [None]:
import pandas as pd
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1. Load Data
# The CSV path is relative, so it is resolved against the current working
# directory. The error message below points at generate_dataset.py as the
# script that produces this file.
try:
    df = pd.read_csv("ticket_data.csv")
except FileNotFoundError:
    print("Error: Run generate_dataset.py first!")
    # Abort with a non-zero status so shells and CI can tell the run failed.
    # The bare exit() helper is injected by the `site` module for interactive
    # use, may be missing (e.g. `python -S`), and exits with status 0 when
    # called without an argument.
    raise SystemExit(1) from None

# 2. Preprocessing Function
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def clean_text(text):
    """Normalize one ticket message before vectorization.

    The text is lowercased and every character that is not an ASCII letter
    or whitespace is removed -- digits and symbols are dropped as well as
    punctuation. Whitespace is kept as-is (no collapsing or stripping).

    Args:
        text: Raw message. Non-string values, such as the float NaN that
            pandas produces for an empty CSV cell, are treated as empty text.

    Returns:
        The cleaned string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Calling .lower() on NaN/None would raise AttributeError and abort
        # the whole column transform because of a single missing value.
        return ""
    return _NON_LETTER_PATTERN.sub('', text.lower())

# Run clean_text over every value of the raw 'text' column and store the
# result in a new 'clean_text' column; the original column is left untouched.
df['clean_text'] = df['text'].map(clean_text)

# 3. Split Data
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
splits = train_test_split(df['clean_text'], df['category'], test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# 4. Build Pipeline
# Step 'tfidf': turn each message into a TF-IDF (Term Frequency-Inverse
#   Document Frequency) weighted vector, ignoring English stop words.
# Step 'clf': logistic regression classifier, seeded for reproducibility.
tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
clf_step = ('clf', LogisticRegression(random_state=42))
model_pipeline = Pipeline(steps=[tfidf_step, clf_step])

# 5. Train Model
# Fit every pipeline step on the training portion only.
print("Training model...")
model_pipeline.fit(X_train, y_train)

# 6. Evaluate
# Predict the held-out rows and print the per-class classification report.
print("\nModel Performance:")
predictions = model_pipeline.predict(X_test)
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

# 7. Save the pipeline
# The whole fitted pipeline object is pickled into a single file.
# NOTE(review): pickle files should only be loaded from trusted sources, and
# presumably with the same scikit-learn version that wrote them -- confirm
# in whatever code loads this file.
MODEL_PATH = "ticket_classifier.pkl"
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(model_pipeline, model_file)

# "\u2705" is the check-mark emoji. It is written as an escape so the source
# stays ASCII and the character cannot be garbled again (it had turned into
# the cp1252 mojibake "a-circumflex, oe, ellipsis").
print(f"\u2705 Model saved as {MODEL_PATH}")