In [20]:
# 📘 02_model_dev.ipynb – Model Development

import pickle
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold

# 📥 Load data
# NOTE: both paths are relative to the current working directory. If
# read_csv raises FileNotFoundError, the notebook is presumably being run
# from a different folder than the one these paths were written for --
# adjust DATA_PATH / MODEL_DIR accordingly.
DATA_PATH = Path('../data/raw/train.csv')
MODEL_DIR = Path('../models')
df = pd.read_csv(DATA_PATH)

# 🔧 Preprocess
TEXT_COLUMNS = ['title', 'description', 'requirements', 'benefits']
# Replace missing fields with '' BEFORE casting to str; otherwise NaN becomes
# the literal token "nan" and pollutes the TF-IDF vocabulary.
df['text'] = df[TEXT_COLUMNS].fillna('').astype(str).agg(" ".join, axis=1)
# 'text' can no longer be NaN, so this only drops rows with a missing label.
df = df[['text', 'fraudulent']].dropna()

X = df['text']
y = df['fraudulent']

# 🔀 Train-Test Split
# Split the RAW text first so that nothing fitted below (vocabulary, IDF
# weights, synthetic samples) can see the validation rows.
X_train_text, X_val_text, y_train_raw, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 📐 TF-IDF Vectorization
# Fit on the training split only; the validation split is merely transformed.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = tfidf.fit_transform(X_train_text)
X_val = tfidf.transform(X_val_text)

# 📊 Handle class imbalance
# Oversample the training split only. The validation split keeps its original
# class distribution, so the metrics below reflect performance on real data.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_vec, y_train_raw)

# 🤖 Train model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 🔍 Evaluate
y_pred = clf.predict(X_val)
print("F1 Score:", f1_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

# 💾 Save model and vectorizer
# Create the output directory up front so a missing folder does not throw
# away a finished training run.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

with open(MODEL_DIR / 'best_model.pkl', 'wb') as f:
    pickle.dump(clf, f)

with open(MODEL_DIR / 'vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)


FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/train.csv'