<a href="https://colab.research.google.com/github/kibali-cell/ML-Projects/blob/main/ImprovedSSD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
import json
import csv
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a spam/ham text classifier: TF-IDF features + logistic regression.
raw_mail_data = pd.read_csv('combined_set.csv')
# Replace missing cells with empty strings.
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)), '')
# Encode the target column: spam -> 0, ham -> 1.
mail_data.loc[mail_data['label'] == 'spam', 'label'] = 0
mail_data.loc[mail_data['label'] == 'ham', 'label'] = 1
X = mail_data['Message']
Y = mail_data['label'].astype(int)
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)
# newline='' is what the csv module recommends for files passed to csv.reader.
with open('swahili_stopwords.csv', 'r', encoding='utf-8', newline='') as f:
    # The first column holds the stop word. Blank rows are skipped (row[0]
    # would raise IndexError on them). Words are stripped and lowercased
    # because the vectorizer lowercases the text before filtering stop words,
    # so an entry with capitals or stray whitespace would never match.
    swahili_stop_words = [
        row[0].strip().lower()
        for row in csv.reader(f)
        if row and row[0].strip()
    ]
feature_extraction = TfidfVectorizer(min_df=1, stop_words=swahili_stop_words, lowercase=True)
# Learn the vocabulary and IDF weights from the training split only.
x_train_features = feature_extraction.fit_transform(X_train)
model = LogisticRegression()
model.fit(x_train_features, Y_train)

# Extract the fitted parameters as plain Python types so they can be
# serialized to JSON.
# Indices are cast to built-in int: depending on the scikit-learn version the
# vocabulary values can be NumPy integers, which json.dump rejects with
# "Object of type int64 is not JSON serializable".
vocabulary = {
    term: int(index) for term, index in feature_extraction.vocabulary_.items()
}  # e.g., {"karibu": 0, "pesa": 1, ...}
idf = feature_extraction.idf_.tolist()
# First row of the coefficient matrix, converted to a list of floats.
coefficients = model.coef_[0].tolist()
intercept = float(model.intercept_[0])

# Write each exported parameter to its own JSON file, in a fixed order.
for _json_path, _payload in (
    ('vocabulary.json', vocabulary),
    ('idf.json', idf),
    ('coefficients.json', coefficients),
    ('intercept.json', {'intercept': intercept}),
):
    with open(_json_path, 'w') as f:
        json.dump(_payload, f)

