In [None]:
import numpy as np
import pandas as pd

In [None]:
# https://www.statsmodels.org/stable/index.html
import statsmodels.api as sm

In [None]:
# Read the spam data set from the Excel workbook hosted on Dropbox
spamDataUrl = "https://www.dropbox.com/scl/fi/v24mmhg5hmefmnv99uqsy/Spam.xlsx?rlkey=iq7exnueq84sy7y2b8ud70mp0&dl=1"
spamDf = pd.read_excel(spamDataUrl)
spamDf

In [None]:
# Total number of cells (rows x columns) next to the (rows, columns) shape
spamDf.size, spamDf.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing. Stratifying on 'Spam' keeps the class
# proportions alike in both partitions; the fixed seed makes the split repeatable.
splitOptions = {
  'test_size': 0.3,
  'random_state': 555,
  'stratify': spamDf['Spam'],
}
trainSet, testSet = train_test_split(spamDf, **splitOptions)
trainSet.shape, testSet.shape

In [None]:
# Inputs for the logistic regression: predictor columns and the integer-cast response
features = ['Recipients', 'Hyperlinks', 'Characters']
xTrain = trainSet.loc[:, features]
yTrain = trainSet.loc[:, 'Spam'].astype(int)

In [None]:
def transformer(df):
    """Return ``df`` with the intercept column prepended.

    ``sm.Logit`` does not add an intercept on its own, so the same design-matrix
    step has to be applied both when fitting and when predicting.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictor columns.

    Returns
    -------
    pandas.DataFrame
        New frame with a leading ``const`` column of ones followed by the
        columns of ``df``.
    """
    # has_constant='add' forces the column even when every predictor happens to
    # be constant (e.g. a single-row frame); the default 'skip' would silently
    # leave the intercept out in that case.
    return sm.add_constant(df, has_constant='add')


# Fit the logistic regression of Spam on Recipients, Hyperlinks and Characters
spamBasedOnRecipientsHyperlinksCharactersLogitModel = sm.Logit(
  yTrain,
  transformer(xTrain)
)
spamBasedOnRecipientsHyperlinksCharactersLogitModelFit = spamBasedOnRecipientsHyperlinksCharactersLogitModel.fit()
print(spamBasedOnRecipientsHyperlinksCharactersLogitModelFit.summary())

In [None]:
# Predicted spam probability for every hold-out row; the intercept column has to
# be added to the predictors exactly as it was for training.
predict1 = spamBasedOnRecipientsHyperlinksCharactersLogitModelFit.predict(sm.add_constant(testSet[features]))

# assign() returns a new frame instead of writing into the object produced by
# train_test_split, which avoids pandas' SettingWithCopyWarning and leaves
# spamDf untouched.
testSet = testSet.assign(predict1=predict1)

# Actual label ('A') and predicted probability ('Prob'), written to ROC.csv
sumTable = pd.DataFrame({'A': testSet['Spam'], 'Prob': testSet['predict1']})
sumTable.to_csv("ROC.csv", index=True)


In [None]:
# Actual label ('A') next to the predicted spam probability ('Prob') per test row
sumTable1 = pd.DataFrame({
  'A': testSet['Spam'],
  'Prob': testSet['predict1'],
})

In [None]:
# Label a row as spam (1) when its predicted probability is above the cut-off
probabilityCutoff = 0.5
isPredictedSpam = testSet['predict1'] > probabilityCutoff
testSet['predictions'] = isPredictedSpam.astype(int)
sumTable1['P'] = testSet['predictions']
sumTable1

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve

In [None]:
# Accuracy: share of test rows whose predicted class equals the actual class
accuracy = accuracy_score(y_true=sumTable1['A'], y_pred=sumTable1['P'])
print(f'Accuracy: {accuracy}')

In [None]:
# Recall: share of actual spam rows that were predicted as spam
recall = recall_score(y_true=sumTable1['A'], y_pred=sumTable1['P'])
print(f'Recall: {recall}')

In [None]:
# Precision: share of rows predicted as spam that really are spam
precision = precision_score(y_true=sumTable1['A'], y_pred=sumTable1['P'])
print(f'Precision: {precision}')

In [None]:
# Sensitivity is the true-positive rate, i.e. the recall computed earlier.
# Specificity is the true-negative rate: rows correctly predicted as non-spam
# divided by all rows that actually are non-spam.
sensitivity = recall
isActualNegative = sumTable1['A'] == 0
isPredictedNegative = sumTable1['P'] == 0
specificity = (isActualNegative & isPredictedNegative).sum() / isActualNegative.sum()
print(f'Sensitivity: {sensitivity}')
print(f'Specificity: {specificity}')

In [None]:
# F1 score: harmonic mean of precision and recall
precisionPlusRecall = precision + recall
# The sum is zero only when there are no true positives at all; report 0.0 in
# that case instead of dividing by zero.
f1Score = 2 * (precision * recall) / precisionPlusRecall if precisionPlusRecall else 0.0
print(f'F1 Score: {f1Score}')

In [None]:
# ROC curve coordinates (false / true positive rates) from the predicted probabilities
yActual, yScore = testSet['Spam'], testSet['predict1']
fpr, tpr, _ = roc_curve(yActual, yScore)
# Area under that curve
roc_auc = roc_auc_score(yActual, yScore)
print(f'AUC: {roc_auc}')


In [None]:
import matplotlib.pyplot as plt

In [None]:

# ROC curve of the model, with the dashed diagonal as a visual reference line
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()


In [None]:
# K-fold cross-validation setup
from sklearn.model_selection import KFold

# Number of folds in use; the alternatives noted alongside were k=2 and k=5
k = 10
# Shuffle rows before folding, with a fixed seed so the folds are repeatable
kf = KFold(n_splits=k, random_state=555, shuffle=True)
spamDf.head()

In [None]:
# K-fold cross-validation of the same logistic specification.
# Fold-local names (foldTrain / foldVal / foldAccuracy) are used so the loop
# does not overwrite the hold-out split (`trainSet`) or the test-set `accuracy`
# computed earlier in the notebook.
cvFeatures = ['Recipients', 'Hyperlinks', 'Characters']

# Validation accuracy of each fold, in percent
accuracies = []

# Loop through each fold; `experiment` is the 1-based fold number
for experiment, (train_index, val_index) in enumerate(kf.split(spamDf), start=1):
    # Split the data
    foldTrain, foldVal = spamDf.iloc[train_index], spamDf.iloc[val_index]

    # Fit the model on the training part of the fold (response cast to int,
    # matching how the hold-out model is fitted)
    trainModel = sm.Logit(
      foldTrain['Spam'].astype(int),
      sm.add_constant(foldTrain[cvFeatures])
    )
    trainModelFit = trainModel.fit()

    # Predict on the validation part. Results are kept in standalone Series so
    # nothing is written back into a slice of spamDf.
    val_predictions = trainModelFit.predict(sm.add_constant(foldVal[cvFeatures]))
    yHatCross = (val_predictions > 0.5).astype(int)
    isCrossCorrect = yHatCross == foldVal['Spam'].astype(int)
    foldAccuracy = isCrossCorrect.sum() / len(isCrossCorrect) * 100
    accuracies.append(foldAccuracy)

    # Print summary for each fold (optional)
    print(f'expr={experiment}')
    print(trainModelFit.summary())

In [None]:
# Print the mean first, then leave the per-fold list as the cell's displayed value.
# Wrapping both in one tuple would display `(..., None)` because print() returns None.
print(f"Average accuracies across all folds: {sum(accuracies) /len(accuracies)}")
accuracies

## Export model

In [None]:
from mlModelSaver import MlModelSaver

# Saver configured with its base path and models folder
mlModelSaverInstance = MlModelSaver({
    "baseRelativePath": "../..",
    "modelsFolder": "models"
})

# The exported model is named (and described) after the fitted-model variable
fittedModelName = "spamBasedOnRecipientsHyperlinksCharactersLogitModelFit"

# One integer-typed input per predictor, in the order used for fitting
inputSpecs = [
    {"name": columnName, "type": "int"}
    for columnName in ("Recipients", "Hyperlinks", "Characters")
]

# NOTE(review): "probebility" looks like a typo for "probability"; it is kept
# verbatim because this string is passed to mlModelSaver — confirm nothing
# depends on the exact value before correcting it.
outputSpecs = [{"name": "Spam", "type": "probebility"}]

exportSpec = {
    "modelName": fittedModelName,
    "description": fittedModelName,
    "modelType": "sm.Logit",
    "inputs": inputSpecs,
    "transformer": transformer,
    "outputs": outputSpecs,
}

loadedModel = mlModelSaverInstance.exportModel(
    spamBasedOnRecipientsHyperlinksCharactersLogitModelFit,
    exportSpec,
)
loadedModel

In [None]:
# Score the same test rows twice — once with the design matrix built by
# transformer(), once with sm.add_constant() — and store both for comparison
fittedLogit = spamBasedOnRecipientsHyperlinksCharactersLogitModelFit
testFeatures = testSet[features]
testSet['predict2'] = fittedLogit.predict(transformer(testFeatures))
testSet['predict3'] = fittedLogit.predict(sm.add_constant(testFeatures))

In [None]:
# First five test rows, including the prediction columns added above
testSet.head(5)

In [None]:
# Three hand-written example rows, one value per predictor column
testColumns = ("Recipients", "Hyperlinks", "Characters")
testRows = [
    (12, 8, 33),
    (12, 10, 92),
    (13, 10, 71),
]
testData = [dict(zip(testColumns, rowValues)) for rowValues in testRows]

# Create a DataFrame from the list of records
testDf = pd.DataFrame(testData)

In [None]:
# Predicted spam probability for each hand-written example row, using the in-memory fitted model
spamBasedOnRecipientsHyperlinksCharactersLogitModelFit.predict( transformer(testDf))

In [None]:
# Same example rows passed to the object returned by exportModel.
# NOTE(review): presumably this should reproduce the probabilities from the
# in-memory model — confirm against the mlModelSaver documentation.
loadedModel.mlModelSavePredict(testDf)