In [1]:
# packages
import pandas as pd
from mod02_build_bot_predictor import train_model

### Define a function to extract predictions from the model

In [2]:
def predict_bot(df, model=None):
    """
    Label every row of ``df`` as bot (1) or human (0).

    Parameters
    ----------
    df : feature table handed unchanged to ``model.predict``; its ``index``
        is reused as the index of the result.
    model : optional object exposing ``predict(df)``. When omitted, a model
        is obtained by calling ``train_model()`` with no arguments.

    Returns
    -------
    pandas.Series holding the predictions, aligned to ``df.index``.
    """
    # NOTE(review): elsewhere in this notebook train_model is called with
    # (X_train, y_train); confirm the helper also supports a zero-argument call.
    classifier = train_model() if model is None else model
    return pd.Series(classifier.predict(df), index=df.index)

### Define a function to evaluate model error

In [3]:
def confusion_matrix_and_metrics(y_true, y_pred):
    """
    Compute the confusion matrix and common error rates for binary classification.

    Labels are compared with ``==`` against:
      0 = negative class
      1 = positive class

    Parameters:
      y_true: iterable of ground-truth labels.
      y_pred: iterable of predicted labels. Values are paired with ``y_true``
              by position (iteration order), not by any index the inputs carry.

    Returns:
      dict with:
        tp, tn, fp, fn            -- integer counts
        misclassification_rate    -- (fp + fn) / total
        false_positive_rate       -- fp / (fp + tn)
        false_negative_rate       -- fn / (fn + tp)
      Any rate whose denominator is zero is reported as 0.0.

    Raises:
      ValueError: if the two inputs differ in length, or if any label pair
                  is not made of 0/1 values.
    """
    from itertools import zip_longest

    # Unique sentinel: lets us detect that one input ran out before the other
    # instead of silently scoring only the overlapping prefix (what zip does).
    exhausted = object()

    tn = fp = fn = tp = 0

    for yt, yp in zip_longest(y_true, y_pred, fillvalue=exhausted):
        if yt is exhausted or yp is exhausted:
            raise ValueError("y_true and y_pred must have the same length")

        if yt == 0 and yp == 0:
            tn += 1
        elif yt == 0 and yp == 1:
            fp += 1
        elif yt == 1 and yp == 0:
            fn += 1
        elif yt == 1 and yp == 1:
            tp += 1
        else:
            raise ValueError("Labels must be 0 or 1")

    total = tn + fp + fn + tp

    # Guard each denominator so empty input (or a class with no examples)
    # yields 0.0 rather than a ZeroDivisionError.
    misclassification_rate = (fp + fn) / total if total > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "misclassification_rate": misclassification_rate,
        "false_positive_rate": false_positive_rate,
        "false_negative_rate": false_negative_rate,
    }


### Load the data

In [4]:
# Locations of the labelled splits, relative to the notebook's working directory.
TRAIN_PATH = "mod02_data/train.csv"
TEST_PATH = "mod02_data/test.csv"

# Read each split into its own DataFrame.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

### Format the data by independent vs. dependent variables

In [5]:
# "is_bot" is the dependent variable (label); every other column is a predictor.

# Training split
y_train = train["is_bot"]
X_train = train.drop(columns=["is_bot"])

# Test split
y_test = test["is_bot"]
X_test = test.drop(columns=["is_bot"])

### Build the model on training data

In [6]:
# Build the model from the training split only; the test split is kept out of
# this call. train_model comes from mod02_build_bot_predictor -- presumably it
# returns a fitted object exposing .predict() (that is how predict_bot uses it).
model = train_model(X_train, y_train)

### Get the model predictions on training and test data

In [7]:
# Score both splits with the same model so training and test error can be compared.
y_pred_train = predict_bot(X_train, model)
y_pred_test = predict_bot(X_test, model)

### Check results on the training set (data used to build the model)

In [8]:
# Error on the data the model was built from; expect this to look optimistic
# compared with error on unseen data.
confusion_matrix_and_metrics(y_train, y_pred_train)

{'tp': 359,
 'tn': 2637,
 'fp': 0,
 'fn': 4,
 'misclassification_rate': 0.0013333333333333333,
 'false_positive_rate': 0.0,
 'false_negative_rate': 0.011019283746556474}

### Check results on the test set (new data not yet seen by the model)

In [9]:
# Error on the held-out test split, which was not passed to train_model.
confusion_matrix_and_metrics(y_test, y_pred_test)

{'tp': 22,
 'tn': 860,
 'fp': 14,
 'fn': 104,
 'misclassification_rate': 0.118,
 'false_positive_rate': 0.016018306636155607,
 'false_negative_rate': 0.8253968253968254}

# Discussion Questions

### Based on the misclassification rate of your model, discuss your confidence in the ability to predict a bot. 

Seeing as my model has an 11.8% misclassification rate on the test set, I am not confident in its ability to discern between users and bots. If this model were applied to a platform of 10,000 accounts, roughly 1,180 accounts would be either falsely flagged users or undetected bots, which could lead to a very poor user experience for actual users.

### What are potential ramifications of false positives from the model?

Considering the false positive rate is 1.6%, my model is very unlikely to falsely flag a real user as a bot. So, applying this model to a platform would leave most real users unbothered.

### What are potential ramifications of false negatives from the model?

Considering the false negative rate is 82.5%, my model is extremely likely to miss a bot. So, applying this model to a platform would allow bot accounts to flood the service. This could then lead to a generally worse user experience.