# Objectives
YWBAT (You Will Be Able To):
* explain the different parts of the sigmoid curve
* explain how thresholds affect model outcomes/scores
* explain how confusion matrices are helpful


# Outline
* load in some data
* do some feature engineering/eda
* build a logreg model, score it, confusion matrix it
* make this into a workflow
* discuss stuff
* tune logistic regression hyperparameters

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import confusion_matrix

import plotly
import plotly.graph_objs as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the SMS spam dataset.
df = pd.read_csv("../data/SPAM text message 20170820 - Data.csv")
# Lowercase every header so later cells can refer to the columns
# as df['category'] and df['message'].
df.columns = [column_name.lower() for column_name in df.columns]
df.head()

# Let's make some new features on this dataset

## make a column that contains an integer of the word count of the message column
Store it to a column called `word_count`

In [None]:
# code here
def count_words(message):
    """
    Count the space-separated pieces of a text message.

    input
    message: str, some text message

    return
    word_count, int, number of pieces obtained by splitting on a single
    space (runs of spaces produce empty pieces, which are counted too)
    """
    pieces = message.split(" ")
    return len(pieces)

In [None]:
# Apply count_words to every message to build the raw word-count feature.
df['word_count'] = df['message'].map(count_words)
df.head()

## remove the following words from each message (upper and lower case versions)
Store the cleaned message to a column called `cleaned message`

In [None]:
stop_words = ['the', 'a', 'as', 'be', 'that', 'this']


def clean_message(message, stop_words=stop_words):
    """
    Count the words of a message that are not stop words.

    Despite its name, this function does not return the cleaned text: it
    returns how many space-separated pieces remain once the stop words are
    dropped. The comparison is case-insensitive on both sides.

    input
    message: str, some text message
    stop_words: iterable of str, words to ignore (any casing)

    return
    remaining_count: int, number of pieces of the message (split on a single
    space) whose lowercase form is not a stop word
    """
    # Lowercase the stop words once so an entry such as "The" still matches,
    # and use a set for constant-time membership tests.
    ignored = {stop_word.lower() for stop_word in stop_words}
    remaining_count = sum(
        1 for word in message.split(" ") if word.lower() not in ignored
    )
    return remaining_count

## take a wordcount of the cleaned message
Store this to a column called `cleaned_wordcount`

In [None]:
# code here
# clean_message returns the number of non-stop-word pieces in a message,
# so mapping it over the raw text yields the cleaned word count directly.
df['cleaned_wordcount'] = df['message'].map(clean_message)
df.head()

In [None]:
# A message contains a stop word exactly when removing stop words
# changed its word count.
df['contains_stopword'] = df['word_count'].ne(df['cleaned_wordcount'])
df.head()

## plot a bar chart of the word count by spam/ham
Use whichever plotting tool you want

In [None]:
# code here
# Mean word count per category. The built-in groupby .mean() replaces
# .agg(np.mean), which pandas has deprecated in favour of the named reduction.
# reset_index() turns the group key back into a regular column for plotly.
df_grouped_by_word_count = (
    df.groupby('category')[['word_count']]
    .mean()
    .reset_index()
)

fig = px.bar(
    data_frame=df_grouped_by_word_count,
    x='category',
    y='word_count',
)
fig.show()

## plot a bar chart of the cleaned_wordcount by spam/ham
Use whichever plotting tool you want

In [None]:
# code here
# code here
# Mean cleaned word count per category. The built-in groupby .mean() replaces
# .agg(np.mean), which pandas has deprecated in favour of the named reduction.
# reset_index() turns the group key back into a regular column for plotly.
df_grouped_by_cleaned_wordcount = (
    df.groupby('category')[['cleaned_wordcount']]
    .mean()
    .reset_index()
)

fig = px.bar(
    data_frame=df_grouped_by_cleaned_wordcount,
    x='category',
    y='cleaned_wordcount',
)
fig.show()

In [None]:
# Standardise word_count into a z-score: subtract the column mean, then
# divide by the column standard deviation.
df['word_count_z'] = (
    df['word_count']
    .sub(df['word_count'].mean())
    .div(df['word_count'].std())
)
df.head()

## Create a new column called `target` 
where target = 1 if category = spam else target = 0


In [None]:
# code here
# Binary label: 1 when the category (lowercased, surrounding spaces removed)
# is 'spam', otherwise 0.
df['target'] = [
    1 if label.lower().strip(" ") == 'spam' else 0
    for label in df['category']
]
df.head()

# Logistic Regression Time

## Plot a scatter plot of word count vs target
Use whichever plotting tool you want

In [None]:
# code here
# Raw word count on the x-axis against the 0/1 spam label on the y-axis.
fig = px.scatter(df, x='word_count', y='target')
fig.show()

In [None]:
# Distribution of word counts for each category, drawn side by side.
fig = px.violin(df, x='category', y='word_count')
fig.show()

### what do you notice in the plot?

# Logistic regression would be horrible. There's 100% overlap of our SPAM values on our HAM values.

## Build a Logistic Regression in sklearn using a train/test split of 0.8/0.2
X should contain only the word count data (the standardized `word_count_z` column)

y should be the target column

In [None]:
# Single standardized feature and the binary spam label.
X = df['word_count_z']
y = df['target']

# 80/20 split. random_state pins the shuffle so the scores and confusion
# matrix below are reproducible from run to run. stratify=y keeps the
# spam/ham proportion the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# scikit-learn estimators expect a 2-D (n_samples, n_features) array, so the
# 1-D feature is reshaped into a single column.
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

In [None]:
# code here
# Fit a logistic regression on the single standardized word-count feature.
# NOTE(review): fit_intercept=False removes the bias term, so the predicted
# probability is exactly 0.5 at word_count_z == 0 and the model cannot shift
# its decision boundary away from the mean word count. Confirm this is
# intentional for the lesson rather than a leftover experiment.
logreg = LogisticRegression(fit_intercept=False)
logreg.fit(X_train, y_train)

# score your model
score it on both the training and test set

In [None]:
# code here
# Displays (train accuracy, test accuracy); for a classifier, score() is the
# mean accuracy on the given data and labels.
logreg.score(X_train, y_train), logreg.score(X_test, y_test)

## Look at your model's beta coefficients

In [None]:
# code here
# Displays the fitted slope for word_count_z and the intercept.
# NOTE(review): the model was built with fit_intercept=False, so the
# intercept shown here is presumably 0 rather than a learned value.
logreg.coef_, logreg.intercept_

## plot a confusion matrix and evaluate it
Make sure you label true/predicted axes

In your evaluation mention the False Positive Rate and False Negative Rate

In [None]:
# code here
# Hard class predictions for the held-out set, tabulated against the truth
# (confusion_matrix puts true labels on rows, predictions on columns).
y_test_pred = logreg.predict(X_test)
cm = confusion_matrix(y_test, y_test_pred)

# Annotated heatmap of the counts with both axes labelled.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='0.4g',
    cmap=sns.color_palette('Blues'),
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

Evaluation Here:

This model sucks, it predicts everything as HAM

# Which texts were mislabeled?

In [None]:
# predict_proba returns one column per class; column 1 is taken here, which
# for the 0/1 target is the predicted probability of class 1 (spam).
y_test_probs = logreg.predict_proba(X_test)[:, 1]
y_test_probs

In [None]:
# A test message is mislabeled when the class implied by the 0.5 probability
# threshold disagrees with its true label. Thresholding alone would only list
# the messages predicted as spam, whether or not that prediction was correct.
# np.where returns a 1-tuple of positional indices into the test arrays.
index_of_mislabeled = np.where(
    (y_test_probs > 0.5).astype(int) != np.asarray(y_test)
)
index_of_mislabeled

In [None]:
# Select the feature rows of X_test at the positions in index_of_mislabeled.
# Note: X_test was built from the word_count_z column, so despite the name
# these are standardized word counts, not raw counts.
word_count_of_mislabeled = X_test[index_of_mislabeled]
word_count_of_mislabeled

In [None]:
# Displays the whole feature series as a single-column 2-D array: .values
# gives the underlying NumPy array and reshape(-1, 1) makes it (n_rows, 1).
X.values.reshape(-1, 1)

In [None]:
# Predicted probability of spam for every row. Column 1 of predict_proba is
# P(target == 1), which is the quantity to overlay on the 0/1 target values.
# Column 0 would give P(ham), i.e. the mirrored curve.
df['sigmoid_values'] = logreg.predict_proba(X.values.reshape(-1, 1))[:, 1]


In [None]:
# Overlay the true 0/1 labels and the model's sigmoid output, both plotted
# against the standardized word count.
fig = go.Figure(
    data=[
        go.Scatter(
            x=df['word_count_z'],
            y=df['target'],
            name='Actual',
            mode='markers',
            marker_color='rgba(152, 0, 0, .8)',
        ),
        go.Scatter(
            x=df['word_count_z'],
            y=df['sigmoid_values'],
            name='Predicted',
            mode='markers',
            marker_color='rgba(255, 182, 193, .9)',
        ),
    ]
)

fig.show()

# What did we learn?
* Plotting 2 scatter plots on the same figure in plotly
* `.predict_proba()` method in sklearn
* Why `y` is between 0 and 1 in a sigmoid function
* Preview into working with text data 
* `.values` on a pandas series
* importance of confusion matrix in determining model quality
* don't chase good score, check model with confusion matrix, roc/auc curve
* reshape
* Use seaborn heatmap for confusion matrix plotting
* elementary version of creating numerical features from text data
* you have to turn text data into numerical data for modeling