In [43]:
# !pip install gradio

In [44]:
# Import pandas
import pandas as pd
# Import the required dependencies from sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Widen the column display so long text messages are not truncated.
# The fully qualified option name is used: pandas resolves abbreviated names by
# pattern matching, which can break if a similarly named option is added later.
pd.set_option('display.max_colwidth', 200)

# Import Gradio
import gradio as gr

In [45]:
# Define a helper that oversamples the minority class so the class labels are balanced
def manual_oversample(df, target_column, random_state=None):
    """
    Balance a DataFrame by oversampling every class that is smaller than the largest one.

    Class sizes are read from the data itself, so the function works for any
    label values (for example 'ham'/'spam') and for more than two classes.
    Each under-represented class is replaced by a with-replacement sample whose
    size equals the size of the largest class; classes that already have that
    size are kept unchanged. Rows whose label is missing (NaN) are dropped.

    Note: the output contains duplicated rows. If it is later split into
    train and test sets, copies of the same row can land on both sides.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the data.
    target_column (str): The name of the target column containing class labels.
    random_state (int | np.random.Generator | None): Seed forwarded to
        ``DataFrame.sample`` for both the resampling and the final shuffle.
        Defaults to None (non-deterministic), which matches the previous behavior.

    Returns:
    pd.DataFrame: A new, shuffled DataFrame with balanced classes and a fresh
    0..n-1 index. An empty DataFrame is returned when there are no labelled rows.
    """
    class_counts = df[target_column].value_counts()

    # Nothing to balance: no rows, or every label is missing.
    if class_counts.empty:
        return df.iloc[:0].reset_index(drop=True)

    target_size = int(class_counts.max())

    resampled_parts = []
    for label, count in class_counts.items():
        # Categorical columns can report categories with zero rows; skip them.
        if count == 0:
            continue

        class_rows = df[df[target_column] == label]

        # Oversample (with replacement) only the classes below the target size.
        if count < target_size:
            class_rows = class_rows.sample(
                target_size, replace=True, random_state=random_state
            )

        resampled_parts.append(class_rows)

    # Combine all classes, then shuffle so they are interleaved.
    df_balanced = pd.concat(resampled_parts)
    df_balanced = df_balanced.sample(frac=1, random_state=random_state)

    return df_balanced.reset_index(drop=True)

In [46]:
def sms_classification(sms_text_df):
    """
    Train a spam/ham text classifier on labelled SMS messages.

    Parameters:
    - sms_text_df (pd.DataFrame): Must provide a 'text_message' column (the features)
      and a 'label' column (the target).

    Returns:
    - text_clf (Pipeline): A TF-IDF + LinearSVC pipeline fitted on the training
      portion of the data, ready to be used for predictions.

    The rows are split 67% / 33% with a fixed seed (random_state=42) and only the
    training portion is used for fitting. The held-out 33% is produced by the split
    but is not used inside this function.
    """
    messages = sms_text_df['text_message']
    labels = sms_text_df['label']

    # Reserve 33% of the rows as a hold-out set; only the training part is needed here.
    train_messages, _, train_labels, _ = train_test_split(
        messages, labels, test_size=0.33, random_state=42
    )

    # Vectorize the raw text with TF-IDF, then classify with a linear SVM.
    steps = [
        ('tfidf', TfidfVectorizer(stop_words=None)),
        ('clf', LinearSVC()),
    ]
    text_clf = Pipeline(steps)

    # Fitting the pipeline fits the vectorizer and the classifier on the training data.
    text_clf.fit(train_messages, train_labels)
    return text_clf

In [47]:
# Read the raw SMS data, then give its two columns descriptive names.
sms_text_df = pd.read_csv('SMSSpamCollection.csv')
sms_text_df = sms_text_df.set_axis(['label', 'text_message'], axis='columns')

# Preview the last few rows.
sms_text_df.tail()

Unnamed: 0,label,text_message
5567,spam,"This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate."
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other suggestions?"
5570,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free
5571,ham,Rofl. Its true to its name


In [48]:
  # The raw data is imbalanced, so rebalance the classes before training
  # and show the resulting count per label.
  sms_text_df = manual_oversample(sms_text_df, 'label')
  display(sms_text_df['label'].value_counts())

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
spam,4825
ham,4825


In [49]:
# Train the classifier on the DataFrame and keep the fitted pipeline in "text_clf"
text_clf = sms_classification(sms_text_df=sms_text_df)

In [50]:
# Create a function called `sms_prediction` that takes in the SMS text and a fitted model and predicts whether the text is "not spam" or "spam".
# The function should return a message that quotes the SMS text and says whether it is "not spam" or "spam".
def sms_prediction(text, model):
    """
    Predict the spam/ham classification of a given text message using a pre-trained model.

    Parameters:
    - text (str): The text message to be classified.
    - model: A fitted classifier exposing ``predict(list_of_texts)``; the first
      element of its result is compared against the label 'ham'.

    Returns:
    - str: A message indicating whether the text message is classified as spam or not.
      If no text is supplied (None, empty, or whitespace only), or if the model
      raises while predicting, a string starting with "Error:" is returned instead.

    This function never raises: failures are reported through the returned string
    so that a calling UI can display them directly.
    """

    # An empty message carries no information, so do not ask the model to
    # classify it; report the problem to the caller instead.
    if text is None or not str(text).strip():
        return "Error: No text message was provided."

    try:
        # The model expects a collection of documents, so wrap the single
        # message in a list and take the first (only) prediction.
        prediction = model.predict([text])[0]

        # Any label other than 'ham' is reported as spam.
        if prediction == 'ham':
            return f'The text message: "{text}", is not spam.'
        else:
            return f'The text message: "{text}", is spam.'
    except Exception as e:
        # Deliberate best-effort: surface the failure as text rather than crashing.
        return f"Error: {str(e)}"



In [51]:
def sms_app(text):
    """Classify `text` with the notebook-level `text_clf` pipeline and return the result message."""
    result_message = sms_prediction(text, model=text_clf)
    return result_message

# Editable box where the user types the SMS message.
input_textbox = gr.Textbox(
    interactive=True,
    label="Enter SMS Text",
    placeholder="Type your SMS message here...",
)

# Read-only box that shows the classification result.
output_textbox = gr.Textbox(
    interactive=False,
    label="Prediction Result",
    placeholder="The prediction will appear here...",
)

# Short description displayed with the app title.
description = "This application classifies SMS messages as spam or not spam. Enter your message above and click Submit!"

# Assemble the interface, then launch it with a public share link.
demo = gr.Interface(
    fn=sms_app,
    inputs=input_textbox,
    outputs=output_textbox,
    title="SMS Spam Classification",
    description=description,
)
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2a20f3f4d622ec2310.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




## Test the following text messages.

---

1. You are a lucky winner of $5000! **- This was not flagged as spam. I tried to overfit the model, but that did not change the result. I assume it is a training issue.**
2. You won 2 free tickets to the Super Bowl. **- This was flagged as spam.**
3. You won 2 free tickets to the Super Bowl text us to claim your prize. **- This was flagged as spam.**
4. Thanks for registering. Text 4343 to receive free updates on medicare. **- This was flagged as spam.**