# Gradio End-to-end Machine Learning Tutorial

## 1. Build Machine Learning Model

### a. Exploratory Data Analysis

In [2]:
# Load the raw SMS spam dataset and preview its first rows
import pandas as pd

SPAM_CSV_PATH = "./data/spam.csv"

# The file is decoded as latin-1 (presumably it is not valid UTF-8 — TODO confirm)
spam_data = pd.read_csv(SPAM_CSV_PATH, encoding="latin-1")
spam_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Keep only the label column (v1) and the message column (v2).
# .copy() yields an independent frame, so the column assignments made in later
# cells do not operate on a selection of the original frame
# (this is what avoids pandas' SettingWithCopyWarning).
spam_data = spam_data[['v1', 'v2']].copy()
spam_data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Give the columns descriptive names: v1 -> target (the label), v2 -> text (the message).
# The result is reassigned instead of using inplace=True, which keeps the step
# explicit and avoids mutating a frame that came from a column selection.
spam_data = spam_data.rename(columns={'v1': 'target', 'v2': 'text'})

spam_data.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Encode the labels numerically: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
encoded_target = spam_data['target'].map(label_codes)

# Series.map turns every value that is missing from the dict into NaN.
# Validate BEFORE assigning, so an unexpected label (or re-running this cell on
# an already encoded column) fails loudly instead of silently wiping the target.
if encoded_target.isna().any():
    raise ValueError("Unexpected values in 'target': expected only 'ham' or 'spam'")

spam_data['target'] = encoded_target

spam_data.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Show the first message in full (the table preview truncates long texts)
first_message = spam_data['text'].iloc[0]
print(first_message)

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


### b. Data Preprocessing

In [8]:
import re
import nltk
# Fetch the NLTK stop-word corpus; it is read through nltk.corpus.stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zoumanakeita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# English stop words, stored as a set for fast membership tests
stop_words = set(stopwords.words('english'))


def clean_text(raw_text):
    """Normalise a single message.

    Steps, applied in this order: strip punctuation, lowercase, drop English
    stop words, strip digits, drop words of two characters or fewer.
    """
    # 1. Remove punctuation (anything that is neither a word character nor whitespace)
    no_punct = re.sub(r'[^\w\s]', '', raw_text)

    # 2. Convert to lowercase
    lowered = no_punct.lower()

    # 3. Remove stop words
    # NOTE(review): punctuation is already gone at this point, so a contraction
    # such as "don't" arrives as "dont" and is kept — confirm this is intended.
    without_stops = ' '.join(token for token in lowered.split() if token not in stop_words)

    # 4. Remove digits
    without_digits = re.sub(r'\d+', '', without_stops)

    # 5. Keep only words longer than 2 characters
    return ' '.join(token for token in without_digits.split() if len(token) > 2)


spam_data['text'] = spam_data['text'].apply(clean_text)


In [10]:
# Preview the first five rows after cleaning
spam_data.head(5)

Unnamed: 0,target,text
0,0,jurong point crazy available bugis great world...
1,0,lar joking wif oni
2,1,free entry wkly comp win cup final tkts may te...
3,0,dun say early hor already say
4,0,nah dont think goes usf lives around though


In [11]:
# Show the first message again, now in its cleaned form
cleaned_first_message = spam_data['text'].iloc[0]
print(cleaned_first_message)

jurong point crazy available bugis great world buffet cine got amore wat


### c. Build Machine Learning Model

In [13]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [14]:
ran_state = 2023  # seed reused by every step that involves randomness
t_size = 0.2      # fraction of the rows held out for testing

texts = spam_data['text']
y = spam_data['target']

# Split the raw texts first so the vectorizer never sees the test messages.
# stratify=y keeps the ham/spam ratio the same in the training and test splits.
texts_train, texts_test, y_train, y_test = train_test_split(texts, y,
                                                            test_size=t_size,
                                                            random_state=ran_state,
                                                            stratify=y)

# Learn the vocabulary from the training texts only, then reuse it for the
# test texts. Fitting on the full dataset would leak test-set vocabulary into
# the features used for training.
cv = CountVectorizer()
X_train = cv.fit_transform(texts_train)
X_test = cv.transform(texts_test)

# Document-term matrix of the whole dataset, so that `X` still holds the
# vectorized corpus for any code that reads it.
X = cv.transform(texts)

#### c.1. Logistic Regression Model

In [15]:
# 1. Fit a logistic regression classifier (default settings) on the training split
lr_model = LogisticRegression().fit(X_train, y_train)

# 2. Predict labels for the held-out test split
y_lr_model_predict = lr_model.predict(X_test)

In [16]:
# 3. Report the test-set metrics of the logistic regression model
print('Logistic Regression:')
for metric_label, metric_fn in (('Accuracy:', accuracy_score),
                                ('Precision:', precision_score),
                                ('Recall:', recall_score),
                                ('F1 score:', f1_score)):
    print(metric_label, metric_fn(y_test, y_lr_model_predict))

Logistic Regression:
Accuracy: 0.979372197309417
Precision: 0.9846153846153847
Recall: 0.8590604026845637
F1 score: 0.9175627240143368


#### c.2. Linear SVM Model

In [18]:
# 1. Train a linear SVM. Passing the shared seed makes the run repeatable
#    (LinearSVC's solver can shuffle the data using a random number generator).
svm_model = LinearSVC(random_state=ran_state)
svm_model.fit(X_train, y_train)

# 2. Predict labels for the held-out test split
y_svm_model_predict = svm_model.predict(X_test)

In [19]:
# 3. Report the test-set metrics of the linear SVM model
print('Linear SVM:')
for metric_label, metric_fn in (('Accuracy:', accuracy_score),
                                ('Precision:', precision_score),
                                ('Recall:', recall_score),
                                ('F1 score:', f1_score)):
    print(metric_label, metric_fn(y_test, y_svm_model_predict))

Linear SVM:
Accuracy: 0.9820627802690582
Precision: 0.9849624060150376
Recall: 0.8791946308724832
F1 score: 0.9290780141843972


#### c.3. Conclusion
The Linear SVM model outperforms the Logistic Regression model on every reported metric (accuracy, precision, recall, and F1 score).

### d. Use the model on real-life data

In [20]:
# 1. Serialize the best model
import joblib

In [21]:
import os

model_file_name = "svm_best_model.joblib"
model_folder = "./models/"

# joblib.dump does not create missing directories, so make sure the folder exists
os.makedirs(model_folder, exist_ok=True)

# NOTE(review): only the classifier is saved here. The fitted vectorizer `cv`
# is needed to turn text into features, so it has to be persisted as well if
# the model is ever used outside this notebook session.
joblib.dump(svm_model, os.path.join(model_folder, model_file_name))

['./models/svm_best_model.joblib']

In [22]:
# 2. Load the model
# Pass the path rather than an open() handle, so no file handle is left unclosed.
# joblib files are pickle-based: only load files from a source you trust.
loaded_svm_model = joblib.load(model_folder + model_file_name)

def make_prediction(input_text):
    """Classify a message as ham or spam and return a display string.

    The text receives the same cleaning as the training data (punctuation
    removal, lowercasing, stop-word removal, digit removal, dropping words of
    two characters or fewer). It is then vectorized with the fitted
    CountVectorizer ``cv`` and scored by ``loaded_svm_model``.

    Parameters
    ----------
    input_text : str
        Raw message text.

    Returns
    -------
    str
        "This message is a : Ham ✅" when the model predicts 0, otherwise
        "This message is a : Spam 🚨".
    """
    # Mirror the training-time preprocessing so the features match what the
    # model was trained on (raw text would be tokenized differently, e.g.
    # around punctuation and digits).
    cleaned_text = re.sub(r'[^\w\s]', '', input_text).lower()
    cleaned_text = ' '.join(word for word in cleaned_text.split() if word not in stop_words)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    cleaned_text = ' '.join(word for word in cleaned_text.split() if len(word) > 2)

    features = cv.transform([cleaned_text])
    prediction = loaded_svm_model.predict(features)

    # Label 0 was assigned to "ham" and 1 to "spam" when the target was encoded
    info = "Ham ✅" if prediction[0] == 0 else "Spam 🚨"

    return "This message is a : {}".format(info)
    
    
    
# Quick sanity check on a made-up message
text_example = "Thank you for your message. You have won $1000000000 send your bank information asap!!!"

example_verdict = make_prediction(text_example)
print(example_verdict)

This message is a : Spam 🚨


## 2. Build Gradio App

In [23]:
# Uncomment to install Gradio into the kernel's environment:
# %pip install gradio

In [24]:
import gradio as gr

In [26]:
# Title displayed at the top of the app
headline = "Spam Detector App"

# One text box in, one text box out, wired to make_prediction
iface = gr.Interface(
    fn=make_prediction,
    inputs="text",
    outputs="text",
    title=headline,
)

# share=True also publishes a temporary public URL in addition to the local one
iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://b4f8e7594bdcb1cae3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


