# HW16

### Author: Joseph Wong

## Import Packages

In [2]:
# Basic package imports
import os
import numpy as np
import pandas as pd

# Visualization packages
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn: Core utilities for model building and evaluation
from sklearn.model_selection import train_test_split    # Train/test data splitting
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler  # Feature transformations and scaling
from sklearn.metrics import (                            # Model evaluation metrics
    mean_squared_error, r2_score, accuracy_score, 
    precision_score, recall_score, confusion_matrix, 
    classification_report
)

# Scikit-learn: Linear and polynomial models
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor       # For KNN

# Scikit-learn: Synthetic dataset generators
from sklearn.datasets import make_classification, make_regression

# Scikit-learn: Naive Bayes models
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

# Text Processing Packages and Code
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer

## Import the Data

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Path to dataset files:", path)

file = path + '/' + os.listdir(path)[0]
df = pd.read_csv(file, encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'message']
df.head()

Path to dataset files: C:\Users\josee\.cache\kagglehub\datasets\uciml\sms-spam-collection-dataset\versions\1


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.groupby('label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [5]:
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [6]:
import nltk
nltk.download('stopwords')

def process(text):
    '''
    Preprocess text by first making sure it is lower case.
    Then remove punctuation and words that are too common (stopwords)

    Stopwords are common words in a language that are usually filtered 
    out before processing text because they carry little semantic meaning for many tasks
    '''
    # lowercase it
    text = text.lower()
    # remove punctuation
    text = ''.join([t for t in text if t not in string.punctuation])
    # remove stopwords
    text = [t for t in text.split() if t not in stopwords.words('english')]
    # stemming
    st = Stemmer()
    text = [st.stem(t) for t in text]
    # return token list
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\josee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['message'], 
                                                    df['label'], 
                                                    test_size=0.20, 
                                                    random_state = 16)

In [9]:
# Create the vectorizer
# Tell it to use the process function from above
tfidfv = TfidfVectorizer(analyzer=process)
# Fit and convert our messages
data_train = tfidfv.fit_transform(X_train)

In [12]:
# Create the model
mnb = MultinomialNB()
mnb.fit(data_train,y_train)

In [13]:
data_test = tfidfv.transform(X_test)
preds=mnb.predict(data_test)

In [14]:
count = 0
for i in range(len(y_test)):
    if y_test.iloc[i] != preds[i]:
        count += 1
print('Total number of test cases', len(y_test))
print('Number of wrong of predictions', count)

Total number of test cases 1115
Number of wrong of predictions 40


In [15]:
print(classification_report(preds, y_test))

              precision    recall  f1-score   support

         ham       1.00      0.96      0.98      1002
        spam       0.74      1.00      0.85       113

    accuracy                           0.96      1115
   macro avg       0.87      0.98      0.91      1115
weighted avg       0.97      0.96      0.97      1115

