<a href="https://colab.research.google.com/github/indhuv27/machine-learning-codsoft/blob/main/sms_spam_detection_task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMPORTING THE MODULES

In [1]:
import pandas as pd
import nltk
import re
import numpy as np
from nltk.corpus import stopwords




LOADING THE DATASET

In [2]:
# LOADING THE DATASET
# The CSV is not valid UTF-8, so decode it as Latin-1.
# 'latin-1' and 'ISO-8859-1' are aliases of the same codec, and that codec maps
# every possible byte to a character, so it cannot raise UnicodeDecodeError.
# The previous nested try/except fallback was therefore dead code, and had it
# ever run it would only have printed a message and left `df` undefined.
df = pd.read_csv('/content/spam.csv', encoding='latin-1')

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keep only the label column (v1) and the message-text column (v2);
# the remaining "Unnamed" columns are dropped.
df = df.loc[:, ['v1', 'v2']]

In [5]:
# Replace the terse source headers with descriptive column names.
column_names = {'v1': 'label', 'v2': 'messages'}
df = df.rename(columns=column_names)

PREPARING THE DATASET

In [6]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

Unnamed: 0,0
label,0
messages,0


In [7]:
# `nltk` is already imported in the notebook's first cell, so the duplicate
# import is dropped; this cell only fetches the stop-word corpus.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Build the stop-word set once, under its own name. The original rebound the
# imported `stopwords` corpus reader to this set, so running the cell a second
# time failed with AttributeError ('set' object has no attribute 'words').
ENGLISH_STOPWORDS=set(stopwords.words('english'))

def clean_text(text):
  """Normalise one message for vectorisation.

  Every character that is not an ASCII letter is replaced by a space, the
  result is lower-cased and split on whitespace, and tokens found in
  ``ENGLISH_STOPWORDS`` are discarded.

  Args:
    text: The raw message string.

  Returns:
    The remaining tokens joined by single spaces (an empty string when no
    token survives).
  """
  letters_only=re.sub('[^a-zA-Z]',' ',text)
  tokens=letters_only.lower().split()
  return " ".join(word for word in tokens if word not in ENGLISH_STOPWORDS)







In [9]:
# Run every raw message through clean_text and store the result in a new column,
# then preview the frame with the added column.
df['clean_text']=df['messages'].map(clean_text)
df.head(n=5)

Unnamed: 0,label,messages,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


INPUT SPLIT

In [10]:
# Features are the cleaned messages; the target is the ham/spam label.
x,y=df['clean_text'],df['label']

MODEL TRAINING

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

def classify(model,x,y,test_size=0.25,random_state=42):
  """Fit a count -> TF-IDF -> classifier pipeline and print hold-out metrics.

  The data is split with a shuffled, label-stratified train/test split. The
  pipeline is fitted on the training part, then accuracy (as a percentage)
  and a classification report for the test part are printed.

  Args:
    model: Estimator used as the final ('clf') step of the pipeline.
    x: Text samples passed to CountVectorizer.
    y: Labels aligned with ``x``; also used for stratification.
    test_size: Fraction of samples held out for evaluation (default 0.25,
      the value that was previously hard-coded).
    random_state: Seed for the split (default 42, previously hard-coded).

  Returns:
    The fitted Pipeline.
  """
  # Function-scope import: the notebook's import cell only pulls
  # classification_report from sklearn.metrics.
  from sklearn.metrics import accuracy_score

  # train test split
  x_train,x_test,y_train,y_test=train_test_split(
      x,y,test_size=test_size,random_state=random_state,shuffle=True,stratify=y)
  # model training
  pipeline_model=Pipeline([('vect',CountVectorizer()),
                           ('tfidf',TfidfTransformer()),
                           ('clf',model)])
  pipeline_model.fit(x_train,y_train)
  # Predict once and reuse the result for both metrics; the original called
  # score() and then predict(), pushing the test set through the pipeline twice.
  y_pred=pipeline_model.predict(x_test)
  print('Accuracy:',accuracy_score(y_test,y_pred)*100)
  print(classification_report(y_test,y_pred))
  return pipeline_model









In [12]:
from sklearn.linear_model import LogisticRegression
# Pass the instance created here rather than constructing a second, throwaway
# LogisticRegression, so `model` is no longer an unused variable.
model=LogisticRegression()
classify(model,x,y)



Accuracy: 96.8413496051687
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1206
        spam       0.99      0.77      0.87       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.97      1393

