In [7]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [8]:
# Fetch the NLTK 'stopwords' corpus; the `stopwords` reader imported above is
# queried later via stopwords.words('english') when messages are cleaned.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Pull the labelled ham/spam dataset from GitHub and preview the first rows.
dataset_url = 'https://raw.githubusercontent.com/khushipitroda1/Datasets/main/spam.csv'
df = pd.read_csv(dataset_url)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Tally the null entries in every column of the dataset.
df.isna().sum()

Category    0
Message     0
dtype: int64

In [11]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
df['Category'] = df['Category'].map(label_codes)
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Stemming:

Stemming is the process of reducing a word to its root form (its stem).

Example: acts, acted, acting --> act

In [12]:
port_stem = PorterStemmer()

# Build the English stop-word lookup once. stopwords.words() returns a list, so
# calling it for every token repeats the load and a linear scan; a frozenset
# gives constant-time membership tests with identical filtering results.
english_stopwords = frozenset(stopwords.words('english'))


def stemming(content):
    """Clean a message and reduce each remaining word to its Porter stem.

    Steps: replace every non-letter character with a space, lower-case the
    text, split on whitespace, drop English stop words, stem what is left,
    and join the stems back together with single spaces.

    Parameters
    ----------
    content : str
        Raw message text.

    Returns
    -------
    str
        Space-separated stems; an empty string if nothing survives filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stems = [
        port_stem.stem(token)
        for token in tokens
        if token not in english_stopwords
    ]
    return ' '.join(stems)

In [13]:
# Replace every raw message with its cleaned, stemmed form.
df['Message'] = df['Message'].map(stemming)

In [14]:
# Pull the message text (features) and the category codes (labels) out as arrays.
X, Y = df['Message'].values, df['Category'].values

In [15]:
# Turn the cleaned text into TF-IDF features: learn the vocabulary and weights
# from the messages, then overwrite X with the resulting matrix.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so held-out rows influence the vocabulary and weights.
vectorizer = TfidfVectorizer()
X = vectorizer.fit(X).transform(X)

In [20]:
# Compare several classifier families, each over a small hyperparameter grid,
# using 5-fold cross-validated grid search on the TF-IDF features.

from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            # Three distinct values; a repeated entry only refits the same candidate.
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Fixed seed so the forest's cross-validation scores are reproducible.
        'model': RandomForestClassifier(random_state=2),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        # multi_class is left at its default ('auto'); passing it explicitly
        # is deprecated in recent scikit-learn releases.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10]
        }
    }
}

# One summary record per model family: best mean CV score and its parameters.
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X, Y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# Dedicated name so the dataset frame `df` is not overwritten by the results.
scores_df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
scores_df



Unnamed: 0,model,best_score,best_params
0,svm,0.9799,"{'C': 20, 'kernel': 'linear'}"
1,random_forest,0.97308,{'n_estimators': 10}
2,logistic_regression,0.978284,{'C': 10}


In [24]:
# Hold out 20% of the rows (stratified on the label, fixed seed), train a
# default logistic regression on the rest, and report held-out accuracy.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
model = LogisticRegression().fit(X_train, Y_train)
model.score(X_test, Y_test)

0.9650224215246637

In [19]:
# Classify one message typed by the user. The text goes through the same
# stemming() cleanup and the fitted vectorizer used for the training data.
# The prompt asks for a message, matching this ham/spam classifier.
message = stemming(input('Enter your message: '))
ans = model.predict(vectorizer.transform([message]))[0]

# Label 0 is ham and 1 is spam (see the Category encoding earlier).
if ans == 0:
    print('This is Ham email')
else:
    print('This is Spam email')

Enter your news: hey friend need help
This is Ham email


In [18]:
# Persist the trained classifier and the fitted vectorizer to disk.
# `with` guarantees each file is flushed and closed even if pickling fails.
# NOTE: only unpickle these files from a trusted source; pickle.load can
# execute arbitrary code.
import pickle

# save model
model_path = 'finalized_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

# save vectorizer
vectorizer_path = 'vectorizer.pkl'
with open(vectorizer_path, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)