In [24]:
import pandas as pd 
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [25]:
import os 
import warnings
warnings.filterwarnings(action='ignore')

data = pd.read_csv("english_mail.csv")
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
data.loc[data['Category'] == 'ham', 'Category'] = 1
data.loc[data['Category'] == 'spam', 'Category'] = 0
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
data.isna().sum()

Category    0
Message     0
dtype: int64

In [28]:
text_features = TfidfVectorizer(
    min_df = 1,
    stop_words = 'english', #default value
    lowercase = True
)
data_text = text_features.fit_transform(data['Message'])

data_dict = text_features.get_feature_names_out()
data['Category'] = data['Category'].astype(int)

data_dict[4224]

'joys'

In [31]:
params = {
    'penalty': ['l1', 'l2'],
    'solver': ['saga'],
    'C': np.logspace(-2, 2, 10)
}

cv = KFold(n_splits=4, shuffle=True)
searcher = GridSearchCV(LogisticRegression(max_iter=5000), cv=cv, n_jobs=-1, scoring='precision', param_grid=params)
searcher.fit(data_text, data['Category'])

searcher.best_params_

{'C': np.float64(100.0), 'penalty': 'l2', 'solver': 'saga'}

In [32]:
model = LogisticRegression(C=100, penalty='l2', solver='saga')

cv_score = cross_val_score(model, data_text, data['Category'], scoring='precision', cv=7)
print("CV score:\n\t", "\n\t".join("%.4f" % x for x in cv_score))
print('Mean CV: %.4f' % np.mean(cv_score))

CV score:
	 0.9828
	0.9801
	0.9828
	0.9787
	0.9759
	0.9829
	0.9772
Mean CV: 0.9801


In [34]:
from sklearn.pipeline import Pipeline
import pickle

pipe = Pipeline(steps=[
    ('tf-idf', text_features),
    ('model', model)
])

with open('../pickles/pipeline.pkl', 'wb') as f:
    pickle.dump(pipe, f)