In [None]:
import pandas as pd
import numpy as np
import re
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Location of the labelled email dataset (Google Drive mount in Colab).
# Kept in one named constant so the path is easy to change.
DATA_PATH = "/content/drive/MyDrive/dataset/emails.csv"

# Load the raw emails; the frame is referenced as `d` by the cells below.
d = pd.read_csv(DATA_PATH)
d.head(5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
# Dimensions of the loaded frame as (rows, columns).
d.shape

(5728, 2)

In [None]:
# Column labels of the loaded frame.
d.columns

Index(['text', 'spam'], dtype='object')

In [None]:
# Words dropped by clean_text. Built once at definition time instead of on
# every call, and frozen because it is never meant to change.
_STOP_WORDS = frozenset(
    ['the', 'and', 'to', 'of', 'a', 'in', 'that', 'is', 'it', 'with', 'for']
)


def clean_text(text):
    """Normalise one raw email string into lower-case, space-separated tokens.

    Processing order:
      1. remove anything that looks like an email address (``\\S+@\\S+``);
      2. remove URLs starting with ``http`` in any letter case;
      3. remove every character that is not an ASCII letter, digit or
         whitespace;
      4. collapse whitespace runs, lower-case, and drop the stop words in
         ``_STOP_WORDS``.

    Args:
        text: The raw email text. Non-string values (for example ``None`` or
            the float NaN pandas uses for a missing cell) are treated as
            empty text instead of raising ``TypeError``.

    Returns:
        The cleaned text as a single string; ``""`` when nothing is left.
    """
    # Missing cells arrive as NaN/None; there is nothing to clean in them.
    if not isinstance(text, str):
        return ""

    text = re.sub(r"\S+@\S+", "", text)
    # Case-insensitive so that "HTTP://..." is removed too; otherwise the
    # punctuation filter below would squash the URL into one junk token.
    text = re.sub(r"http\S+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Only ASCII letters/digits remain at this point, so lower() is safe.
    tokens = text.lower().split()
    return ' '.join(token for token in tokens if token not in _STOP_WORDS)

# Run the cleaner over every raw email and keep the result next to the
# original text in a new `clean_text` column.
d['clean_text'] = d['text'].map(clean_text)
d

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trading gunslinger fanny merrill...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im wa...
3,Subject: 4 color printing special request add...,1,subject 4 color printing special request addit...
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fro...
...,...,...,...
5723,Subject: re : research and development charges...,0,subject re research development charges gpg he...
5724,"Subject: re : receipts from visit jim , than...",0,subject re receipts from visit jim thanks agai...
5725,Subject: re : enron case study update wow ! a...,0,subject re enron case study update wow all on ...
5726,"Subject: re : interest david , please , call...",0,subject re interest david please call shirley ...


In [None]:
# Preview the cleaned text column.
d['clean_text']

0       subject naturally irresistible your corporate ...
1       subject stock trading gunslinger fanny merrill...
2       subject unbelievable new homes made easy im wa...
3       subject 4 color printing special request addit...
4       subject do not have money get software cds fro...
                              ...                        
5723    subject re research development charges gpg he...
5724    subject re receipts from visit jim thanks agai...
5725    subject re enron case study update wow all on ...
5726    subject re interest david please call shirley ...
5727    subject news aurora 5 2 update aurora version ...
Name: clean_text, Length: 5728, dtype: object

In [None]:
# Remove only the leading "subject" header token from each cleaned email.
# The pattern is anchored with ^ so later occurrences of the word inside the
# message body are kept. regex=True is passed explicitly because the pandas
# default for `regex` differs between major versions.
d['clean_text'] = d['clean_text'].str.replace(r'^subject(?: |$)', '', regex=True)
d

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,naturally irresistible your corporate identity...
1,Subject: the stock trading gunslinger fanny i...,1,stock trading gunslinger fanny merrill but muz...
2,Subject: unbelievable new homes made easy im ...,1,unbelievable new homes made easy im wanting sh...
3,Subject: 4 color printing special request add...,1,4 color printing special request additional in...
4,"Subject: do not have money , get software cds ...",1,do not have money get software cds from here s...
...,...,...,...
5723,Subject: re : research and development charges...,0,re research development charges gpg here forwa...
5724,"Subject: re : receipts from visit jim , than...",0,re receipts from visit jim thanks again invita...
5725,Subject: re : enron case study update wow ! a...,0,re enron case study update wow all on same day...
5726,"Subject: re : interest david , please , call...",0,re interest david please call shirley crenshaw...


In [None]:
# Model input is the cleaned text; the target is the `spam` column.
X, y = d['clean_text'], d['spam']
X

0       naturally irresistible your corporate identity...
1       stock trading gunslinger fanny merrill but muz...
2       unbelievable new homes made easy im wanting sh...
3       4 color printing special request additional in...
4       do not have money get software cds from here s...
                              ...                        
5723    re research development charges gpg here forwa...
5724    re receipts from visit jim thanks again invita...
5725    re enron case study update wow all on same day...
5726    re interest david please call shirley crenshaw...
5727    news aurora 5 2 update aurora version 5 2 fast...
Name: clean_text, Length: 5728, dtype: object

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Inspect the training portion of the cleaned text.
X_train

3931    volatility curves linked from reuters hi tanya...
4056    organizational announcement fyi forwarded by o...
4546    lng meeting hello all lng meeting was be held ...
2471    fwd re optical network engineering enron resea...
2481    re argentina modelling michael what about 1 00...
                              ...                        
3772    john sherriff s copper position ted bjorn any ...
5191    national forum on corporate finance mr fastow ...
5226    re my first draft quentin i forwarded your res...
5390    why johan dahl mri energy staffing group vince...
860     perfect visual solution your business now work...
Name: clean_text, Length: 4009, dtype: object

In [None]:
# Inspect the held-out test portion of the cleaned text.
X_test

4445    re energy derivatives conference may 29 toront...
4118    financial maths course part 2 vince just case ...
3893    re bullet points please respond hi vince thank...
4210    re enron default swaps darrell i am sending yo...
5603    re power question steve elena chilkina can giv...
                              ...                        
1221    well trry hello welcome pharmon wallow line s ...
4464    alp presentation hi vince i ll take care invit...
1119    perfect logo charset koi 8 r thinking breathin...
103     sshs get low cost software cds or download fin...
5153    seismic data via satellite i am preparing summ...
Name: clean_text, Length: 1719, dtype: object

In [None]:
# TF-IDF vectoriser: ignores English stop words and any term that appears in
# more than 70% of the documents.
feature_extraction = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# The vocabulary is learned from the training split only; the test split is
# transformed with that same fitted vocabulary.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast both label vectors to integers.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [None]:
# Display both label vectors; note the tuple order is (test, train).
y_test, y_train

(4445    0
 4118    0
 3893    0
 4210    0
 5603    0
        ..
 1221    1
 4464    0
 1119    1
 103     1
 5153    0
 Name: spam, Length: 1719, dtype: int64,
 3931    0
 4056    0
 4546    0
 2471    0
 2481    0
        ..
 3772    0
 5191    0
 5226    0
 5390    0
 860     1
 Name: spam, Length: 4009, dtype: int64)

In [None]:
# Print the TF-IDF features produced for the test split.
print(X_test_features)

  (0, 31089)	0.04747933728973281
  (0, 30894)	0.06253326570712785
  (0, 30601)	0.043155792939259925
  (0, 30494)	0.16306446255602183
  (0, 30137)	0.0527751541706852
  (0, 30134)	0.019424481659237547
  (0, 30093)	0.05752309424798538
  (0, 29797)	0.08304074645391432
  (0, 29733)	0.05524078273985635
  (0, 29519)	0.037031309346207894
  (0, 28749)	0.2232923619356294
  (0, 28356)	0.024083416069257266
  (0, 28353)	0.03213452403653875
  (0, 28339)	0.1039127714075673
  (0, 28162)	0.048569934228263834
  (0, 28029)	0.08153223127801092
  (0, 27401)	0.06177137598544647
  (0, 27139)	0.06912111747685475
  (0, 26765)	0.046548462970856526
  (0, 26752)	0.1687465026916976
  (0, 26657)	0.040180939398178274
  (0, 26122)	0.0636362955654572
  (0, 26108)	0.04623376065407368
  (0, 26065)	0.06827469833940071
  (0, 25751)	0.04895818833665853
  :	:
  (1718, 18151)	0.06060628581729134
  (1718, 17311)	0.11465254205171307
  (1718, 17161)	0.09879031728781391
  (1718, 17157)	0.09173339611565713
  (1718, 16764)	0.15060

In [None]:
# Print the TF-IDF features produced for the training split.
print(X_train_features)

  (0, 1370)	0.14595028518129705
  (0, 1928)	0.22189033093639085
  (0, 2668)	0.3791606253762144
  (0, 9823)	0.1588997419216165
  (0, 12270)	0.11524022212250487
  (0, 4220)	0.12099531897343449
  (0, 24014)	0.06899457635066793
  (0, 3676)	0.12429227032823445
  (0, 22170)	0.05660792126197912
  (0, 27983)	0.0921502722625152
  (0, 18858)	0.17567905026864852
  (0, 7929)	0.10430587485253556
  (0, 9166)	0.11901041623760268
  (0, 8964)	0.12073868992266647
  (0, 12890)	0.06993889496353764
  (0, 14549)	0.12231822467814801
  (0, 7474)	0.1803813067810224
  (0, 17770)	0.09326992416332779
  (0, 30512)	0.08254984180336765
  (0, 31005)	0.11195152521066457
  (0, 8192)	0.15204698976549946
  (0, 28191)	0.1895803126881072
  (0, 28433)	0.1600146859600325
  (0, 17581)	0.13992512296420967
  (0, 11687)	0.1578020866164291
  :	:
  (4008, 30576)	0.1049842833317911
  (4008, 27122)	0.13772942549782247
  (4008, 17748)	0.13772942549782247
  (4008, 27088)	0.10119195655748675
  (4008, 21658)	0.13847708564073213
  (4008,

In [None]:
# Logistic-regression classifier, constructed with no arguments so all
# hyperparameters are scikit-learn's defaults.
model = LogisticRegression()

In [None]:
# Fit the classifier on the TF-IDF training features and their labels.
model.fit(X_train_features, y_train)

In [None]:
# Predict labels for the same rows the model was fitted on; these predictions
# feed the training-accuracy check.
prediction_on_training_data = model.predict(X_train_features)

In [None]:
# Display the predicted labels for the training rows.
prediction_on_training_data

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
# Predict labels for the held-out test features.
prediction_on_test_data = model.predict(X_test_features)

In [None]:
# Display the predicted labels for the test rows.
prediction_on_test_data

array([0, 0, 0, ..., 1, 1, 0])

In [None]:
# Fraction of training rows whose predicted label matches the true label.
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
# Display the training accuracy.
accuracy_on_training_data

0.9955101022698928

In [None]:
# Fraction of held-out test rows whose predicted label matches the true label.
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)
accuracy_on_test_data

0.9726585223967423