In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


In [35]:
raw_mail_data = pd.read_csv('mail_data.csv')
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [36]:
raw_mail_data.shape



(5572, 2)

In [37]:
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)), '')
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [38]:
raw_mail_data.shape

(5572, 2)

In [39]:
mail_data.loc[mail_data['Category'] == 'spam' , 'Category'] = 0
mail_data.loc[mail_data['Category'] == 'ham' , 'Category'] = 1
mail_data['Category'].value_counts()

Category
1    4825
0     747
Name: count, dtype: int64

In [40]:
X = mail_data["Message"]
Y = mail_data["Category"]


x_train , X_test , y_train, y_test = train_test_split(X,Y , test_size= 0.2 , stratify=Y , random_state=2)



In [41]:
feature_ext = TfidfVectorizer(min_df=1 , stop_words='english' , lowercase=True)
x_train_features = feature_ext.fit_transform(x_train)
x_test_features = feature_ext.transform(X_test)

y_train = y_train.astype('int')
y_test = y_test.astype('int')


In [42]:
print(x_train_features)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34895 stored elements and shape (4457, 7496)>
  Coords	Values
  (0, 4768)	0.2885879313347367
  (0, 7438)	0.2996693624522654
  (0, 2262)	0.49316930861935127
  (0, 3764)	0.22046319970004669
  (0, 2823)	0.5172500796081709
  (0, 7289)	0.5172500796081709
  (1, 3317)	0.3290434493347565
  (1, 4972)	0.49481520325330874
  (1, 1558)	0.42364007209989546
  (1, 6517)	0.49481520325330874
  (1, 4136)	0.4717788963273523
  (2, 3103)	0.17628376831968728
  (2, 841)	0.26799944639874834
  (2, 4099)	0.186263215205624
  (2, 3086)	0.27449720225122765
  (2, 2136)	0.180851695270251
  (2, 3398)	0.20665621299033204
  (2, 4269)	0.2543939099135892
  (2, 3118)	0.18009671431232455
  (2, 3935)	0.3671145612703168
  (2, 3722)	0.24768901862403342
  (2, 6641)	0.20096909705626312
  (2, 1430)	0.28509060215711635
  (2, 5837)	0.1845655907506494
  (2, 4943)	0.33789703751914013
  :	:
  (4454, 841)	0.21705430485365426
  (4454, 3514)	0.17954863693268575
  (4454, 7163)	

In [43]:
#training
model = LogisticRegression()
model.fit(x_train_features , y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [44]:

prediction = model.predict(x_train_features)
accuracy = accuracy_score(y_train,prediction)
print(accuracy)

0.9672425398249944


In [45]:
prediction_test = model.predict(x_test_features)
accuracy_test = accuracy_score(prediction_test , y_test)
print(accuracy_test)

0.9704035874439462


In [None]:
user_in = [""]
user_in_features = feature_ext.transform(user_in)
prediction = model.predict(user_in_features)
print(prediction)
if(prediction[0] == 1):
    print("genuine mail")
else:
    print("spam mail")
 

[0]
spam mail
