<a href="https://colab.research.google.com/github/hfwalvir/Python-PortfolioProjects/blob/main/Spam_data_identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# DATA COLLECTION AND PREPROCESSING

In [None]:
# Location of the mail dataset CSV (a Colab-style absolute path).
MAIL_DATA_PATH = '/content/sample_data/mail_data.csv'

# Load the dataset into a DataFrame; later cells derive `mail_data` from it.
raw_mail_data = pd.read_csv(MAIL_DATA_PATH)

In [None]:
# Show the freshly loaded DataFrame (the notebook renders a cell's last expression).
raw_mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Replace every missing value with an empty string, leaving the raw frame untouched.
has_value = raw_mail_data.notna()
mail_data = raw_mail_data.where(has_value, other='')

In [None]:
# Show the DataFrame after missing values were replaced with empty strings.
mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# (rows, columns) of the cleaned DataFrame.
mail_data.shape

(5572, 2)

# **LABEL ENCODING**

In [None]:
# Label encoding: spam mail becomes 0 (ham mail is encoded as 1 separately).
# The Category column of mail_data is updated in place.
is_spam = mail_data['Category'] == 'spam'
mail_data.loc[is_spam, 'Category'] = 0

In [None]:
# Label encoding: ham mail becomes 1. The Category column is updated in place.
is_ham = mail_data['Category'] == 'ham'
mail_data.loc[is_ham, 'Category'] = 1

In [None]:
# Split the frame into the model input (message text) and the target (category).
X, Y = mail_data['Message'], mail_data['Category']

# **TRAIN AND TEST DATA**

In [None]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Sanity check: sizes of the full feature column and of each split, in order.
for message_subset in (X, X_train, X_test):
    print(message_subset.shape)

(5572,)
(4457,)
(1115,)


# **FEATURE EXTRACTION**

In [None]:
# TF-IDF features: lowercase the text and drop English stop words.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast the encoded labels (0 = spam, 1 = ham) to integers for the classifiers.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [None]:
# Print the transformed training features; each output line is a
# "(row, column)  value" entry of the TF-IDF matrix.
print(X_train_features)


  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

# **TRAINING THE ML MODELS**

In [None]:
# Logistic regression classifier with scikit-learn's default settings.
lr_model = LogisticRegression()

In [None]:
# Train the logistic regression model on the TF-IDF training features and labels.
lr_model.fit(X_train_features,Y_train)


In [None]:
# Multinomial Naive Bayes classifier with scikit-learn's default settings.
# MultinomialNB is imported in the notebook's top import cell with the other
# scikit-learn imports.
mnb_model = MultinomialNB()

In [None]:
# Train the Multinomial Naive Bayes model on the same TF-IDF training features and labels.
mnb_model.fit(X_train_features,Y_train)


# **Evaluating the models**

In [None]:
# Logistic regression: predict on the training features and score the
# predictions against the training labels.
pred_training_data = lr_model.predict(X=X_train_features)
accuracy_training = accuracy_score(
    y_true=Y_train,
    y_pred=pred_training_data,
)

In [None]:
# Logistic regression accuracy on the training data.
print(accuracy_training)

0.9670181736594121


In [None]:
# Prediction on test data with the logistic regression model, the counterpart
# of the training-data evaluation. The Naive Bayes model has its own test
# evaluation cell, so this cell must score lr_model rather than mnb_model.
# Re-run the cell to refresh any previously saved output.
pred_test_data = lr_model.predict(X_test_features)
accuracy_test = accuracy_score(Y_test, pred_test_data)
print(accuracy_test)

0.9730941704035875


In [None]:
# Multinomial Naive Bayes: predict on the test features and report accuracy.
# NOTE: `accuracy_test` is reassigned here, replacing the value stored by the
# earlier test-evaluation cell.
nmb_pred = mnb_model.predict(X=X_test_features)
accuracy_test = accuracy_score(y_true=Y_test, y_pred=nmb_pred)
print(accuracy_test)

0.9730941704035875


# **BUILDING A PREDICTIVE SYSTEM**

In [None]:
# Sample message to classify; wrapped in a list because the vectorizer
# transforms a collection of documents.
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

In [None]:
# Convert the text to feature vectors with the already-fitted TF-IDF vectorizer.
input_data_features= feature_extraction.transform(input_mail)

In [None]:
# Classify the sample message with the logistic regression model.
prediction = lr_model.predict(input_data_features)
print(prediction)

# Labels were encoded as 1 = ham, 0 = spam.
print("Ham mail" if prediction[0] == 1 else "Spam mail")

[1]
Ham mail


In [None]:
# Classify the same sample message with the Multinomial Naive Bayes model.
# Labels were encoded as 1 = ham, 0 = spam.
prediction = mnb_model.predict(input_data_features)
print('Ham email' if prediction[0] == 1 else 'Spam email')

Ham email
