In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
spam_df = pd.read_csv("spam.csv", encoding='latin')

In [3]:
spam_df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
spam_data = spam_df.iloc[:,:2]

In [5]:
spam_data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
spam = pd.DataFrame({"Text": spam_data.iloc[:,1], "Target":spam_data.iloc[:,0]})

In [7]:
spam

Unnamed: 0,Text,Target
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,spam
5568,Will Ì_ b going to esplanade fr home?,ham
5569,"Pity, * was in mood for that. So...any other s...",ham
5570,The guy did some bitching but I acted like i'd...,ham


In [8]:
import spacy
spacy.cli.download("en_core_web_md")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [9]:
import spacy
nlp = spacy.load("en_core_web_md")
def clean_data(text):
  doc = nlp(text)
  return " ".join(token.lemma_ for token in doc)

In [10]:
spam['clean_text'] = spam.Text.apply(lambda x: clean_data(x))

In [11]:
spam

Unnamed: 0,Text,Target,clean_text
0,"Go until jurong point, crazy.. Available only ...",ham,"go until jurong point , crazy .. available onl..."
1,Ok lar... Joking wif u oni...,ham,ok lar ... joke wif u oni ...
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...,ham,U dun say so early hor ... U c already then sa...
4,"Nah I don't think he goes to usf, he lives aro...",ham,"nah I do not think he go to usf , he live arou..."
...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,spam,this be the 2nd time we have try 2 contact u. ...
5568,Will Ì_ b going to esplanade fr home?,ham,will Ì _ b go to esplanade fr home ?
5569,"Pity, * was in mood for that. So...any other s...",ham,"pity , * be in mood for that . so ... any othe..."
5570,The guy did some bitching but I acted like i'd...,ham,the guy do some bitching but I act like I woul...


In [12]:
spam = spam[['clean_text','Target']]

In [13]:
spam

Unnamed: 0,clean_text,Target
0,"go until jurong point , crazy .. available onl...",ham
1,ok lar ... joke wif u oni ...,ham
2,free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor ... U c already then sa...,ham
4,"nah I do not think he go to usf , he live arou...",ham
...,...,...
5567,this be the 2nd time we have try 2 contact u. ...,spam
5568,will Ì _ b go to esplanade fr home ?,ham
5569,"pity , * be in mood for that . so ... any othe...",ham
5570,the guy do some bitching but I act like I woul...,ham


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vec = TfidfVectorizer(stop_words = 'english', min_df=0.01) #min_df means drop all those words which are not present atleast 1% of the data
vec.fit(spam['clean_text'])
vectorized_data = vec.transform(spam['clean_text'])

In [16]:
vectorized_data = vectorized_data.toarray()

In [17]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
spam['Target'] = le.fit_transform(spam['Target'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spam['Target'] = le.fit_transform(spam['Target'])


In [18]:
spam['Target']

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Target, Length: 5572, dtype: int64

In [19]:
x_train, x_test, y_train, y_test = train_test_split(vectorized_data, spam['Target'], test_size = 0.3, random_state = 42)

In [20]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

In [21]:
y_test

3245    0
944     0
1044    1
2484    0
812     1
       ..
2505    0
2525    0
4975    0
650     0
4463    0
Name: Target, Length: 1672, dtype: int64

In [22]:
y_pred = lr.predict(x_test)

In [23]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1453
           1       0.92      0.71      0.80       219

    accuracy                           0.95      1672
   macro avg       0.94      0.85      0.89      1672
weighted avg       0.95      0.95      0.95      1672



In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rf = RandomForestClassifier()

In [27]:
rf.fit(x_train, y_train)

In [28]:
rf_pred = rf.predict(x_test)

In [29]:
rf_pred

array([0, 0, 1, ..., 0, 0, 0])

In [30]:
print(classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1453
           1       0.91      0.82      0.87       219

    accuracy                           0.97      1672
   macro avg       0.94      0.91      0.92      1672
weighted avg       0.97      0.97      0.97      1672



GENERATING REAL TIME PREDICTION OF SPAM or NOT

In [38]:
new_messege = "Since u have won a cash of 10000. pls call our customer service and share the otp to get."

In [39]:
clean_messege = clean_data(new_messege)

In [40]:
vec.transform([clean_messege])

<1x111 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [41]:
new = vec.transform([clean_messege]).toarray()

In [44]:
if lr.predict(new)[0] == 1:
  print("Spam")
else:
  print("Not Spam")

Spam


In [45]:
messege = "Hey Brother. How are you?"

In [48]:
clean_messege = clean_data(messege)

In [49]:
vec.transform([clean_messege])

<1x111 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [50]:
new = vec.transform([clean_messege]).toarray()

In [51]:
if lr.predict(new)[0] == 1:
  print("Spam")
else:
  print("Not Spam")

Not Spam
