# NLP: URL spam classification (Naive Bayes and linear SVM)

In [2]:
import pandas as pd 
import regex as reg
import re
import matplotlib.pyplot as plt
import unicodedata
import nltk
import string
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import model_selection, svm
from sklearn.metrics import classification_report, accuracy_score

from nltk.corpus import stopwords

In [3]:
# Labelled URL dataset, downloaded directly from the tutorial repository on GitHub.
DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv'
df = pd.read_csv(DATA_URL)

In [4]:
# Random preview of the raw rows. The fixed seed keeps the displayed sample
# identical on every Restart & Run All instead of changing with each run.
df.sample(10, random_state=123)

Unnamed: 0,url,is_spam
1243,https://www.usatoday.com/story/news/politics/2...,False
1854,https://www.morningbrew.com/daily/stories/2020...,False
2560,https://apnews.com/b50a256d97b7199ea2e8b0530c2...,False
980,https://news.trust.org/item/20200624154945-q8zfe/,False
1221,https://www.coronavirus.gov/,True
144,https://www.ft.com/content/eae603a4-a369-4801-...,False
1540,https://briefingday.com/m/v4n3i4f3,True
85,https://www.wired.co.uk/article/coronavirus-be...,False
539,https://bigspaceship.us9.list-manage.com/unsub...,True
755,https://www.marketwatch.com/story/dow-futures-...,False


In [5]:
# Column dtypes and non-null counts, to check for missing values before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB


In [6]:
# Per-column summary; comparing `count` with `unique` for `url` reveals repeated URLs.
df.describe()

Unnamed: 0,url,is_spam
count,2999,2999
unique,2369,2
top,https://www.bloomberg.com/tosv2.html,False
freq,26,2303


In [7]:
# Encode the label as an integer: 1 where the value equals True (spam), 0 otherwise.
df['is_spam'] = df['is_spam'].eq(True).astype('int64')

In [8]:
# Class balance: number of rows per label (0 = not spam, 1 = spam).
df['is_spam'].value_counts()

0    2303
1     696
Name: is_spam, dtype: int64

In [9]:
# Remove exact duplicate rows and renumber the index from 0. The old index is
# dropped directly rather than being materialised as a column and then discarded.
df = df.drop_duplicates().reset_index(drop=True)[['url', 'is_spam']]
df.shape

(2369, 2)

In [10]:
# Normalise case so the same URL written with different capitalisation yields the same tokens.
# NOTE(review): this runs after drop_duplicates(), so rows that differed only by
# case are still both present afterwards — confirm that is acceptable.
df['url'] = df['url'].str.lower()

In [11]:
# Work on a copy: `data` gets the token cleaning below, while `df` keeps the
# lowercased URLs as they are.
data = df.copy()

In [12]:
# Reduce each URL to space-separated ASCII words: every run of characters that
# is not a letter (digits, punctuation, '/', '.', '-', ...) becomes one space.
#
# This single vectorised substitution produces the same strings as the previous
# row-by-row loop. There, the first regex already removed every non-letter, so
# the following tag-stripping regex could never match and the last one only
# collapsed the leftover runs of spaces. It also no longer depends on the index
# being exactly 0..n-1.
data['url'] = data['url'].str.replace(r'[^a-zA-Z]+', ' ', regex=True)

data.head()

Unnamed: 0,url,is_spam
0,https briefingday us list manage com unsubscribe,1
1,https www hvper com,1
2,https briefingday com m v n i f,1
3,https briefingday com n m commentform,0
4,https briefingday com fan,1


In [13]:
# The 60 most frequent tokens across all cleaned URLs; useful for spotting
# boilerplate tokens such as the scheme and common domain parts.
data['url'].str.split(expand=True).stack().value_counts()[:60]

https          2336
com            2065
www            1512
the             354
html            296
news            274
a               252
us              248
to              218
of              173
coronavirus     172
e               150
org             146
c               136
article         131
b               124
in              115
and             113
morningbrew     105
story           105
nytimes         101
on              101
daily            99
d                98
stories          94
utm              90
for              90
youtube          89
v                89
trump            88
numlock          87
watch            86
f                78
is               77
new              76
p                69
world            68
substack         68
reuters          65
covid            63
s                62
briefingday      61
index            61
vox              59
en               59
articles         58
cnn              58
iduskbn          58
politics         56
co               56


In [14]:
# Hand-picked tokens to drop from the URLs before vectorising.
stop_words = [
    # URL boilerplate and common English words
    'http', 'www', 'com', 'you', 'your', 'for', 'not',
    'have', 'is', 'in', 'im', 'from', 'to', 'https',
    # frequent single-letter fragments
    'e', 'c', 'v', 'b', 'f', 'p',
]

In [15]:
# Treat every token shorter than three characters as a stop word as well.
token_counts = data['url'].str.split(expand=True).stack().value_counts()
stop_words.extend(token for token in token_counts.index if len(token) < 3)

In [16]:
# De-duplicate the stop words. Sorting gives the list a deterministic order;
# a bare list(set(...)) of strings can come out in a different order on each
# interpreter run because of string hash randomisation.
stop_words = sorted(set(stop_words))
len(stop_words)

268

In [17]:
def remove_stopwords(message):
    """Return `message` with every whitespace-separated word found in the
    module-level `stop_words` removed.

    The remaining words are joined with single spaces. `None` is passed
    through unchanged.
    """
    if message is None:
        return None

    kept = [token for token in message.strip().split() if token not in stop_words]
    return " ".join(kept)

In [18]:
# Strip the stop words from every cleaned URL.
data['url'] = data['url'].map(remove_stopwords)

In [19]:
# Spot-check a few cleaned rows. The fixed seed keeps the displayed sample
# identical on every Restart & Run All.
data.sample(5, random_state=123)

Unnamed: 0,url,is_spam
2285,wikipedia org wiki goodhart law,0
1317,wired article mask shaming,0
1579,morningbrew daily stories practice selling buy...,0
277,inverse mind body the first stay home dad,0
1037,axios aca enrollment coronavirus html,0


In [20]:
# The 60 most frequent tokens again, now that the stop words have been removed.
data['url'].str.split(expand=True).stack().value_counts()[:60]

the            354
html           296
news           274
coronavirus    172
org            146
article        131
and            113
morningbrew    105
story          105
nytimes        101
daily           99
stories         94
utm             90
youtube         89
trump           88
numlock         87
watch           86
new             76
substack        68
world           68
reuters         65
covid           63
briefingday     61
index           61
vox             59
cnn             58
articles        58
iduskbn         58
politics        56
cnbc            54
sunday          51
business        49
court           48
apnews          47
email           46
facebook        46
health          45
bbc             41
supreme         41
blog            40
are             40
black           39
medium          39
npr             38
police          38
digg            37
campaign        37
with            37
why             36
theverge        35
apple           35
cases           35
theguardian 

In [22]:
# Features are the cleaned URL strings; the target is the 0/1 spam label.
X, y = data['url'], data['is_spam']

# Stratify on the label so train and test keep the same spam ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    stratify=y,
)

In [23]:
# Bag-of-words counts. The vocabulary is learned from the training URLs only
# and then reused unchanged for the test URLs.
#
# The matrices are left sparse: MultinomialNB accepts sparse input, and calling
# .toarray() would allocate a dense rows x vocabulary array that is almost
# entirely zeros.
#
# NOTE: this cell rebinds X_train / X_test from text to count matrices, so
# re-run the split cell first if you need to execute it again.
vec = CountVectorizer(stop_words='english')
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

In [24]:
# Multinomial Naive Bayes with default hyperparameters, used on the token counts.
nb = MultinomialNB()

In [25]:
# Train the Naive Bayes model on the training split.
nb.fit(X_train, y_train)

In [26]:
# Naive Bayes performance on the data it was trained on (optimistic baseline).
predictions = nb.predict(X_train)
nb_train_report = classification_report(y_train, predictions)
print(nb_train_report)

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1593
           1       0.82      0.69      0.75       183

    accuracy                           0.95      1776
   macro avg       0.89      0.84      0.86      1776
weighted avg       0.95      0.95      0.95      1776



In [27]:
# Naive Bayes performance on the held-out test split.
predictions = nb.predict(X_test)
nb_test_report = classification_report(y_test, predictions)
print(nb_test_report)

              precision    recall  f1-score   support

           0       0.94      0.99      0.97       532
           1       0.87      0.44      0.59        61

    accuracy                           0.94       593
   macro avg       0.91      0.72      0.78       593
weighted avg       0.93      0.94      0.93       593



In [28]:
# Second experiment: bag-of-words on the lowercased URLs in `df`.
#
# Split the URL strings first and learn the vocabulary from the training part
# only, matching the Naive Bayes cells, so nothing from the test URLs leaks
# into the features. With the same random_state and the same number of rows,
# the assignment of rows to train and test is unchanged.
urls_train, urls_test, y_train, y_test = train_test_split(
    df['url'], df['is_spam'], test_size = 0.2, random_state = 121, shuffle = True
)

# Keep the fitted vectorizer in its own variable: it is required to turn any
# new URL into features the trained model understands.
url_vectorizer = CountVectorizer()
X_train = url_vectorizer.fit_transform(urls_train)
X_test = url_vectorizer.transform(urls_test)

# Document-term matrix for every row of `df`, kept under its original name.
message_vectorizer = url_vectorizer.transform(df['url'])

In [29]:
# Linear-kernel support vector classifier.
# `degree` applies only to the 'poly' kernel and `gamma` only to the
# 'rbf' / 'poly' / 'sigmoid' kernels, so both were ignored with kernel='linear'.
# They are omitted here to avoid suggesting they influence the model.
cl = svm.SVC(C=1.0, kernel='linear')

In [30]:
# Train the SVM, then report how well it fits its own training data.
cl.fit(X_train, y_train)

pred = cl.predict(X_train)
svm_train_report = classification_report(y_train, pred)
print(svm_train_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1701
           1       1.00      0.98      0.99       194

    accuracy                           1.00      1895
   macro avg       1.00      0.99      0.99      1895
weighted avg       1.00      1.00      1.00      1895



In [31]:
# SVM performance on the held-out test split.
pred = cl.predict(X_test)
svm_test_report = classification_report(y_test, pred)
print(svm_test_report)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97       424
           1       0.74      0.70      0.72        50

    accuracy                           0.94       474
   macro avg       0.85      0.84      0.84       474
weighted avg       0.94      0.94      0.94       474



In [32]:
# Persist the trained SVM. The context manager guarantees the file is flushed
# and closed even if pickling fails; a bare open() left the handle open.
# NOTE(review): only the classifier is saved. The CountVectorizer that produced
# its input features is not, so this file alone cannot score raw URLs.
with open('../models/nlp_model.pkl', 'wb') as model_file:
    pickle.dump(cl, model_file)