Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Copy of Spam_classification.csv')
df.head()

Unnamed: 0,label,origin
0,0,Received: from rodan.UU.NET by aramis.rutgers....
1,1,Received: from unknown (HELO groucho.cs.psu.ed...
2,1,Received: \n\tfrom 24-151-178-89.dhcp.kgpt.tn....
3,0,Received: from psuvax1.cs.psu.edu ([130.203.2....
4,1,Received: from 201-1-198-159.dsl.telesp.net.br...


# DATA PREPROCESSING

In [4]:
nltk.download('stopwords') # we use this to download that particular words that are not important in NLP

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
nltk.download('punkt') # It allows you to tokenize text efficiently.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
df.shape

(37822, 2)

In [8]:
length = df['origin'].str.len()
print(length)

0        3656
1        2135
2        1405
3        1408
4        3042
         ... 
37817    2184
37818    2713
37819    1026
37820    1415
37821    3526
Name: origin, Length: 37822, dtype: int64


In [9]:
m = length.max()

In [10]:
t = length.min()

In [11]:
mean = length.mean()
print(mean)

5004.21130558934


In [16]:
threshold_min = 3000  # take out the particular set in which the length of the string is between 3000 to 50000
threshold_max = 50000
filtered_df = df[df['origin'].str.len().between(threshold_min, threshold_max)]
print(filtered_df)

       label                                             origin
0          0  Received: from rodan.UU.NET by aramis.rutgers....
4          1  Received: from 201-1-198-159.dsl.telesp.net.br...
11         1  Received: from unknown (HELO cbn.org) ([86.108...
18         1  Received: from 201-1-198-159.dsl.telesp.net.br...
28         1  Received: from unknown (HELO CEYLAN.dz3eo.org)...
...      ...                                                ...
37802      1  Received: from idc-ceemea.com (cpe-24-27-107-2...
37805      1  Received: from mail.oh-oku.com (61-30-232-81.s...
37813      1  Received: from 346EE3C8 (unknown [59.92.121.17...
37814      1  Received: from 222.69.161.115 (unknown [222.69...
37821      1  Received: from 3F9CFCB8 (201-43-166-100.dsl.te...

[12472 rows x 2 columns]


In [17]:
filtered_df.isnull().sum() # NO null values, no filling of the empty spaces

label     0
origin    0
dtype: int64

In [18]:
X = filtered_df.drop(columns='label', axis =1)
Y = filtered_df['label']

In [19]:
port_stem = PorterStemmer() # this function helps to perform certain preprocessing on the text data

In [20]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower() # lowercase the text
  tokens = word_tokenize(stemmed_content) # tokenize the words
  stemmed_content = stemmed_content.split() # split it
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')] # remove the stopwords
  stemmed_content = ' '.join(stemmed_content) #do join words so that td-idf vectorizer workl on it unless it will print the list
  return stemmed_content

In [21]:
filtered_df.loc[:, 'origin'] = filtered_df['origin'].apply(stemming) # we apply the stemming function on the origin of the filtered data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.loc[:, 'origin'] = filtered_df['origin'].apply(stemming)


In [22]:
print(filtered_df['origin']) # output after stemming

0        receiv rodan uu net arami rutger edu smi ru id...
4        receiv dsl telesp net br helo f f groucho cs p...
11       receiv unknown helo cbn org groucho cs psu edu...
18       receiv dsl telesp net br helo f f groucho cs p...
28       receiv unknown helo ceylan dz eo org groucho c...
                               ...                        
37802    receiv idc ceemea com cpe houston re rr com ku...
37805    receiv mail oh oku com static tfn net tw psi p...
37813    receiv ee c unknown kukui ifa hawaii edu p sun...
37814    receiv unknown kukui ifa hawaii edu p sun esmt...
37821    receiv f cfcb dsl telesp net br kukui ifa hawa...
Name: origin, Length: 12472, dtype: object


In [23]:
X = df['origin'].values
Y = df['label'].values

In [24]:
print(X)

['Received: from rodan.UU.NET by aramis.rutgers.edu (5.59/SMI4.0/RU1.4/3.08)\n\tid AA23563; Mon, 27 Jul 92 22:28:01 EDT\nReceived: from relay2.UU.NET by rodan.UU.NET with SMTP\n\t(5.61/UUNET-mail-drop) id AA06229; Mon, 27 Jul 92 22:27:59 -0400\nReceived: from uunet.uu.net (via LOCALHOST.UU.NET) by relay2.UU.NET with SMTP\n\t(5.61/UUNET-internet-primary) id AA26262; Mon, 27 Jul 92 22:28:08 -0400\nReceived: from sarto.UUCP by uunet.uu.net with UUCP/RMAIL\n\t(queueing-rmail) id 222745.1653; Mon, 27 Jul 1992 22:27:45 EDT\nNewsgroups: soc.religion.christian\nPath: jhpb\nFrom: jhpb@sarto.budd-lake.nj.us (Joseph H. Buehler)\nSubject: new Catholic mailing list now up and running\nMessage-Id: <JHPB.92Jul27221355@sarto.budd-lake.nj.us>\nSender: jhpb@sarto.budd-lake.nj.us (Joseph H Buehler)\nOrganization: none\nDate: Tue, 28 Jul 1992 03:13:55 GMT\nContent-Type: text\nContent-Length: 2745\nApparently-To: <soc-religion-christian>\n\nThe mailing list I queried about a few weeks ago is now up and\nru

In [25]:
print(Y)

[0 1 1 ... 1 1 1]


In [26]:
# converting the textual data to feature vectors
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [27]:
print(X)

  (0, 1429193)	0.009786135448832755
  (0, 1429147)	0.03241197225304772
  (0, 1418568)	0.02245543618389874
  (0, 1417790)	0.020194958423906315
  (0, 1355622)	0.028493171018211277
  (0, 1348094)	0.018955429043212176
  (0, 1348070)	0.019292546876202434
  (0, 1347404)	0.021759980879068788
  (0, 1346030)	0.036102436262704055
  (0, 1345593)	0.023717368199657522
  (0, 1345565)	0.01643171535428194
  (0, 1342088)	0.016911915658363252
  (0, 1342067)	0.04653754222279035
  (0, 1341612)	0.023710932124161664
  (0, 1339976)	0.011355035270452668
  (0, 1329120)	0.03219188728993026
  (0, 1327469)	0.03010303224379228
  (0, 1292269)	0.02061581839389348
  (0, 1287848)	0.015843220260260873
  (0, 1266496)	0.16668790031388206
  (0, 1266190)	0.06672633463241688
  (0, 1265930)	0.23857388373445393
  (0, 1264074)	0.037825242671706436
  (0, 1264040)	0.02623002099341076
  (0, 1263626)	0.031070217110152322
  :	:
  (37821, 147845)	0.02092186040476118
  (37821, 146725)	0.016734216622049387
  (37821, 126141)	0.06835793

In [28]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,stratify = Y,random_state = 2) # train-test-split is used to split the data into training data and testing data around the split of 80:20

# MODEL TRAINING

In [29]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
}

In [None]:

for name, model in models.items():

    model.fit(X_train, Y_train)


    y_pred = model.predict(X_test)


    accuracy = accuracy_score(Y_test, y_pred)


    print(f"{name}: Accuracy - {accuracy:.4f}")

Logistic Regression: Accuracy - 0.9905
Decision Tree: Accuracy - 0.9859
Random Forest: Accuracy - 0.9935
Support Vector Machine: Accuracy - 0.9947
