In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

In [29]:
# example text for model training (SMS messages)
simple_train = ['call you tonight', 'Call me a cab', 'Please call me... PLEASE!']

In [30]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()

# learn the 'vocabulary' of the training data (occurs in-place)
vect.fit(simple_train)

# examine the fitted vocabulary
vect.get_feature_names_out()

array(['cab', 'call', 'me', 'please', 'tonight', 'you'], dtype=object)

In [31]:
# transform training data into a 'document-term matrix'
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [32]:
# convert sparse matrix to a dense matrix
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [33]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [34]:
# check the type of the document-term matrix
print(type(simple_train_dtm))

# examine the sparse matrix contents
print(simple_train_dtm)

<class 'scipy.sparse._csr.csr_matrix'>
  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


In [35]:
# example text for model testing
simple_test = ["please don't call me"]

In [36]:
# transform testing data into a document-term matrix (using existing vocabulary)
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 1, 1, 1, 0, 0]], dtype=int64)

In [37]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


In [38]:
#read file into pandas using a relative path
sms = pd.read_csv("final_dataset.csv", on_bad_lines= 'warn')
sms.dropna(how="any", inplace=True, axis =1)
sms.columns = ["is_smishing", "text", "char_length"]

sms.head()

Unnamed: 0,is_smishing,text,char_length
0,0,"Sizt, grbe yung traffic, asar! 🚗 Pagdating mo,...",73
1,0,"Chika lang, gurl! Alam mo ba yung bagong lodi?...",67
2,0,"Uy, dude! Tara sa beach, malupet mag chill dun...",60
3,0,"Bes, yung food trip natin next week, on or off...",70
4,0,"Wazzup, pare? G natin sa club later, solid yun...",50


In [39]:
sms.describe()

Unnamed: 0,is_smishing,char_length
count,6620.0,6620.0
mean,0.220393,83.727039
std,0.414543,58.480882
min,0.0,2.0
25%,0.0,38.0
50%,0.0,67.0
75%,0.0,127.0
max,1.0,910.0


In [40]:
sms.groupby('text').describe()

Unnamed: 0_level_0,is_smishing,is_smishing,is_smishing,is_smishing,is_smishing,is_smishing,is_smishing,is_smishing,char_length,char_length,char_length,char_length,char_length,char_length,char_length,char_length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
text,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
"\t""For the most sparkling shopping breaks from 45 per person; call 0121 2025050 or visit www.shortbreaks.org.uk""",1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,111.0,,111.0,111.0,111.0,111.0,111.0
\t**FREE MESSAGE**Thanks for using the Auction Subscription Service. 18 . 150p/MSGRCVD 2 Skip an Auction txt OUT. 2 Unsubscribe txt STOP CustomerCare 08718726270,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,160.0,,160.0,160.0,160.0,160.0,160.0
\t-PLS STOP bootydelious (32/F) is inviting you to be her friend. Reply YES-434 or NO-434 See her: www.SMS.ac/u/bootydelious STOP? Send STOP FRND to 62468,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,153.0,,153.0,153.0,153.0,153.0,153.0
\t07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,173.0,,173.0,173.0,173.0,173.0,173.0
"\t09066362231 URGENT! Your mobile No 07xxxxxxxxx won a Â£2,000 bonus caller prize on 02/06/03! this is the 2nd attempt to reach YOU! call 09066362231 ASAP!",1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,154.0,,154.0,154.0,154.0,154.0,154.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ãœ thk of wat to eat tonight.,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,29.0,,29.0,29.0,29.0,29.0,29.0
Ãœ v ma fan...,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,14.0,,14.0,14.0,14.0,14.0,14.0
Ãœ wait 4 me in sch i finish ard 5..,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,36.0,,36.0,36.0,36.0,36.0,36.0
â€¦ and donâ€˜t worry weâ€˜ll have finished by march â€¦ ish!,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,61.0,,61.0,61.0,61.0,61.0,61.0


In [41]:
#Data Preprocessing
import string
from nltk.corpus import stopwords


def text_process(mess):
    #takes a string of text, performing the following"
    STOPWORDS = stopwords.words('english')
    #check characters for punctuation
    nopunc = [char for char in mess if char not in string.punctuation]
    
    #joining to form string
    nopunc = ''.join(nopunc)
    
    #remove any stopwords
    return ''.join([word for word in nopunc.split() if word.lower() not in STOPWORDS])

In [42]:
sms.head()

Unnamed: 0,is_smishing,text,char_length
0,0,"Sizt, grbe yung traffic, asar! 🚗 Pagdating mo,...",73
1,0,"Chika lang, gurl! Alam mo ba yung bagong lodi?...",67
2,0,"Uy, dude! Tara sa beach, malupet mag chill dun...",60
3,0,"Bes, yung food trip natin next week, on or off...",70
4,0,"Wazzup, pare? G natin sa club later, solid yun...",50


In [43]:
#text process convertion with stopwords applied

sms['clean_msg'] = sms.text.apply(text_process)

sms.head()

Unnamed: 0,is_smishing,text,char_length,clean_msg
0,0,"Sizt, grbe yung traffic, asar! 🚗 Pagdating mo,...",73,Siztgrbeyungtrafficasar🚗Pagdatingmokitakitssac...
1,0,"Chika lang, gurl! Alam mo ba yung bagong lodi?...",67,ChikalanggurlAlammobayungbagonglodiAngsayaswea...
2,0,"Uy, dude! Tara sa beach, malupet mag chill dun...",60,UydudeTarasabeachmalupetmagchilldunWavesss🏖️🌊
3,0,"Bes, yung food trip natin next week, on or off...",70,BesyungfoodtripnatinnextweekKungexciteeed🍲🍜
4,0,"Wazzup, pare? G natin sa club later, solid yun...",50,WazzuppareGnatinsaclublatersolidyun🎉🕺


In [44]:
type(stopwords.words('english'))

list

In [45]:
from collections import Counter

words = sms[sms.is_smishing == 0].clean_msg.apply(lambda x: [word.lower() for word in x.split()])
true_words = Counter()

for msg in words:
   true_words.update(msg)
   
print(true_words.most_common(50))

[('ok', 3), ('kamustakana', 2), ('nasabahaydinakongayon', 2), ('anggulongautocorrectngphoneko', 2), ('watru', 2), ('dontwanthearanything', 2), ('u', 2), ('arunutransframt', 2), ('sorryinmeetingillcalllater', 2), ('sirwaitingmail', 2), ('reversecheatingmathematics', 2), ('comingfridayleavepongaldogetnewsworkplace', 2), ('greatesttestcourageearthbeardefeatwithoutlosingheartgntc', 2), ('imcominghome4dinner', 2), ('ucall', 2), ('sorryillcalllaternight', 2), ('xmasstorypeacexmasmsglovexmasmiraclejesushavblessedmonthaheadampwishumerryxmas', 2), ('okiethanx', 2), ('thatscoolgentlemantreatdignityrespect', 2), ('way', 2), ('remainsbroamongstbros', 2), ('signmaturitystartsayingbigthingsactuallystartunderstandingsmallthingsniceeveningbslvyl', 2), ('takeexammarch3', 2), ('beautifultruthgravityreadcarefullyheartfeelslightsomeonefeelsheavysomeoneleaves', 2), ('come', 2), ('camehostel', 2), ('1thingchangesentencewant2concentrateeducationalcareerimleaving', 2), ('oklor', 2), ('siztgrbeyungtrafficasar🚗

In [46]:
words = sms[sms.is_smishing== 1].clean_msg.apply(lambda x: [word.lower() for word in x.split()])
false_words = Counter()

for msg in words:
    false_words.update(msg)
    
print(false_words.most_common(50))

[('usecretadmirerrevealthinksurspecialcall09065174042optreplyrevealstop150permsgrecdcustcare07821230901', 3), ('freemessageactivate500freetextmessagesreplyingmessagewordfreetermsconditionsvisitwww07781482378com', 3), ('urgoing2bahamascallfreefone08081560665speakliveoperatorclaimeitherbahamascruiseofâ£2000cash18onlyopttxtx07786200117', 3), ('mobile11monthsurentitledupdatelatestcolourmobilescamerafreecallmobileupdatecofree08002986030', 3), ('1newvoicemailpleasecall08719181513', 3), ('u447801259231secretadmirerlooking2makecontactufindrrevealthinksurspecialcall09058094597', 3), ('usecretadmirerlooking2makecontactufindrrevealthinksurspecialcall09058094599', 3), ('hellosirmamselectedparttimejobdailysalary30008000contactwsconsultationhttp9cblife44m6z', 2), ('gawinangiyongunangdepositong177psa177betmakakuhang100depositbonusdoblehinangiyongdepositongayonhuwagitongpalampasin177betoxyz', 2), ('bibigyankangmayang100nabonussaiyongunangdeposito177betsxyz', 2), ('joingojackpotsexcitementnewusersclaim

In [47]:
#vectorization
from sklearn.model_selection import train_test_split

#define x and y (from the sms data) to use with COUNTVECTORIZER
X = sms.clean_msg
y = sms.is_smishing
print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6620,)
(6620,)
(4965,)
(1655,)
(4965,)
(1655,)


In [48]:
from sklearn.feature_extraction.text import CountVectorizer

#instantiate
v = CountVectorizer()
v.fit(X_train)

#learn training data vocabulary
X_train_dtm = v.transform(X_train)

#equivalently: combine fit and transform into a single step
X_train_dtm = v.fit_transform(X_train)

#examine the document-term matrix
print(type(X_train_dtm), X_train_dtm.shape)

#transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = v.transform(X_test)
print(type(X_test_dtm), X_test_dtm.shape)

<class 'scipy.sparse._csr.csr_matrix'> (4965, 5180)
<class 'scipy.sparse._csr.csr_matrix'> (1655, 5180)


In [49]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_dtm)
tfidf_transformer.transform(X_train_dtm)

<4965x5180 sparse matrix of type '<class 'numpy.float64'>'
	with 5394 stored elements in Compressed Sparse Row format>

In [50]:
#using logistic regression
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear')

#train using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)


CPU times: total: 31.2 ms
Wall time: 13 ms


In [51]:
#make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

#calculate predicted 
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([0.20043537, 0.20043537, 0.20043537, ..., 0.20043537, 0.20043537,
       0.20043537])

In [53]:
from sklearn import metrics
#accuracy
print("Accuracy Score: ", metrics.accuracy_score(y_test, y_pred_class))

#confusion 
print("Confusion Score")
print(metrics.confusion_matrix(y_test, y_pred_class))

#calculate AUC
print("ROC AUC score: ", metrics.roc_auc_score(y_test, y_pred_prob))


Accuracy Score:  0.7903323262839879
Confusion Matrix:  [[1290    0]
 [ 347   18]]
ROC AUC score:  0.6348104491876394
