Download training dataset: [fraud_call.file](https://www.kaggle.com/code/narayanyadav/detect-fraud-call/data)

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('fraud_call.file',
                   sep='\t',
                   header=None, on_bad_lines='skip',
                   names=['label','content'])

In [3]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords # non informative most common words like 'the', 'is'
from sklearn.feature_extraction.text import TfidfVectorizer

lemmatizer = WordNetLemmatizer()
vectorizer = TfidfVectorizer(max_features=1500)

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [9]:
import re
from num2words import num2words

def remove_special_characters(column):
    special_characters = r'[£$&+,:;=?@#|<>.^*()%!-]'
    return re.sub(special_characters, '', column)

def remove_special_escaped(column):
    escaped = ['&lt;', '&gt;', '&amp;']
    for i in escaped: column = column.replace(i, '')
    return column

def many_nums(nums : str):
    return ' '.join([num2words(i) for i in nums])

def less_nums(nums : str):
    return num2words(nums)

def cost_speaked(column):
    digits_comma_separated = r"((\d*\.?\d+|\d{1,3}(,\d{3})*(\.\d+)?))"
    return re.sub(digits_comma_separated,
                 lambda x: less_nums(x.group()), column)

def phones_speaked(column):
    more_than_four_digits = r"\d{4,}"
    return re.sub(more_than_four_digits, 
                  lambda x: many_nums(x.group()), column)

In [7]:
cost_speaked('2000').upper()

'TWO THOUSAND'

In [43]:
remove_special_characters(data.content[1].lower())

'todays vodafone numbers ending with 4882 are selected to a receive a 350 award if your number matches call 09064019014 to receive your 350 award'

In [37]:
# firstly I think to use .replace but it operates only full column content
data_speaked = data[['content']].applymap(lambda x: x.lower())\
                                .applymap(remove_special_escaped)\
                                .applymap(remove_special_characters)\
                                .applymap(phones_speaked)\
                                .applymap(cost_speaked)

In [38]:
data_speaked.content[1]

'todays vodafone numbers ending with four eight eight two are selected to a receive a three hundred and fifty award if your number matches call zero nine zero six four zero one nine zero one four to receive your three hundred and fifty award'

In [40]:
def lemmatize(column):
    return ' '.join([lemmatizer.lemmatize(word) for word in column.split() \
                    if word not in stopwords.words('english')])

In [41]:
data_lemmatized = data_speaked.applymap(lemmatize)
data_lemmatized.content[1]

'today vodafone number ending four eight eight two selected receive three hundred fifty award number match call zero nine zero six four zero one nine zero one four receive three hundred fifty award'

In [23]:
data_final = data_lemmatized.copy()
data_final['label'] = pd.get_dummies(data.label).fraud
data_final.to_csv('fraud_clear.csv') #remove junk lines (with decimal, etc) later 
data_final.content

0       hello bank manager sbi ur debit card expire wo...
1       today vodafone number ending four eight eight ...
2                                please say like hi hi hi
3                                                   thank
4                       oh forwarded message thought send
                              ...                        
5919    get one zero zero zero inr voucher please call...
5920    get free access google cloud account hit given...
5921    get free aws cloud account hit given message b...
5922    get free access microsoft azure hit given mess...
5923    hello sir bank fill application form credit ca...
Name: content, Length: 5924, dtype: object

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

x = vectorizer.fit_transform(data_final.content).toarray()
x_train, x_test, y_train, y_test = train_test_split(x, data_final.label, 
                                                    test_size=.25, random_state=50)
fraud_classifier = MultinomialNB().fit(x_train, y_train)
fraud_classifier.score(x_test, y_test)

0.9756920999324781

In [28]:
import pickle

with open('classifier.pkl', 'wb') as fid:
    pickle.dump(fraud_classifier, fid)
with open('vectorizer.pkl', 'wb') as fid:
    pickle.dump(vectorizer, fid)