In [6]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')

nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:


# Example SMS dataset (you will replace this with your own dataset)
data = [
    {"sms": "Your account has been blocked. Call us immediately.", "label": "fraud"},
    {"sms": "Congratulations! You have won a lottery. Click here to claim.", "label": "fraud"},
    {"sms": "Hello, your monthly statement is ready for viewing.", "label": "non-fraud"},
    {"sms": "Security alert: Your account was accessed from an unrecognized device.", "label": "fraud"},
    {"sms": "Your credit card bill is due in 5 days. Please make the payment.", "label": "non-fraud"},
    {"sms": "Suspicious activity detected on your account. Please confirm if it was you.", "label": "fraud"},
    {"sms": "Your insurance premium has been successfully paid.", "label": "non-fraud"},
    {"sms": "Immediate action required: Your account has been compromised.", "label": "fraud"},
]

# Convert data to DataFrame
df = pd.DataFrame(data)

# Preprocessing: Tokenization and stopwords removal
stop_words = set(stopwords.words('english'))

def preprocess_sms(sms):
    # Tokenize the SMS
    tokens = word_tokenize(sms.lower())
    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words and word.isalpha()]
    return ' '.join(filtered_tokens)

# Apply preprocessing
df['sms'] = df['sms'].apply(preprocess_sms)

# Feature Extraction: TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['sms'])

# Labels: Fraud or Non-Fraud
y = df['label']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model: Naive Bayes Classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)



# Function to Detect Fraud in New SMS
def detect_fraud(new_sms):
    new_sms_processed = preprocess_sms(new_sms)
    new_sms_vectorized = vectorizer.transform([new_sms_processed])
    prediction = model.predict(new_sms_vectorized)
    return prediction[0]

# Example usage:
test_sms = "Your bank account has been compromised. Call this number immediately."
# test_sms = "Your Insurance premium has been successfully paid."

print("Fraud Prediction:", detect_fraud(test_sms))  # It should return 'fraud' if it's a fraud message


Fraud Prediction: fraud
