In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load dataset
df = pd.read_csv("cleaned_call_logs.csv")

In [4]:
# Preprocessing function
def clean_text(text):
    text = re.sub(r"caller:|receiver:", "", text, flags=re.IGNORECASE)  # Remove labels
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    stop_words = set(stopwords.words("english"))
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

df["cleaned_dialogue"] = df["dialogue"].apply(clean_text)

In [5]:
# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["cleaned_dialogue"])
y = df["labels"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions and probability
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # Probability of scam

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9920634920634921
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        96
           1       0.99      1.00      0.99       156

    accuracy                           0.99       252
   macro avg       0.99      0.99      0.99       252
weighted avg       0.99      0.99      0.99       252



In [21]:
# Example Prediction
def predict_scam(dialogue):
    cleaned = clean_text(dialogue)
    vectorized = vectorizer.transform([cleaned])
    prob = model.predict_proba(vectorized)[0, 1]  # Probability of being scam
    return prob

sample_text = """I am selling some Taylor Swift tickets, are you interested?
I can give you a discount. Please send me your credit card details.
I will send you the tickets after payment, because I am a trusted seller.
If you don't act now, you will miss this opportunity!
"""
print("Scam Probability:", predict_scam(sample_text))

sample_text = """Hello, I am calling from the bank. We have detected some unusual activity on your account.
Please provide your account number and PIN so that we can verify your identity.
"""
print("Scam Probability:", predict_scam(sample_text))

sample_text = """Hi, I am calling from the local charity. We are raising funds for the homeless.
Would you like to make a donation to support our cause?
"""
print("Scam Probability:", predict_scam(sample_text))

Scam Probability: 0.7392390533645842
Scam Probability: 0.9447632534059093
Scam Probability: 0.5111092006115073
