In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
from transformers import BertTokenizer, BertModel
import torch


In [2]:
# Read the call-log CSV into the DataFrame that the later cells work from.
# The path is relative, so it is resolved against the notebook's working directory.
DATA_PATH = "cleaned_call_logs.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preprocessing function
_ENGLISH_STOP_WORDS = None  # filled on first use so the NLTK corpus is read only once


def clean_text(text, stop_words=None):
    """Normalise one call transcript into a space-separated string of tokens.

    Steps, in order: strip the ``caller:`` / ``receiver:`` speaker labels
    (case-insensitive), lowercase, delete digit runs, delete every character
    that is neither a word character nor whitespace, then drop stop words.

    Args:
        text: Raw transcript. Anything that is not a ``str`` (for example the
            NaN pandas produces for an empty CSV cell) is treated as an empty
            transcript instead of raising inside ``re.sub``.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used; it is loaded once
            and cached, rather than being rebuilt for every row.

    Returns:
        The cleaned transcript with tokens joined by single spaces; ``""``
        when nothing survives the filtering.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
        stop_words = _ENGLISH_STOP_WORDS

    if not isinstance(text, str):
        return ""

    text = re.sub(r"caller:|receiver:", "", text, flags=re.IGNORECASE)  # Remove labels
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\d+", "", text)  # Remove numbers
    # NOTE(review): punctuation is stripped before the stop-word filter, so an
    # apostrophe form such as "don't" becomes "dont" and cannot match a
    # stop-word entry that keeps its apostrophe - confirm this is intended.
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return " ".join(word for word in text.split() if word not in stop_words)  # Remove stopwords

# Run every raw transcript through clean_text and keep the result next to the
# original text, so the "dialogue" column itself is left unmodified.
df["cleaned_dialogue"] = df["dialogue"].map(clean_text)

In [8]:
# Both bag-of-words encoders are fitted on the same cleaned transcripts and
# share the same vocabulary cap.
# NOTE(review): the encoders are fitted on every row of df, i.e. before the
# train/test split performed in a later cell, so held-out rows contribute to
# the learned vocabulary (and to the TF-IDF weights) - consider fitting on the
# training rows only.
corpus = df["cleaned_dialogue"]
vectorizer_options = {"max_features": 5000}

# 1. CountVectorizer
count_vectorizer = CountVectorizer(**vectorizer_options)
X_count = count_vectorizer.fit_transform(corpus)

# 2. TF-IDF (current approach)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# 3. Word2Vec
# Tokenise each cleaned dialogue on whitespace; Word2Vec is trained on a list
# of token lists, one per document.
sentences = [text.split() for text in df["cleaned_dialogue"]]

# Training is stochastic. A fixed seed together with a single worker thread
# keeps the embeddings (and every metric computed from them) stable between
# runs; gensim additionally requires PYTHONHASHSEED to be set for identical
# results across interpreter launches. workers=1 trades training speed for
# that repeatability.
# min_count=1 keeps every token that occurs in the corpus in the vocabulary.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    seed=42,
    workers=1,
)

# Create document vectors by averaging word vectors.
# The width is read from the trained model so it cannot drift from the
# vector_size passed above. Documents with no known token keep an all-zero row.
X_w2v = np.zeros((len(sentences), w2v_model.wv.vector_size))
for idx, sentence in enumerate(sentences):
    vectors = [w2v_model.wv[word] for word in sentence if word in w2v_model.wv]
    if vectors:
        X_w2v[idx] = np.mean(vectors, axis=0)

In [10]:
# Hold out 20% of the rows for evaluation, once per feature representation.
# All three calls receive the same labels, test_size and random_state, so the
# row partition should be the same for each representation.
labels = df["labels"]
split_options = {"test_size": 0.2, "random_state": 42}

X_train_count, X_test_count, y_train_count, y_test_count = train_test_split(
    X_count, labels, **split_options
)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf, labels, **split_options
)
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_w2v, labels, **split_options
)

In [13]:
# Fit one logistic-regression classifier per feature representation.
# fit() returns the estimator itself, so construction and training are chained.
model_count = LogisticRegression(max_iter=1000).fit(X_train_count, y_train_count)
model_tfidf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train_tfidf)
model_w2v = LogisticRegression(max_iter=1000).fit(X_train_w2v, y_train_w2v)

# Predict the held-out rows with each classifier.
y_pred_count = model_count.predict(X_test_count)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
y_pred_w2v = model_w2v.predict(X_test_w2v)

# Print accuracy followed by the per-class report, one section per representation.
for heading, y_true, y_hat in (
    ("CountVectorizer:", y_test_count, y_pred_count),
    ("TF-IDF:", y_test_tfidf, y_pred_tfidf),
    ("Word2Vec:", y_test_w2v, y_pred_w2v),
):
    print(heading)
    print("Accuracy:", accuracy_score(y_true, y_hat))
    print(classification_report(y_true, y_hat))

CountVectorizer:
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        96
           1       1.00      1.00      1.00       156

    accuracy                           1.00       252
   macro avg       1.00      1.00      1.00       252
weighted avg       1.00      1.00      1.00       252

TF-IDF:
Accuracy: 0.9920634920634921
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        96
           1       0.99      1.00      0.99       156

    accuracy                           0.99       252
   macro avg       0.99      0.99      0.99       252
weighted avg       0.99      0.99      0.99       252

Word2Vec:
Accuracy: 0.9722222222222222
              precision    recall  f1-score   support

           0       0.95      0.98      0.96        96
           1       0.99      0.97      0.98       156

    accuracy                           0.97       252
   macro avg       0.

In [48]:
# Classify one unseen call transcript with the CountVectorizer + Logistic Regression model.
new_dialogue = """
caller: Hi, this is Sarah from XYZ Internet. How can I assist you today?  
receiver: I’m having trouble with my Wi-Fi. It keeps disconnecting.  
caller: I see. Let me run a diagnostic on your line. Can you confirm your account number?  
receiver: Sure, it's 987654321.  
caller: Thank you. I see some signal issues. I will reset your connection remotely.  
receiver: Great, thanks!  
caller: No problem! If the issue persists, feel free to call us back. Have a great day!  
"""
new_dialogue = clean_text(new_dialogue)  # Apply the same preprocessing as the training data

# Convert text into a sparse matrix using the already-fitted CountVectorizer
X_new = count_vectorizer.transform([new_dialogue])

# Get the predicted label
y_new = model_count.predict(X_new)

# Get probability estimates (one row, one column per class)
prob_new = model_count.predict_proba(X_new)

# The probability columns are ordered like model_count.classes_, so take the
# argmax over the single row and translate the column back into a class value
# instead of assuming that column 1 is the scam class.
label_index = int(np.argmax(prob_new[0]))
predicted_class = model_count.classes_[label_index]
# Assumes the label value 1 marks a scam call, as the original mapping did - TODO confirm.
label = "Scam" if predicted_class == 1 else "Not Scam"

print("Predicted label:", label)
print("Predicted label probability:", prob_new[0][label_index])

Predicted label: Scam
Predicted label probability: 0.9240601526777773


In [49]:
# Use the Word2Vec Logistic Regression model to predict the label and the probability of the label for the following dialogue
dialogue = """
caller: Hi, this is Sarah from XYZ Internet. How can I assist you today?  
receiver: I’m having trouble with my Wi-Fi. It keeps disconnecting.  
caller: I see. Let me run a diagnostic on your line. Can you confirm your account number?  
receiver: Sure, it's 987654321.  
caller: Thank you. I see some signal issues. I will reset your connection remotely.  
receiver: Great, thanks!  
caller: No problem! If the issue persists, feel free to call us back. Have a great day!  
"""

cleaned_dialogue = clean_text(dialogue)
sentence = cleaned_dialogue.split()
# Keep only the tokens the Word2Vec vocabulary contains; other words have no embedding.
vectors = [w2v_model.wv[word] for word in sentence if word in w2v_model.wv]
if vectors:
    # Same document representation as in training: the mean of the word
    # vectors, reshaped to a single-row 2-D array for the classifier.
    X = np.mean(vectors, axis=0).reshape(1, -1)
    y_pred = model_w2v.predict(X)
    y_pred_proba = model_w2v.predict_proba(X)
    # The probability columns are ordered like model_w2v.classes_, so take the
    # argmax over the single row and translate the column back into a class
    # value instead of assuming that column 1 is the scam class.
    label_index = int(np.argmax(y_pred_proba[0]))
    predicted_class = model_w2v.classes_[label_index]
    # Assumes the label value 1 marks a scam call, as the original mapping did - TODO confirm.
    label = "Scam" if predicted_class == 1 else "Not Scam"
    print("Predicted label:", label)
    print("Predicted label probability:", y_pred_proba[0][label_index])
else:
    # Nothing to average: no token of the dialogue is in the vocabulary.
    print("No word vectors found for the dialogue")


Predicted label: Not Scam
Predicted label probability: 0.8790021644487198


In [52]:
# Use the TFIDF Logistic Regression model to predict the label and the probability of the label for the following dialogue
dialogue = """
caller: Hi, this is Sarah from XYZ Internet. How can I assist you today?  
receiver: I’m having trouble with my Wi-Fi. It keeps disconnecting.  
caller: I see. Let me run a diagnostic on your line. Can you confirm your account number?  
receiver: Sure, it's 987654321.  
caller: Thank you. I see some signal issues. I will reset your connection remotely.  
receiver: Great, thanks!  
caller: No problem! If the issue persists, feel free to call us back. Have a great day!
"""
cleaned_dialogue = clean_text(dialogue)
# Encode with the already-fitted TF-IDF vectorizer (transform only, no refit).
X = tfidf_vectorizer.transform([cleaned_dialogue])
y_pred = model_tfidf.predict(X)
y_pred_proba = model_tfidf.predict_proba(X)
# The probability columns are ordered like model_tfidf.classes_, so take the
# argmax over the single row and translate the column back into a class value
# instead of assuming that column 1 is the scam class.
label_index = int(np.argmax(y_pred_proba[0]))
predicted_class = model_tfidf.classes_[label_index]
# Assumes the label value 1 marks a scam call, as the original mapping did - TODO confirm.
label = "Scam" if predicted_class == 1 else "Not Scam"
print("Predicted label:", label)
print("Predicted label probability:", y_pred_proba[0][label_index])


Predicted label: Not Scam
Predicted label probability: 0.6004718112848557
