In [1]:
# Import libraries
import pandas as pd
import spacy
from google.colab import files

# Step 1: Load spaCy model
print("Loading spaCy model...")
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Step 2: Upload datasets
print("Please upload the required dataset: 'Sample_Calls_Dataset_for_Content_Analysis.csv'")
uploaded = files.upload()

# Step 3: Load dataset
sample_calls_df = pd.read_csv('Sample_Calls_Dataset_for_Content_Analysis.csv')

# Step 4: Define classification function
def classify_content(transcript):
    """
    Classify the content of a call transcript into urgency, threats, or sensitive information.
    """
    doc = nlp(transcript.lower())

    # Define keyword sets
    urgency_keywords = {'urgent', 'immediate', 'asap', 'important', 'action required', 'now'}
    threat_keywords = {'final notice', 'penalty', 'collection', 'legal action', 'threat'}
    sensitive_info_keywords = {'bank', 'account', 'password', 'social security', 'confidential', 'share'}

    # Initialize categories
    categories = []

    # Classify urgency
    if any(token.text in urgency_keywords for token in doc):
        categories.append('urgency')

    # Classify threats
    if any(token.text in threat_keywords for token in doc):
        categories.append('threats')

    # Classify sensitive information requests
    if any(token.text in sensitive_info_keywords for token in doc):
        categories.append('sensitive information')

    # Join categories into a single string
    return ', '.join(categories) if categories else 'none'

# Step 5: Apply classification to the dataset
sample_calls_df['Content'] = sample_calls_df['Call_Transcript'].apply(classify_content)

# Step 6: Save and download results
output_file = 'Classified_Content_Results.csv'
sample_calls_df.to_csv(output_file, index=False)

# Display and download the results
print("Processed DataFrame Preview:")
print(sample_calls_df.head())

print(f"Results saved to '{output_file}'. Downloading the file...")
files.download(output_file)


Loading spaCy model...
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Please upload the required dataset: 'Sample_Calls_Dataset_for_Content_Analysis.csv'


Saving Sample_Calls_Dataset_for_Content_Analysis.csv to Sample_Calls_Dataset_for_Content_Analysis.csv
Saving Vishing_Keywords.csv to Vishing_Keywords.csv
Processed DataFrame Preview:
   Call_ID                                    Call_Transcript Expected_Result  \
0        1  Immediate action required to avoid penalties. ...         Vishing   
1        2  You have won a lottery prize! Share your bank ...         Vishing   
2        3  Your account has been flagged for suspicious a...         Vishing   
3        4  This is a final notice from collections. Share...     Non-Vishing   
4        5  This is a final notice from collections. Share...     Non-Vishing   

                 Content  
0                urgency  
1  sensitive information  
2  sensitive information  
3  sensitive information  
4  sensitive information  
Results saved to 'Classified_Content_Results.csv'. Downloading the file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
# Import libraries
import pandas as pd
import spacy
from google.colab import files

# Step 1: Load spaCy model
print("Loading spaCy model...")
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Step 2: Upload datasets
print("Please upload the required datasets: 'Sample_Calls_Dataset_for_Content_Analysis.csv' and 'Vishing_Keywords.csv'")
uploaded = files.upload()

# Step 3: Load datasets
sample_calls_df = pd.read_csv('Sample_Calls_Dataset_for_Content_Analysis.csv')
vishing_keywords_df = pd.read_csv('Vishing_Keywords.csv')

# Step 4: Preprocess keywords
vishing_keywords_df['Keyword'] = (
    vishing_keywords_df['Keyword']
    .str.lower()
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
    .str.strip()
)
vishing_keywords_set = set(vishing_keywords_df['Keyword'])

# Step 5: Define content analysis function with percentage-based scoring
def analyze_content_percentage(transcript, vishing_keywords):
    """
    Analyze call transcript and return a percentage score indicating vishing likelihood.
    """
    # Preprocessing transcript
    tokens = transcript.lower().split()
    total_tokens = len(tokens) if len(tokens) > 0 else 1  # Avoid division by zero

    # Keyword Matching
    keyword_matches = sum(1 for token in tokens if token in vishing_keywords)

    # NLP Analysis
    doc = nlp(transcript)
    urgency_matches = sum(1 for token in doc if token.text.lower() in {'urgent', 'important', 'immediate'})
    threat_matches = sum(1 for sent in doc.sents if 'threat' in sent.text.lower() or 'sensitive' in sent.text.lower())

    # Calculate total score as the sum of matched features
    total_score = keyword_matches + urgency_matches + threat_matches

    # Normalize score as a percentage of total tokens
    content_percentage = (total_score / total_tokens) * 100
    return round(content_percentage, 2)

# Step 6: Calculate Content Score as a percentage
sample_calls_df['Content_Score_Percentage'] = sample_calls_df['Call_Transcript'].apply(
    lambda transcript: analyze_content_percentage(transcript, vishing_keywords_set)
)

# Define threshold for classification based on percentage
percentage_threshold = 8.0  # Example: If score >= 8%, classify as Vishing

def classify_call_percentage(content_score_percentage):
    return "Vishing" if content_score_percentage >= percentage_threshold else "Non-Vishing"

sample_calls_df['Predicted_Result'] = sample_calls_df['Content_Score_Percentage'].apply(classify_call_percentage)

# Step 7: Save and download results with both columns included
output_file = 'Content_Analysis_Percentage_Results.csv'

# Keep the necessary columns
columns_to_include = ['Call_Transcript', 'Content_Score_Percentage', 'Predicted_Result']
cleaned_sample_calls_df = sample_calls_df[columns_to_include]

# Save the cleaned DataFrame
cleaned_sample_calls_df.to_csv(output_file, index=False)

# Display and download the results
print("Processed DataFrame Preview (with Content_Score_Percentage and Predicted_Result):")
print(cleaned_sample_calls_df.head())

print(f"Results saved to '{output_file}'. Downloading the file...")
files.download(output_file)


Loading spaCy model...
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Please upload the required datasets: 'Sample_Calls_Dataset_for_Content_Analysis.csv' and 'Vishing_Keywords.csv'


Saving Vishing_Keywords.csv to Vishing_Keywords (1).csv
Saving Sample_Calls_Dataset_for_Content_Analysis - Copy.csv to Sample_Calls_Dataset_for_Content_Analysis - Copy.csv
Processed DataFrame Preview (with Content_Score_Percentage and Predicted_Result):
                                     Call_Transcript  \
0  Immediate action required to avoid penalties. ...   
1  You have won a lottery prize! Share your bank ...   
2  Your account has been flagged for suspicious a...   
3  This is a final notice from collections. Share...   
4  This is a final notice from collections. Share...   

   Content_Score_Percentage Predicted_Result  
0                      9.09          Vishing  
1                      8.33          Vishing  
2                      7.14      Non-Vishing  
3                      6.67      Non-Vishing  
4                      6.67      Non-Vishing  
Results saved to 'Content_Analysis_Percentage_Results.csv'. Downloading the file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>