## CSV Merged

In [5]:
import pandas as pd

# Load the two CSV files
df1 = pd.read_csv("bootcamps.csv")
df2 = pd.read_csv("translate_programs.csv")

# Concatenate the two dataframes
merged_df = pd.concat([df1, df2], ignore_index=True)

# Save the result to a new CSV file
merged_df.to_csv("tuwaiq_data.csv", index=False)

In [9]:
merged_df.shape

(47, 13)

In [10]:
df1.shape

(15, 13)

In [11]:
df2.shape

(32, 13)

Combine and Clean Text

In [12]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load your data
df = pd.read_csv("tuwaiq_data.csv") 

# Combine the relevant fields
combined_fields = ['Title', 'Description', 'Category', 'Scope', 'Goals', 'Features', 'Requirements']
df['combined_text'] = df[combined_fields].astype(str).agg(' '.join, axis=1)

# Clean the text
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stopwords
    return text

df['cleaned_text'] = df['combined_text'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ghada./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## BERT Embeddings

Bootcamp & Programs

In [14]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert to string and clean nulls just to be safe
df['cleaned_text'] = df['cleaned_text'].astype(str)

# Encode all bootcamps at once
bootcamp_embeddings = model.encode(df['cleaned_text'].tolist(), show_progress_bar=True)

# Check shape
print("Bootcamp Embeddings shape:", np.array(bootcamp_embeddings).shape)

Batches: 100%|██████████| 2/2 [00:01<00:00,  1.03it/s]

Bootcamp Embeddings shape: (47, 384)





CV

In [15]:
cv_text = """
Name: Ameen Alrashid
Email: ameen.alrashid@email.com
Phone: +966-5XXXXXXX
Location: Dammam, Saudi Arabia
LinkedIn: linkedin.com/in/ameenalrashid
GitHub: github.com/ameenalrashid

Professional Summary:
Cybersecurity analyst with a strong background in securing enterprise systems, performing risk assessments, and implementing defense strategies. Experienced in monitoring network activity, mitigating threats, and enhancing information security compliance. Passionate about protecting digital infrastructure and staying current with emerging security trends.

Education:
Bachelor of Science in Computer Science – Imam Abdulrahman Bin Faisal University
Graduation: May 2024

Certifications:
- CompTIA Security+
- Certified Ethical Hacker (CEH)
- IBM Cybersecurity Analyst Professional Certificate

Technical Skills:
- Security Tools: Snort, Nmap, Nessus, Splunk
- Languages: Python, PowerShell, JavaScript
- Networking: TCP/IP, DNS, VPN, NAT, OSI Model
- Platforms: Windows Server, Kali Linux, Ubuntu
- Practices: Penetration Testing, SIEM, Incident Response, Threat Hunting

Projects:
Web App Penetration Testing Simulation
- Identified and exploited OWASP Top 10 vulnerabilities in a test application
- Documented flaws and provided remediation strategies
- Demonstrated XSS, SQLi, and CSRF attacks in a live demo

Security Information and Event Management (SIEM) Dashboard
- Built custom SIEM dashboard using Splunk
- Created real-time alerts for unusual login patterns and port scans
- Reduced detection time for incidents by 40%

Experience:
Cybersecurity Intern – Aramco Cyber Defense Center
Jul 2023 – Sep 2023
- Conducted internal vulnerability scans and documented findings
- Analyzed phishing attempts and contributed to monthly threat reports
- Assisted in deploying endpoint detection and response (EDR) solutions

Languages:
- Arabic: Native
- English: Professional Proficiency

Interests:
- Capture the Flag (CTF) challenges
- Malware analysis
- Threat intelligence platforms

References:
Available upon request
"""

# Result shape: (1, 384)
cv_embedding = model.encode([cv_text])  

print("CV Embedding shape:", np.array(cv_embedding).shape)

CV Embedding shape: (1, 384)


## BERT + Cosine Similarity Matching Engine

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [17]:
# cv_embedding: shape (384,) → reshape to (1, 384)
similarities = cosine_similarity(cv_embedding.reshape(1, -1), bootcamp_embeddings)

# Get scores for each bootcamp
similarities = similarities.flatten()

# Rank bootcamps by similarity (highest first)
top_indices = np.argsort(similarities)[::-1]

In [18]:
top_n = 5
for i in range(top_n):
    index = top_indices[i]
    score = similarities[index]
    title = df.iloc[index]['Title']
    print(f"{i+1}. {title} (Score: {score:.3f})")

1. Security+ Information Security (Score: 0.631)
2. Cybersecurity (Score: 0.628)
3. Cybersecurity Analyst CySA+ (Score: 0.605)
4. Cybersecurity Fundamentals (Score: 0.603)
5. Cybersecurity Defense L2 - SOC (Score: 0.586)
