<a href="https://colab.research.google.com/github/jbmalloy03/IT-745_Machine_Learning_course/blob/main/Copy_of_Copy_of_Third_party_NLP_Project_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# AI Third Party Risk Management project questionnaire for IT 745
# Install SteramLit
!pip install streamlit
!pip install python-docx PyPDF2

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m253.0/253.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m232.6/232.6 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx, PyPDF2
Successfully installed PyPDF2-3.0.1 python-docx-1.2.0


In [7]:
# app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import docx
import PyPDF2

# -----------------------------
# App Configuration
# -----------------------------
# Page-level settings: page title plus the "wide" layout option.
st.set_page_config(
    page_title="AI-Driven Third-Party Risk Dashboard",
    layout="wide"
)

# Dashboard heading, followed by a caption naming the three parts of the app.
st.title("AI-Driven Third-Party Risk Management Dashboard")
st.caption("Questionnaire + NLP-Based Policy Review + Risk Scoring")

# -----------------------------
# Helper Functions
# -----------------------------
def extract_text_from_file(uploaded_file):
    """Extract lower-cased plain text from an uploaded PDF or DOCX file.

    Args:
        uploaded_file: File-like object exposing a ``type`` attribute that
            holds its MIME type (e.g. the object returned by
            ``st.file_uploader``).

    Returns:
        str: The extracted text, lower-cased and stripped of leading/trailing
        whitespace. An empty string is returned when the MIME type is neither
        PDF nor DOCX, or when no text could be extracted.
    """
    mime_type = uploaded_file.type

    if mime_type == "application/pdf":
        reader = PyPDF2.PdfReader(uploaded_file)
        # ``or ""`` guards against a falsy result from extract_text().
        fragments = [page.extract_text() or "" for page in reader.pages]
    elif mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        document = docx.Document(uploaded_file)
        fragments = [paragraph.text for paragraph in document.paragraphs]
    else:
        fragments = []

    # Joining with a space keeps the last word of one page/paragraph from
    # fusing with the first word of the next. strip() ensures a document made
    # only of empty fragments still yields "" (so callers' emptiness checks work).
    return " ".join(fragments).strip().lower()


def compute_similarity(policy_text, control_corpus):
    """Score how closely a policy document matches the control baseline.

    The policy text and every control statement are vectorised together with
    TF-IDF (English stop words removed). The policy vector is then compared
    with each control vector using cosine similarity.

    Args:
        policy_text: Text of the vendor policy document.
        control_corpus: List of control-description strings.

    Returns:
        float: Mean cosine similarity between the policy and the controls.
    """
    documents = [policy_text] + control_corpus
    tfidf_matrix = TfidfVectorizer(stop_words="english").fit_transform(documents)

    # Row 0 is the policy; the remaining rows are the controls, in input order.
    policy_row = tfidf_matrix[0:1]
    control_rows = tfidf_matrix[1:]
    pairwise = cosine_similarity(policy_row, control_rows)

    return float(np.mean(pairwise))


def generate_llm_style_analysis(score):
    """Return a canned narrative for a policy-similarity score.

    Args:
        score: Similarity score to describe.

    Returns:
        str: The "strong" narrative when ``score >= 0.65``, the "partial"
        narrative when ``score >= 0.45``, otherwise the "limited" narrative.
    """
    strong_alignment = (
        "The documentation demonstrates strong alignment with expected security "
        "control language. Policies appear comprehensive and well-articulated, "
        "with evidence of governance and procedural maturity."
    )
    partial_alignment = (
        "The documentation shows partial alignment with expected security controls. "
        "Several areas may require clarification or expansion to improve completeness."
    )
    limited_alignment = (
        "The documentation shows limited alignment with expected security controls. "
        "Key governance, technical, or procedural elements appear missing or insufficient."
    )

    # Bands are checked from the highest floor downward; the first one met wins.
    for floor, narrative in ((0.65, strong_alignment), (0.45, partial_alignment)):
        if score >= floor:
            return narrative
    return limited_alignment


def calculate_risk_score(questionnaire_scores, similarity_score, scale_min=1, scale_max=5):
    """Blend questionnaire answers and NLP similarity into a 0-100 score.

    The questionnaire mean is first rescaled from ``[scale_min, scale_max]``
    to 0-100 so that it is on the same scale as the similarity percentage.
    Without that step a 1-5 mean contributes at most 3 points and the 50/75
    thresholds below are effectively driven by similarity alone.

    Args:
        questionnaire_scores: Non-empty sequence of numeric answers, each
            expected to lie between ``scale_min`` and ``scale_max``.
        similarity_score: Similarity value; it is multiplied by 100 before
            weighting.
        scale_min: Lowest possible questionnaire answer (default 1).
        scale_max: Highest possible questionnaire answer (default 5).

    Returns:
        tuple[float, str]: ``(combined_score, risk)`` where ``risk`` is
        ``"Low Risk"`` (score >= 75), ``"Moderate Risk"`` (score >= 50) or
        ``"High Risk"``.

    Raises:
        ValueError: If ``questionnaire_scores`` is empty or the scale bounds
            are not strictly increasing.
    """
    if len(questionnaire_scores) == 0:
        # np.mean([]) would yield NaN, which silently falls through to "High Risk".
        raise ValueError("questionnaire_scores must contain at least one score")
    if scale_max <= scale_min:
        raise ValueError("scale_max must be greater than scale_min")

    questionnaire_avg = np.mean(questionnaire_scores)
    questionnaire_pct = (questionnaire_avg - scale_min) / (scale_max - scale_min) * 100

    # 60% questionnaire, 40% document similarity.
    combined_score = float((0.6 * questionnaire_pct) + (0.4 * similarity_score * 100))

    if combined_score >= 75:
        risk = "Low Risk"
    elif combined_score >= 50:
        risk = "Moderate Risk"
    else:
        risk = "High Risk"

    return combined_score, risk


# -----------------------------
# Questionnaire Section
# -----------------------------
st.header("1. Third-Party Risk Questionnaire")

# One slider per domain (range 1-5, starting at 3), created in the order listed.
# The dict maps each domain name to the value its slider currently returns.
questions = {
    domain: st.slider(label, 1, 5, 3)
    for domain, label in (
        ("Information Security Governance", "Security governance maturity"),
        ("Access Control Management", "Access control effectiveness"),
        ("Incident Response Readiness", "Incident response preparedness"),
        ("Business Continuity Planning", "Business continuity planning"),
        ("Data Protection & Privacy", "Data protection and privacy controls"),
    )
}

# Slider values only, in the same order as the domains above.
questionnaire_scores = [questions[domain] for domain in questions]

# -----------------------------
# Policy Upload Section
# -----------------------------
st.header("2. Upload Vendor Policy Documentation")

uploaded_file = st.file_uploader(
    "Upload policy document (PDF or DOCX)",
    type=["pdf", "docx"]
)

# policy_text stays empty unless a document was uploaded AND text was extracted;
# the assessment step treats an empty value as "no document".
policy_text = ""
if uploaded_file:
    try:
        policy_text = extract_text_from_file(uploaded_file).strip()
    except Exception as exc:
        # The PDF/DOCX parsers raise library-specific errors on damaged files;
        # report the problem in the UI instead of crashing the whole app.
        st.error(f"The policy document could not be read: {exc}")
    else:
        if policy_text:
            st.success("Policy document successfully processed.")
        else:
            # e.g. a document that contains no extractable text at all.
            st.warning("No readable text was found in the uploaded document.")

# -----------------------------
# Control Baseline Corpus
# -----------------------------
# Baseline control vocabulary: one keyword string per control area.
# compute_similarity() compares the uploaded policy text against each entry.
control_corpus = [
    "access control policy user authentication authorization least privilege",
    "incident response plan detection containment eradication recovery",
    "data classification encryption retention privacy",
    "business continuity disaster recovery testing",
    "risk assessment vendor oversight governance"
]

# -----------------------------
# NLP + Risk Evaluation
# -----------------------------
st.header("3. Risk Analysis Results")

if st.button("Run Risk Assessment"):

    if not policy_text:
        st.warning("Please upload a policy document before running the assessment.")
    else:
        # Score the document against the control baseline, then blend that
        # with the questionnaire answers.
        similarity_score = compute_similarity(policy_text, control_corpus)
        combined_score, risk_level = calculate_risk_score(
            questionnaire_scores,
            similarity_score
        )
        explanation = generate_llm_style_analysis(similarity_score)

        # Computed once and reused in the export table below.
        questionnaire_avg = float(np.mean(questionnaire_scores))

        # Headline metrics, side by side.
        col1, col2, col3 = st.columns(3)
        col1.metric("NLP Similarity Score", f"{similarity_score:.2f}")
        col2.metric("Composite Risk Score", f"{combined_score:.1f}")
        col3.metric("Risk Classification", risk_level)

        st.subheader("AI-Assisted Policy Review Summary")
        st.write(explanation)

        # Results table: one row per questionnaire domain.
        # The column label uses a real en dash (the previous text was a
        # mis-decoded byte sequence).
        df = pd.DataFrame({
            "Domain": list(questions.keys()),
            "Score (1–5)": questionnaire_scores
        })

        st.subheader("Questionnaire Scores")
        st.dataframe(df, use_container_width=True)

        # Export option: flat metric/value table offered as a CSV download.
        results_df = pd.DataFrame({
            "Metric": [
                "Average Questionnaire Score",
                "NLP Similarity Score",
                "Composite Risk Score",
                "Risk Classification"
            ],
            "Value": [
                questionnaire_avg,
                similarity_score,
                combined_score,
                risk_level
            ]
        })

        csv = results_df.to_csv(index=False).encode("utf-8")
        st.download_button(
            label="Download Assessment Results (CSV)",
            data=csv,
            file_name="third_party_risk_results.csv",
            mime="text/csv"
        )

# -----------------------------
# Footer
# -----------------------------
# Horizontal rule followed by a disclaimer caption at the bottom of the page.
st.markdown("---")
st.caption(
    "This dashboard supports academic research and decision support only. "
    "NLP outputs are explainable approximations and do not replace expert judgment."
)

2026-01-04 16:59:37.505 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [8]:
import json
from datetime import datetime, timezone

import pandas as pd
import streamlit as st

# Risk Questionnaire Configuration
# with open("risk_questionnaire_config.json", "r") as config_file:
#     config = json.load(config_file)
# Question bank for the vendor questionnaire.
# Each entry maps a question id to its prompt text ("text") and the weight
# ("weight") it carries in calculate_risk_score(); the weights sum to 90.
questions = {
    "q1": {"text": "Does the vendor have a formal cybersecurity policy?", "weight": 10},
    "q2": {"text": "Is data encrypted in transit and at rest?", "weight": 10},
    "q3": {"text": "Does the vendor perform regular vulnerability assessments?", "weight": 10},
    "q4": {"text": "Is multi-factor authentication (MFA) implemented?", "weight": 8},
    "q5": {"text": "Does the vendor comply with relevant regulations (e.g., GDPR, HIPAA)?", "weight": 10},
    "q6": {"text": "Does the vendor store or process sensitive data?", "weight": 7},
    "q7": {"text": "Does the vendor subcontract any critical services?", "weight": 8},
    "q8": {"text": "Is the vendor’s incident response plan tested annually?", "weight": 10},
    "q9": {"text": "Has the vendor experienced any recent data breaches?", "weight": 12},
    "q10": {"text": "Does the vendor provide employee cybersecurity training?", "weight": 5},
}

# Scoring and Risk Classification
def calculate_risk_score(responses, criticality, question_bank=None,
                         risk_indicator_keys=("q6", "q7", "q9")):
    """Turn questionnaire answers into a 0-100 score (higher = lower risk).

    Each question contributes its weight when answered favourably and half its
    weight for "Partial". For most questions "Yes" is the favourable answer.
    For the questions listed in ``risk_indicator_keys`` (sensitive-data
    handling, subcontracting, recent breaches) "Yes" signals exposure, so
    "No" is the favourable answer there. Previously a "Yes" on those
    questions raised the score, contradicting generate_recommendations(),
    which treats the same answers as findings.

    Args:
        responses: Mapping of question id -> "Yes", "No" or "Partial". Any
            other answer earns no credit.
        criticality: "High" scales the score by 0.9, "Low" by 1.1; any other
            value leaves it unchanged.
        question_bank: Mapping of question id -> dict with a "weight" entry.
            Defaults to the module-level ``questions``.
        risk_indicator_keys: Question ids for which "Yes" is the unfavourable
            answer.

    Returns:
        The score clamped to the range 0-100.

    Raises:
        KeyError: If ``responses`` contains an id missing from the bank.
        ValueError: If the bank's weights do not sum to a positive number.
    """
    if question_bank is None:
        question_bank = questions

    total_weight = sum(q["weight"] for q in question_bank.values())
    if total_weight <= 0:
        raise ValueError("question weights must sum to a positive number")

    # Fraction of a question's weight earned by each answer.
    credit_when_yes_is_good = {"Yes": 1.0, "Partial": 0.5}
    credit_when_yes_is_bad = {"No": 1.0, "Partial": 0.5}

    weighted_score = 0.0
    for key, response in responses.items():
        weight = question_bank[key]["weight"]
        if key in risk_indicator_keys:
            credit = credit_when_yes_is_bad.get(response, 0.0)
        else:
            credit = credit_when_yes_is_good.get(response, 0.0)
        weighted_score += weight * credit

    score = (weighted_score / total_weight) * 100

    # Adjust for vendor criticality: critical vendors are held to a stricter bar.
    if criticality == "High":
        score *= 0.9
    elif criticality == "Low":
        score *= 1.1

    return min(100, max(0, score))


def classify_risk(score):
    """Map a numeric score to a risk label.

    Args:
        score: Score to classify.

    Returns:
        str: "High Risk" when ``score < 50``, "Medium Risk" when
        ``score < 75``, otherwise "Low Risk".
    """
    # Ceilings are checked in ascending order; the first one the score is under wins.
    for ceiling, label in ((50, "High Risk"), (75, "Medium Risk")):
        if score < ceiling:
            return label
    return "Low Risk"

# AI-Based Recommendation Generator
def generate_recommendations(responses, risk_class):
    """Build the list of follow-up recommendations for an assessment.

    The list starts with fixed recommendations chosen by ``risk_class`` and is
    then extended with answer-specific items.

    Args:
        responses: Mapping of question id -> answer. Only "q7", "q9" and
            "q10" are inspected; missing ids are ignored.
        risk_class: "High Risk" or "Medium Risk"; any other value receives
            the low-risk recommendations.

    Returns:
        list[str]: Recommendation sentences, in display order.
    """
    if risk_class == "High Risk":
        recs = [
            "Implement an immediate review of vendor cybersecurity practices and request evidence of remediation.",
            "Conduct an on-site or virtual audit focusing on access controls, encryption, and incident response.",
            "Require the vendor to establish a Service Level Agreement (SLA) for security monitoring and breach notification.",
        ]
    elif risk_class == "Medium Risk":
        recs = [
            "Request updated compliance certifications (e.g., SOC 2, ISO 27001) and review security documentation annually.",
            "Encourage the vendor to enhance MFA coverage and improve incident response testing frequency.",
        ]
    else:
        recs = [
            "Continue ongoing monitoring through annual assessments and periodic vulnerability scans.",
            "Maintain open communication with the vendor’s security team to ensure continuous compliance.",
        ]

    # Answer-specific additions. The punctuation in these user-facing strings
    # is a real apostrophe / em dash (the previous text was mis-decoded bytes).
    if responses.get("q9") == "Yes":
        recs.append("Vendor has experienced a data breach — ensure post-incident corrective actions and data protection measures are verified.")
    if responses.get("q7") == "Yes":
        recs.append("Vendor uses subcontractors — verify that downstream providers adhere to equivalent security controls.")
    if responses.get("q10") == "No":
        recs.append("Lack of employee training identified — recommend mandatory annual cybersecurity awareness training.")

    return recs

In [9]:
# Streamlit UI
# ==============================================
# Page-level settings: page title plus the "centered" layout option.
st.set_page_config(page_title="AI-Driven Third-Party Risk Assessment", layout="centered")

# The heading starts with the brain emoji (the previous text was mis-decoded bytes).
st.title("🧠 AI-Driven Third-Party Risk Assessment Dashboard")
st.write("Assess vendor cybersecurity readiness and receive AI-generated recommendations for improvement.")

# Vendor Information
# Vendor details. `criticality` is passed to calculate_risk_score() when the
# assessment runs; `vendor` and `industry` are copied into the exported reports.
st.header("Vendor Information")
vendor = st.text_input("Vendor Name", "SampleVendorCo")
industry = st.text_input("Vendor Industry", "Finance")
criticality = st.selectbox("Vendor Criticality", ["High", "Medium", "Low"])

# Questionnaire
st.header("Risk Assessment Questionnaire")

# One Yes/No/Partial radio group per entry in `questions`, created in dict
# order; the selected answer is stored under the question's id.
responses = {
    question_id: st.radio(spec["text"], ["Yes", "No", "Partial"], horizontal=True)
    for question_id, spec in questions.items()
}

# Run Assessment
if st.button("Run AI-Driven Assessment"):
    score = calculate_risk_score(responses, criticality)
    risk_class = classify_risk(score)
    recommendations = generate_recommendations(responses, risk_class)
    rounded_score = round(score, 1)

    # datetime.utcnow() is deprecated as of Python 3.12. Use an aware UTC
    # timestamp instead and keep the same trailing-"Z" text format.
    timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # Full report; key order is the order written to the JSON download.
    result = {
        "vendor": vendor,
        "industry": industry,
        "criticality": criticality,
        "risk_score": rounded_score,
        "risk_classification": risk_class,
        "recommendations": recommendations,
        "responses": responses,
        "timestamp": timestamp
    }

    # Results Display
    st.subheader("Assessment Results")
    st.metric("Risk Score", f"{rounded_score} / 100")
    st.metric("Risk Classification", risk_class)
    # calculate_risk_score() clamps the score to 0-100 before it reaches here.
    st.progress(int(score))

    st.write("###  AI-Generated Recommendations")
    for r in recommendations:
        st.markdown(f"- {r}")

    # The vendor name is free text; keep only letters, digits, "-" and "_" so
    # it cannot inject path separators or other odd characters into file names.
    safe_vendor = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in vendor.strip()
    ) or "vendor"

    # Downloads
    json_data = json.dumps(result, indent=4)
    st.download_button(
        label="Download JSON Report",
        data=json_data,
        file_name=f"AI_TPRM_Report_{safe_vendor}.json",
        mime="application/json"
    )

    # The CSV summary is a one-row table holding the scalar fields of the report.
    summary_fields = (
        "vendor",
        "industry",
        "criticality",
        "risk_score",
        "risk_classification",
        "timestamp",
    )
    df = pd.DataFrame([{field: result[field] for field in summary_fields}])
    csv_data = df.to_csv(index=False)
    st.download_button(
        label="Download CSV Summary",
        data=csv_data,
        file_name=f"AI_TPRM_Report_{safe_vendor}.csv",
        mime="text/csv"
    )

# Horizontal rule followed by the attribution caption. The caption uses a real
# copyright sign (the previous text was mis-decoded bytes).
st.markdown("---")
st.caption("Developed as part of AI-Driven Cybersecurity Research on Third-Party Risk Management © 2025")

2026-01-04 17:00:22.406 Session state does not function when running a script without `streamlit run`


DeltaGenerator()

In [None]:
# Install the localtunnel npm package; its CLI is invoked through npx in a later cell.
!npm install localtunnel

[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K
added 22 packages in 3s
[1G[0K‚†∏[1G[0K
[1G[0K‚†∏[1G[0K3 packages are looking for funding
[1G[0K‚†∏[1G[0K  run `npm fund` for details
[1G[0K‚†∏[1G[0K[1mnpm[22m [96mnotice[39m
[1mnpm[22m [96mnotice[39m New [31mmajor[39m version of npm available! [31m10.8.2[39m -> [34m11.6.4[39m
[1mnpm[22m [96mnotice[39m Changelog: [34mhttps://github.com/npm/cli/releases/tag/v11.6.4[39m
[1mnpm[22m [96mnotice[39m To update run: [4mnpm install -g npm@11.6.4[24m
[1mnpm[22m [96mnotice[39m
[1G[0K‚†∏[1G[0K

In [None]:
# Request loca.lt's /mytunnelpassword endpoint and print the response.
# NOTE(review): presumably this is the password the localtunnel landing page asks for — confirm.
!curl https://loca.lt/mytunnelpassword

<!DOCTYPE html>
<html lang="en">
<body>
    <h1>503 Service Unavailable</h1>
    No server is available to handle this request.
</body>
</html>

In [10]:
!python -m streamlit run ai_tprm_dashboard_ai.py & npx localtunnel --port 8501

Usage: streamlit run [OPTIONS] [TARGET] [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: ai_tprm_dashboard_ai.py
[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K[1G[0JNeed to install the following packages:
localtunnel@2.0.2
Ok to proceed? (y) [20G^C
