In [None]:
import os
os.kill(os.getpid(), 9)


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pip install fuzzywuzzy




In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz, process
import pandas as pd

# Download all necessary NLTK data
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
    nltk.data.find('corpora/wordnet')
except LookupError:
    print("Downloading required NLTK data...")
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    nltk.download('punkt_tab')
    print("NLTK data download completed.")

# Initialize NLTK components
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize tokenizer and model
print("Initializing BERT model and tokenizer...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Load the model weights
try:
    model_path = "/content/drive/MyDrive/techNL_HSTIP/internship_selection_model.pt"
    model.load_state_dict(torch.load(model_path, map_location=device))
    print(f"Model loaded successfully from {model_path}")
except Exception as e:
    print(f"Error loading model: {str(e)}")
    raise

model.to(device)
model.eval()

def analyze_regional_priority(town):
    """Uses fuzzy matching to determine regional priority based on Avalon vs non-Avalon regions"""
    if pd.isna(town):
        return 0.0, "No location provided"

    town = str(town).lower()

    # Define Avalon region locations (standard priority)
    avalon_locations = {
        'st. john\'s', 'mount pearl', 'paradise', 'conception bay south',
        'torbay', 'portugal cove-st. philip\'s', 'flatrock', 'pouch cove',
        'bauline', 'witless bay', 'bay bulls', 'holyrood', 'seal cove',
        'fox trap', 'long pond', 'kelligrews', 'upper gullies', 'foxtrap',
        'topsail', 'chamberlains', 'logy bay', 'middle cove', 'outer cove',
        'petty harbour', 'maddox cove', 'goulds', 'kilbride', 'bell island',
        'wabana', 'placentia', 'st. bride\'s', 'dunville', 'freshwater'
    }

    best_match = process.extractOne(
        town,
        avalon_locations,
        scorer=fuzz.token_set_ratio,
        score_cutoff=80
    )

    if best_match:
        return 0.5, f"Standard priority - Avalon Peninsula region (matched with {best_match[0]})"
    else:
        return 1.0, f"High priority - Non-Avalon region ({town})"

def clean_and_lemmatize(text):
    """Clean and lemmatize text"""
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

def calculate_diversity_score(student_data):
    """Calculate diversity score based on student data"""
    score = 0.0
    reasons = []

    # Gender diversity
    if student_data.get('identify_as', '').lower() in ['female', 'non-binary', 'prefer not to say']:
        score += 0.5
        reasons.append(f"Gender diversity factor: +0.5 ({student_data.get('identify_as')})")

    # Location diversity using regional priority
    location_score, location_reason = analyze_regional_priority(student_data.get('town', ''))
    score += location_score
    reasons.append(f"Location diversity factor: +{location_score} ({location_reason})")

    final_score = min(score, 1.0)
    return final_score, reasons

def get_detailed_reasoning(student_data, predicted_label, confidence, diversity_reasons):
    """Generate detailed reasoning based on the model's decision and student data"""

    def extract_significant_terms(text, min_word_length=4):
        """Extract significant terms from text, excluding common words"""
        if not isinstance(text, str):
            return []

        # Tokenize and clean the text
        text = text.lower()
        words = word_tokenize(text)

        # Filter out common words, short words, and non-alphabetic words
        significant_terms = [
            word for word in words
            if word not in stop_words
            and len(word) >= min_word_length
            and word.isalpha()
        ]

        return list(set(significant_terms))  # Remove duplicates

    def analyze_experience(tech_exp, non_tech_exp):
        """Analyze technical and non-technical experience based on actual content"""
        tech_terms = extract_significant_terms(tech_exp)
        non_tech_terms = extract_significant_terms(non_tech_exp)

        experience_analysis = {
            'technical_terms': tech_terms,
            'non_technical_terms': non_tech_terms,
            'tech_depth': len(tech_terms),
            'non_tech_depth': len(non_tech_terms)
        }

        return experience_analysis

    def analyze_motivation(why_internship, goals):
        """Analyze motivation and goals based on actual content"""
        motivation_terms = extract_significant_terms(why_internship)
        goal_terms = extract_significant_terms(goals)

        motivation_analysis = {
            'motivation_terms': motivation_terms,
            'goal_terms': goal_terms,
            'motivation_depth': len(motivation_terms),
            'goal_clarity': len(goal_terms)
        }

        return motivation_analysis

    # Perform detailed analysis
    experience_analysis = analyze_experience(
        student_data['tech_experience'],
        student_data.get('non_tech_experience', '')
    )

    motivation_analysis = analyze_motivation(
        student_data['why_internship'],
        student_data['goals']
    )

    # Generate reasoning
    reasoning = f"\nSelection Decision (Model is {'very confident' if confidence >= 0.80 else 'confident' if confidence >= 0.70 else 'moderately confident'} - {confidence*100:.2f}%)\n"
    reasoning += "="*80 + "\n"

    if predicted_label == 1:  # Selected
        reasoning += "SELECTED for internship\n\n"

        # Technical Analysis
        reasoning += "Technical Background Analysis:\n"
        reasoning += f"Key technical terms mentioned: {', '.join(experience_analysis['technical_terms'])}\n"
        reasoning += f"Technical depth indicators: {experience_analysis['tech_depth']} unique significant terms\n"

        # Motivation Analysis
        reasoning += "\nMotivation and Goals Analysis:\n"
        reasoning += f"Motivation indicators: {', '.join(motivation_analysis['motivation_terms'])}\n"
        reasoning += f"Career goal indicators: {', '.join(motivation_analysis['goal_terms'])}\n"

        # Non-technical Skills
        if experience_analysis['non_technical_terms']:
            reasoning += "\nTransferable Skills:\n"
            reasoning += f"Non-technical experience areas: {', '.join(experience_analysis['non_technical_terms'])}\n"

    else:  # Not Selected
        reasoning += "NOT SELECTED for internship\n\n"

        # Technical Analysis
        reasoning += "Technical Background Analysis:\n"
        if experience_analysis['tech_depth'] > 0:
            reasoning += f"Technical terms mentioned: {', '.join(experience_analysis['technical_terms'])}\n"
            reasoning += "While showing some technical knowledge, may need more depth or practical application\n"
        else:
            reasoning += "Limited technical terminology or specific technical experience mentioned\n"

        # Motivation Analysis
        reasoning += "\nMotivation and Goals Analysis:\n"
        if motivation_analysis['motivation_depth'] > 0:
            reasoning += f"Expressed interests: {', '.join(motivation_analysis['motivation_terms'])}\n"
            reasoning += f"Stated goals: {', '.join(motivation_analysis['goal_terms'])}\n"
        else:
            reasoning += "Could benefit from clearer expression of technical interests and goals\n"

    # Add diversity factors
    reasoning += "\nDiversity Considerations:\n"
    for reason in diversity_reasons:
        reasoning += f"- {reason}\n"

    # Original Text Analysis
    reasoning += "\nDetailed Application Review:\n"
    reasoning += "Technical Experience:\n"
    reasoning += f"\"{student_data['tech_experience']}\"\n"
    reasoning += f"Unique terms identified: {len(experience_analysis['technical_terms'])}\n"

    reasoning += "\nMotivation Statement:\n"
    reasoning += f"\"{student_data['why_internship']}\"\n"
    reasoning += f"Unique terms identified: {len(motivation_analysis['motivation_terms'])}\n"

    reasoning += "\nCareer Goals:\n"
    reasoning += f"\"{student_data['goals']}\"\n"
    reasoning += f"Unique terms identified: {len(motivation_analysis['goal_terms'])}\n"

    # Confidence Analysis
    reasoning += "\nDecision Confidence Analysis:\n"
    if confidence >= 0.80:
        reasoning += f"High confidence ({confidence*100:.2f}%) based on clear evidence in application\n"
        reasoning += f"Strong technical vocabulary ({experience_analysis['tech_depth']} terms) and "
        reasoning += f"clear motivation ({motivation_analysis['motivation_depth']} indicators)\n"
    elif confidence >= 0.70:
        reasoning += f"Moderate confidence ({confidence*100:.2f}%) with mixed indicators\n"
        reasoning += "Balance of strengths and areas for development identified\n"
    else:
        reasoning += f"Lower confidence ({confidence*100:.2f}%) requiring careful consideration\n"
        reasoning += "Multiple factors considered in final decision\n"

    return reasoning

def predict_for_student(sample_data):
    """Make prediction for a single student"""
    # Calculate diversity score with reasons
    diversity_score, diversity_reasons = calculate_diversity_score(sample_data)
    sample_data['diversity_score'] = diversity_score

    # Prepare input data
    fields_to_combine = [
        'town', 'diversity_score', 'why_internship',
        'tech_experience', 'non_tech_experience',
        'goals', 'other_comments'
    ]

    # Clean and combine fields
    cleaned_fields = []
    for field in fields_to_combine:
        if field == 'diversity_score':
            cleaned_fields.append(str(sample_data.get(field, 0.0)))
        else:
            cleaned_fields.append(clean_and_lemmatize(str(sample_data.get(field, ""))))

    combined_text = " ".join(cleaned_fields)

    # Tokenize
    inputs = tokenizer(
        combined_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256
    )

    # Move to appropriate device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get prediction
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        predicted_label = torch.argmax(probs, dim=1)[0].item()
        confidence = probs[0][predicted_label].item()

    # Generate reasoning
    detailed_reasoning = get_detailed_reasoning(sample_data, predicted_label, confidence, diversity_reasons)

    # Print full evaluation
    print("\nSTUDENT APPLICATION EVALUATION")
    print("="*80)
    print(f"Location: {sample_data.get('town', 'Not specified')}")
    print(f"Diversity Score: {diversity_score:.2f}")
    print(detailed_reasoning)

    return predicted_label, confidence, detailed_reasoning

# Test sample
sample_student = {
   'town': 'Bay Roberts',
        'identify_as': 'Female',  # Make sure to include this as it was in training
        'why_internship': 'I know the basic of coding and also made my portfolio using python, so i think this internship will help me to leverage my tech skills',
        'tech_experience': 'I have 2years of tech experience right now',
        'non_tech_experience': 'President of school debate club',
        'goals': 'I want to join MUN after high school for computer science field',
        'other_comments': 'Very excited about this opportunity!'
}

# Make prediction
if __name__ == "__main__":
    print("Loading model and making prediction...")
    predicted_label, confidence, reasoning = predict_for_student(sample_student)

[nltk_data] Downloading package punkt to /root/nltk_data...


Downloading required NLTK data...


[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


NLTK data download completed.
Using device: cpu
Initializing BERT model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully from /content/drive/MyDrive/techNL_HSTIP/internship_selection_model.pt
Loading model and making prediction...

STUDENT APPLICATION EVALUATION
Location: Bay Roberts
Diversity Score: 1.00

Selection Decision (Model is very confident - 81.92%)
NOT SELECTED for internship

Technical Background Analysis:
Technical terms mentioned: experience, tech, right
While showing some technical knowledge, may need more depth or practical application

Motivation and Goals Analysis:
Expressed interests: also, portfolio, python, think, tech, coding, internship, help, skills, know, basic, leverage, using, made
Stated goals: field, computer, want, school, high, science, join

Diversity Considerations:
- Gender diversity factor: +0.5 (Female)
- Location diversity factor: +1.0 (High priority - Non-Avalon region (bay roberts))

Detailed Application Review:
Technical Experience:
"I have 2years of tech experience right now"
Unique terms identified: 3

Motivation Statement:
"I know the b