In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from datetime import datetime
from typing import List, Dict, Any, Optional, Union

class ProfileMatcher:
    def __init__(self, profiles: Dict[str, Dict[str, Any]]):
        """
        Initialize the ProfileMatcher with a dictionary of user profiles.

        Parameters:
        profiles (dict): Dictionary of user profile objects keyed by user ID
        """
        self.raw_profiles = profiles
        self.profiles = []

        # Convert to list format for processing
        for user_id, profile_data in profiles.items():
            profile = {
                'id': user_id,
                'name': profile_data.get('fullName', ''),
                'industry': profile_data.get('industry', ''),
                'skills': profile_data.get('skills', []),
                'bio': profile_data.get('bio', ''),
                'education': profile_data.get('education', []),
                'certifications': profile_data.get('certifications', []),
                'innovations': profile_data.get('innovations', []),
                'packages': profile_data.get('packages', []),
                'looking_for': self._extract_looking_for(profile_data.get('infoItems', [])),
                'availability': profile_data.get('availableDates', [])
            }
            self.profiles.append(profile)

        self.profile_df = pd.DataFrame(self.profiles)

    def _extract_looking_for(self, info_items: List[Dict[str, str]]) -> List[str]:
        """Extract 'Looking for' information from infoItems"""
        for item in info_items:
            if item.get('title', '').lower() == 'looking for':
                content = item.get('content', '')
                # Split by common separators and clean up
                return [x.strip() for x in content.replace(',', ' ').replace('and', ' ').split() if x.strip()]
        return []

    def _extract_technologies(self, innovations: List[Dict[str, Any]]) -> List[str]:
        """Extract all technologies from innovations"""
        techs = []
        for innovation in innovations:
            techs.extend(innovation.get('technologies', []))
        return list(set(techs))  # Remove duplicates

    def _get_education_level(self, education: List[Dict[str, str]]) -> int:
        """Determine education level score"""
        if not education:
            return 0

        highest_level = 0
        for edu in education:
            degree = edu.get('degree', '').lower()
            if 'ph.d' in degree or 'doctor' in degree:
                highest_level = max(highest_level, 4)
            elif 'master' in degree or 'ms' in degree or 'ma' in degree or 'mba' in degree:
                highest_level = max(highest_level, 3)
            elif 'bachelor' in degree or 'ba' in degree or 'bs' in degree:
                highest_level = max(highest_level, 2)
            else:
                highest_level = max(highest_level, 1)

        return highest_level

    def preprocess_profiles(self):
        """Preprocess profile data for matching.

        Adds these columns to ``self.profile_df``:
        - 'technologies': distinct technologies across a profile's innovations
        - 'combined_text': bio, skills, industry, technologies and innovation
          titles/descriptions joined into one text
        - 'education_level': score from _get_education_level
        - 'availability_dates': availability entries as 'YYYY-MM-DD' strings

        Also fits a TF-IDF vectorizer on 'combined_text' and stores it as
        ``self.vectorizer`` with the resulting matrix in ``self.text_matrix``.
        """
        def join_words(values):
            # Non-list cells (e.g. missing data) contribute no text
            return ' '.join(values) if isinstance(values, list) else ''

        def describe_innovations(innovations):
            if not isinstance(innovations, list):
                return ''
            return ' '.join(
                f"{inv.get('title', '')} {inv.get('description', '')}"
                for inv in innovations
            )

        def normalize_dates(dates):
            if not isinstance(dates, list):
                return []
            return [
                d if isinstance(d, str)
                else d.strftime('%Y-%m-%d') if hasattr(d, 'strftime')
                else ''
                for d in dates
            ]

        df = self.profile_df

        df['technologies'] = df['innovations'].apply(self._extract_technologies)

        # Every part is followed by a space. Without the separator after the
        # technologies, the last technology and the first innovation title
        # would fuse into a single token and distort the TF-IDF vocabulary.
        df['combined_text'] = (
            df['bio'].fillna('') + ' '
            + df['skills'].apply(join_words) + ' '
            + df['industry'].fillna('') + ' '
            + df['technologies'].apply(join_words) + ' '
            + df['innovations'].apply(describe_innovations)
        )

        # Vectorize the text data
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.text_matrix = self.vectorizer.fit_transform(df['combined_text'])

        # Calculate education levels
        df['education_level'] = df['education'].apply(self._get_education_level)

        # Normalize timestamps for availability
        df['availability_dates'] = df['availability'].apply(normalize_dates)

    def calculate_skill_match(self, skills1: List[str], skills2: List[str]) -> float:
        """Return the Jaccard similarity of two skill lists.

        Skills are compared case-insensitively. The score is the number of
        shared skills divided by the number of distinct skills overall, and
        0.0 when either list is empty.
        """
        if not (skills1 and skills2):
            return 0.0

        left = {skill.lower() for skill in skills1}
        right = {skill.lower() for skill in skills2}

        combined = left | right
        if not combined:
            return 0.0

        shared = left & right
        return len(shared) / len(combined)

    def calculate_innovation_match(self, innovations1: List[Dict], innovations2: List[Dict]) -> float:
        """Score two innovation lists by the technologies they share.

        Technologies are gathered (lowercased) from every innovation that has
        a list under 'technologies'. The score is the number of shared
        technologies divided by the size of the larger technology set. When
        either side has no innovations or no technologies, the neutral score
        0.3 is returned.
        """
        def gather(innovations):
            names = set()
            for entry in innovations:
                if 'technologies' in entry and isinstance(entry['technologies'], list):
                    names |= {name.lower() for name in entry['technologies']}
            return names

        if not innovations1 or not innovations2:
            return 0.3  # Default neutral score

        first = gather(innovations1)
        second = gather(innovations2)

        if not (first and second):
            return 0.3

        shared = first & second
        return len(shared) / max(len(first), len(second))

    def calculate_availability_match(self, dates1: List, dates2: List) -> float:
        """Score two availability lists by whether they share any date.

        Strings are compared as given; objects with ``strftime`` are rendered
        as 'YYYY-MM-DD'; anything else is compared by its ``str()`` form.

        Returns 0.5 when either list is empty, 1.0 when at least one date is
        common to both, and 0.2 otherwise.
        """
        def as_key(value):
            if isinstance(value, str):
                return value
            if hasattr(value, 'strftime'):
                return value.strftime('%Y-%m-%d')
            return str(value)

        if not dates1 or not dates2:
            return 0.5  # Neutral score if availability isn't specified

        first_days = {as_key(entry) for entry in dates1}
        second_days = {as_key(entry) for entry in dates2}

        # One shared date is enough for the top score
        return 1.0 if first_days & second_days else 0.2

    def calculate_looking_for_match(self, profile1: Dict, profile2: Dict) -> float:
        """Calculate match based on what users are looking for.

        Checks, in both directions, whether one profile's industry or skills
        occur (case-insensitively, as a substring) in the other profile's
        'looking_for' entries.

        Parameters:
        profile1 (dict): Profile with 'looking_for', 'industry' and 'skills'
        profile2 (dict): Profile with the same keys

        Returns:
        float: 0.5 when neither profile states what it is looking for;
            1.0 on an industry hit; 0.8 on a skill hit; 0.6 when either side
            uses a collaboration keyword; 0.3 otherwise
        """
        def wanted(profile):
            return [lf.lower() for lf in (profile.get('looking_for') or []) if isinstance(lf, str)]

        def industry_terms(profile):
            industry = profile.get('industry') or ''
            return [industry.lower()] if isinstance(industry, str) else []

        def skill_terms(profile):
            return [s.lower() for s in (profile.get('skills') or []) if isinstance(s, str)]

        def mentions(entries, terms):
            # Blank terms are skipped: the empty string is a substring of every
            # entry, so a profile without an industry would otherwise count as
            # a perfect industry match.
            usable = [term for term in terms if term.strip()]
            return any(term in entry for entry in entries for term in usable)

        looking_for1 = wanted(profile1)
        looking_for2 = wanted(profile2)

        if not profile1.get('looking_for') and not profile2.get('looking_for'):
            return 0.5  # Neutral score

        # Either direction counts: profile1 wants profile2, or the reverse
        industry_hit = (
            mentions(looking_for1, industry_terms(profile2))
            or mentions(looking_for2, industry_terms(profile1))
        )
        if industry_hit:
            return 1.0

        skill_hit = (
            mentions(looking_for1, skill_terms(profile2))
            or mentions(looking_for2, skill_terms(profile1))
        )
        if skill_hit:
            return 0.8

        # Check for keywords like "collaboration" or "partnership"
        collaboration_keywords = ['collaborat', 'partner', 'work with', 'team up']
        if mentions(looking_for1 + looking_for2, collaboration_keywords):
            return 0.6
        return 0.3

    def calculate_education_match(self, edu_level1: int, edu_level2: int) -> float:
        """Score how well two education levels go together.

        A level of 0 means "not specified" and yields the neutral 0.5.
        Otherwise equal levels score 0.8, levels one step apart score 0.7,
        and any larger gap scores 0.5.
        """
        if 0 in (edu_level1, edu_level2):
            return 0.5  # Neutral if education not specified

        # Score by the distance between the two levels
        score_by_gap = {0: 0.8, 1: 0.7}
        return score_by_gap.get(abs(edu_level1 - edu_level2), 0.5)

    def generate_match_reason(self, profile1: Dict, profile2: Dict,
                             text_similarity: float, skill_score: float,
                             innovation_score: float, looking_for_score: float) -> List[str]:
        """Generate human-readable reasons for the match"""
        reasons = []

        # Bio and industry similarity
        if text_similarity > 0.5:
            if profile1.get('industry') == profile2.get('industry'):
                reasons.append(f"Both work in the {profile1.get('industry')} industry")
            else:
                reasons.append("Similar professional backgrounds")

        # Skill complementarity
        if skill_score > 0.3:
            shared_skills = set([s.lower() for s in profile1.get('skills', [])])
            shared_skills = shared_skills.intersection([s.lower() for s in profile2.get('skills', [])])
            if shared_skills:
                top_skills = list(shared_skills)[:3]
                reasons.append(f"Shared expertise in: {', '.join(top_skills)}")

        # Innovation compatibility
        if innovation_score > 0.5:
            tech1 = set()
            for inv in profile1.get('innovations', []):
                if 'technologies' in inv and isinstance(inv['technologies'], list):
                    tech1.update([t.lower() for t in inv['technologies']])

            tech2 = set()
            for inv in profile2.get('innovations', []):
                if 'technologies' in inv and isinstance(inv['technologies'], list):
                    tech2.update([t.lower() for t in inv['technologies']])

            shared_techs = tech1.intersection(tech2)
            if shared_techs:
                top_techs = list(shared_techs)[:3]
                reasons.append(f"Complementary technologies: {', '.join(top_techs)}")

        # Looking for match
        looking_for1 = profile1.get('looking_for', [])
        looking_for2 = profile2.get('looking_for', [])

        if looking_for1 and looking_for_score > 0.7:
            industry2 = profile2.get('industry', '').lower()
            if any(industry2 in lf.lower() for lf in looking_for1 if isinstance(lf, str)):
                reasons.append(f"{profile1.get('name')} is looking for expertise in {profile2.get('industry')}")

        if looking_for2 and looking_for_score > 0.7:
            industry1 = profile1.get('industry', '').lower()
            if any(industry1 in lf.lower() for lf in looking_for2 if isinstance(lf, str)):
                reasons.append(f"{profile2.get('name')} is looking for expertise in {profile1.get('industry')}")

        # Check for availability overlap
        dates1 = profile1.get('availability', [])
        dates2 = profile2.get('availability', [])

        if dates1 and dates2:
            dates1_str = [d if isinstance(d, str) else d.strftime('%Y-%m-%d') if hasattr(d, 'strftime') else str(d)
                          for d in dates1]
            dates2_str = [d if isinstance(d, str) else d.strftime('%Y-%m-%d') if hasattr(d, 'strftime') else str(d)
                          for d in dates2]

            overlap = set(dates1_str).intersection(set(dates2_str))
            if overlap:
                reasons.append(f"Both available on: {', '.join(sorted(list(overlap))[:2])}")

        # Check for innovation potential
        if profile1.get('innovations') and profile2.get('innovations'):
            p1_titles = [inv.get('title', '') for inv in profile1.get('innovations', [])]
            p2_titles = [inv.get('title', '') for inv in profile2.get('innovations', [])]

            if p1_titles and p2_titles:
                reasons.append("Potential for innovation collaboration")

        # Check for mentorship potential based on education
        edu_level1 = self._get_education_level(profile1.get('education', []))
        edu_level2 = self._get_education_level(profile2.get('education', []))

        if edu_level1 > edu_level2 and edu_level1 >= 3:
            reasons.append(f"Mentorship potential: {profile1.get('name')} can mentor {profile2.get('name')}")
        elif edu_level2 > edu_level1 and edu_level2 >= 3:
            reasons.append(f"Mentorship potential: {profile2.get('name')} can mentor {profile1.get('name')}")

        # If no specific reasons, give a generic one based on overall match
        if not reasons:
            if text_similarity > 0.3:
                reasons.append("Overall profile compatibility based on background and interests")
            else:
                reasons.append("Potential networking opportunity with different perspectives")

        return reasons

    def find_matches(self, profile_id: str, top_n: int = 5) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
        """
        Find the top N matches for a specific profile

        Parameters:
        profile_id (str): ID of the profile to match
        top_n (int): Number of top matches to return

        Returns:
        list: List of dictionaries containing match info
        """
        # Preprocess if not done already
        if not hasattr(self, 'text_matrix'):
            self.preprocess_profiles()

        # Find the profile
        try:
            profile_idx = self.profile_df[self.profile_df['id'] == profile_id].index[0]
        except IndexError:
            return {"error": f"Profile with ID {profile_id} not found"}

        profile = self.profile_df.iloc[profile_idx].to_dict()

        # Calculate text similarity scores
        text_similarities = cosine_similarity(self.text_matrix[profile_idx:profile_idx+1], self.text_matrix).flatten()

        # Create results dataframe
        results = pd.DataFrame({
            'index': range(len(self.profile_df)),
            'id': self.profile_df['id'],
            'name': self.profile_df['name'],
            'text_similarity': text_similarities
        })

        # Remove self from results
        results = results[results['id'] != profile_id]

        # Calculate additional match scores
        match_scores = []
        for idx, row in results.iterrows():
            other_profile = self.profile_df.iloc[row['index']].to_dict()

            # Skill match
            skill_score = self.calculate_skill_match(
                profile.get('skills', []),
                other_profile.get('skills', [])
            )

            # Innovation match
            innovation_score = self.calculate_innovation_match(
                profile.get('innovations', []),
                other_profile.get('innovations', [])
            )

            # Looking for match
            looking_for_score = self.calculate_looking_for_match(profile, other_profile)

            # Availability match
            availability_score = self.calculate_availability_match(
                profile.get('availability', []),
                other_profile.get('availability', [])
            )

            # Education match
            education_score = self.calculate_education_match(
                profile.get('education_level', 0),
                other_profile.get('education_level', 0)
            )

            # Calculate final match score (weighted average)
            final_score = (
                text_similarities[row['index']] * 0.3 +  # Text similarity
                skill_score * 0.25 +                    # Skill match
                innovation_score * 0.15 +               # Innovation compatibility
                looking_for_score * 0.15 +              # Looking for match
                availability_score * 0.10 +             # Availability overlap
                education_score * 0.05                  # Education match
            )

            match_scores.append({
                'id': row['id'],
                'skill_score': skill_score,
                'innovation_score': innovation_score,
                'looking_for_score': looking_for_score,
                'availability_score': availability_score,
                'education_score': education_score,
                'final_score': final_score
            })

        # Add scores to results
        match_scores_df = pd.DataFrame(match_scores)
        results = results.merge(match_scores_df, on='id')

        # Sort by final match score
        results = results.sort_values('final_score', ascending=False).head(top_n)

        # Generate match reasons
        matches = []
        for _, row in results.iterrows():
            match_profile = self.profile_df.iloc[row['index']].to_dict()
            reasons = self.generate_match_reason(
                profile,
                match_profile,
                row['text_similarity'],
                row['skill_score'],
                row['innovation_score'],
                row['looking_for_score']
            )

            matches.append({
                'id': row['id'],
                'name': match_profile.get('name', ''),
                'industry': match_profile.get('industry', ''),
                'match_score': round(row['final_score'] * 100),  # Convert to percentage
                'match_reasons': reasons
            })

        return matches

    def find_all_matches(self, top_n: int = 5) -> Dict[str, List[Dict[str, Any]]]:
        """Find matches for all profiles in the dataset"""
        results = {}
        for profile_id in self.profile_df['id']:
            results[profile_id] = self.find_matches(profile_id, top_n)
        return results


# Example usage
# Demo: builds three sample profiles, then prints the matches for one user
# and for every user as JSON.
if __name__ == "__main__":
    # Sample profiles based on provided format
    # The dictionary key is the user ID used for lookups; the display name
    # comes from "fullName" (the "johnsmith" entry is named "Alex Chen").
    sample_profiles = {
        "johnsmith": {
            "fullName": "Alex Chen",
            "industry": "AI & Robotics Innovation",
            "skills": ["Machine Learning", "Robotics", "Computer Vision", "Python", "ROS"],
            "bio": "AI and robotics innovator specializing in autonomous systems. Founder of RoboAI Solutions, developing next-generation healthcare and manufacturing robotics. Previously led R&D at Boston Dynamics with focus on human-robot interaction.",
            "education": [
                {
                  "institution": "Massachusetts Institute of Technology",
                  "degree": "Ph.D. in Robotics Engineering",
                  "startDate": "2015",
                  "endDate": "2019"
                },
                {
                  "institution": "Stanford University",
                  "degree": "MS in Computer Science",
                  "startDate": "2013",
                  "endDate": "2015"
                }
            ],
            "certifications": [
                { "name": "TensorFlow Advanced Certification", "issuer": "Google", "year": "2022" },
                { "name": "AWS Machine Learning Specialty", "issuer": "Amazon", "year": "2021" },
                { "name": "Robotics System Architecture", "issuer": "ROS Industrial", "year": "2020" }
            ],
            "posts": [],
            "infoItems": [
                { "title": "Looking for", "content": "Collaboration with researchers and engineers in AI robotics" }
            ],
            "innovations": [
                {
                  "id": "innovation-1",
                  "title": "HealthBot Assistant",
                  "description": "An AI-powered robotic assistant for hospitals that automates routine tasks and enhances patient care. Implemented in 5 major hospitals, reducing nurse workload by 30%.",
                  "status": "In Progress",
                  "technologies": ["ROS", "Python", "TensorFlow", "Computer Vision", "Sensor Fusion"]
                },
                {
                  "id": "innovation-2",
                  "title": "Smart Manufacturing Automation",
                  "description": "Developed an intelligent robotic system for flexible manufacturing, featuring real-time adaptation and collaborative robot-human workflows.",
                  "status": "Completed",
                  "technologies": ["Industrial IoT", "Machine Learning", "Robot Operating System", "C++"]
                },
                {
                  "id": "innovation-3",
                  "title": "Autonomous Navigation Framework",
                  "description": "Open-source framework for robust indoor-outdoor robot navigation, used by over 1000 developers worldwide.",
                  "status": "In Progress",
                  "technologies": ["SLAM", "Path Planning", "Sensor Integration", "Python"]
                }
            ],
            "packages": [
                {
                  "id": "robotics-consulting",
                  "name": "Robotics Innovation Consulting",
                  "price": 2500,
                  "description": "Strategic consulting for robotics implementation",
                  "features": [
                    "Technical Architecture Review",
                    "AI Integration Strategy",
                    "Robotics System Design",
                    "Implementation Roadmap"
                  ]
                },
                {
                  "id": "startup-mentoring",
                  "name": "AI Startup Mentoring",
                  "price": 1500,
                  "description": "Guidance for AI/robotics startups",
                  "features": [
                    "Business Model Validation",
                    "Technical Stack Review",
                    "Market Entry Strategy",
                    "Funding Preparation"
                  ]
                },
                {
                  "id": "workshop-training",
                  "name": "Hands-on Robotics Workshop",
                  "price": 3500,
                  "description": "Intensive training for teams",
                  "features": [
                    "Custom Curriculum",
                    "Practical Exercises",
                    "Code Reviews",
                    "Implementation Support",
                    "3-Month Follow-up"
                  ]
                }
            ],
            # Availability is given as datetime objects rather than strings
            "availableDates": [
                datetime(2024, 3, 15),
                datetime(2024, 3, 16),
                datetime(2024, 3, 20),
                datetime(2024, 3, 21)
            ]
        },
        "janedoe": {
            "fullName": "Jane Doe",
            "industry": "UX Design",
            "skills": ["Figma", "User Research", "Prototyping"],
            "bio": "UX designer focused on creating intuitive and accessible digital experiences.",
            "education": [{
                "institution": "Design Academy",
                "degree": "BA Design",
                "startDate": "2019",
                "endDate": "2019"
            }],
            "certifications": [{ "name": "Certified UX Professional", "issuer": "UX Alliance", "year": "2020" }],
            "posts": [],
            "infoItems": [
                { "title": "Looking for", "content": "Collaboration with developers and product managers" },
            ],
            "innovations": [
                {
                  "id": "innovation-1",
                  "title": "Accessibility Design System",
                  "description": "A comprehensive design system focused on making web applications more accessible to all users.",
                  "status": "In Progress",
                  "technologies": ["Figma", "WCAG", "Design Systems"]
                }
            ],
            "packages": [
                {
                  "id": "ux-basic",
                  "name": "UX Audit",
                  "price": 299,
                  "description": "Comprehensive UX analysis",
                  "features": ["Heuristic Evaluation", "User Flow Analysis", "Basic Recommendations"],
                }
            ],
            "availableDates": [
                datetime(2024, 3, 10),
                datetime(2024, 3, 15),
                datetime(2024, 3, 22)
            ]
        }
    }

    # Create additional test profiles
    sample_profiles["mariagarcia"] = {
        "fullName": "Maria Garcia",
        "industry": "AI Software Development",
        "skills": ["Python", "TensorFlow", "Machine Learning", "NLP", "Computer Vision"],
        "bio": "AI developer specializing in natural language processing and computer vision applications. Building tools that make AI more accessible to businesses.",
        "education": [
            {
                "institution": "University of California, Berkeley",
                "degree": "MS in Computer Science",
                "startDate": "2017",
                "endDate": "2019"
            }
        ],
        "certifications": [
            {"name": "Deep Learning Specialization", "issuer": "Coursera", "year": "2021"},
            {"name": "Professional Machine Learning Engineer", "issuer": "Google Cloud", "year": "2022"}
        ],
        "posts": [],
        "infoItems": [
            {"title": "Looking for", "content": "Collaboration with robotics engineers and AI researchers"}
        ],
        "innovations": [
            {
                "id": "innovation-1",
                "title": "NLP Analysis Framework",
                "description": "A framework for analyzing customer feedback and support tickets using advanced NLP techniques.",
                "status": "Completed",
                "technologies": ["Python", "TensorFlow", "BERT", "NLP"]
            },
            {
                "id": "innovation-2",
                "title": "Computer Vision Quality Control",
                "description": "An automated quality control system for manufacturing using computer vision.",
                "status": "In Progress",
                "technologies": ["Computer Vision", "Python", "TensorFlow", "Edge Computing"]
            }
        ],
        "packages": [
            {
                "id": "ai-implementation",
                "name": "AI Implementation Strategy",
                "price": 1800,
                "description": "Tailored AI strategy for businesses",
                "features": [
                    "Needs Assessment",
                    "Solution Architecture",
                    "Implementation Plan",
                    "ROI Analysis"
                ]
            }
        ],
        "availableDates": [
            datetime(2024, 3, 16),
            datetime(2024, 3, 18),
            datetime(2024, 3, 21)
        ]
    }

    # Create the matcher and find matches
    matcher = ProfileMatcher(sample_profiles)

    # Find matches for a specific user
    johnsmith_matches = matcher.find_matches("johnsmith")
    print(f"Matches for {sample_profiles['johnsmith']['fullName']}:")
    # default=str makes json.dumps fall back to str() for any value it
    # cannot serialize natively
    print(json.dumps(johnsmith_matches, indent=2, default=str))

    # Find matches for all users
    all_matches = matcher.find_all_matches()
    print("\nAll matches:")
    print(json.dumps(all_matches, indent=2, default=str))

Matches for Alex Chen:
[
  {
    "id": "mariagarcia",
    "name": "Maria Garcia",
    "industry": "AI Software Development",
    "match_score": 47,
    "match_reasons": [
      "Shared expertise in: python, computer vision, machine learning",
      "Both available on: 2024-03-16, 2024-03-21",
      "Potential for innovation collaboration",
      "Mentorship potential: Alex Chen can mentor Maria Garcia"
    ]
  },
  {
    "id": "janedoe",
    "name": "Jane Doe",
    "industry": "UX Design",
    "match_score": 22,
    "match_reasons": [
      "Both available on: 2024-03-15",
      "Potential for innovation collaboration",
      "Mentorship potential: Alex Chen can mentor Jane Doe"
    ]
  }
]

All matches:
{
  "johnsmith": [
    {
      "id": "mariagarcia",
      "name": "Maria Garcia",
      "industry": "AI Software Development",
      "match_score": 47,
      "match_reasons": [
        "Shared expertise in: python, computer vision, machine learning",
        "Both available on: 2024-03

In [7]:
import os
import json
import time
import random
from typing import Dict, List, Any
import google.generativeai as genai
from google.api_core.exceptions import NotFound, ResourceExhausted

# Set up Gemini API
def setup_gemini_api(api_key: str) -> None:
    """Initialize the Gemini API with the provided API key.

    Passes the key to ``genai.configure``; nothing is returned.

    Parameters:
    api_key (str): API key handed to the google.generativeai client
    """
    genai.configure(api_key=api_key)

# Define the profile data structure based on the provided schema
class ProfileData:
    """Attribute-style view of one raw profile dictionary.

    Each known field is copied from the dictionary onto the instance. A field
    missing from the dictionary becomes an empty string (text fields) or a
    new empty list (collection fields).
    """

    def __init__(self, data: Dict[str, Any]):
        # (field name, factory producing the default for a missing key);
        # calling the factory per instance keeps list defaults unshared.
        field_defaults = (
            ("fullName", str),
            ("industry", str),
            ("skills", list),
            ("bio", str),
            ("education", list),
            ("certifications", list),
            ("innovations", list),
            ("packages", list),
            ("infoItems", list),
        )
        for field_name, make_default in field_defaults:
            setattr(self, field_name, data.get(field_name, make_default()))

def extract_profile_summary(profile: ProfileData) -> str:
    """Extract key information from a profile for matching purposes.

    Parameters:
    profile (ProfileData): Profile to summarize

    Returns:
    str: Multi-line text listing name, industry, skills, bio, education,
        certifications, innovations (descriptions limited to 100 characters)
        and the content of the "Looking for" info item
    """
    def shorten(text: str, limit: int = 100) -> str:
        # Mark the text with an ellipsis only when something was really cut off
        return f"{text[:limit]}..." if len(text) > limit else text

    education_str = ", ".join(
        f"{edu.get('degree', '')} from {edu.get('institution', '')}"
        for edu in profile.education
    )

    cert_str = ", ".join(
        f"{cert.get('name', '')} from {cert.get('issuer', '')}"
        for cert in profile.certifications
    )

    innovation_str = ", ".join(
        f"{inno.get('title', '')}: {shorten(inno.get('description') or '')} "
        f"Technologies: {', '.join(inno.get('technologies') or [])}"
        for inno in profile.innovations
    )

    # The title is compared case-insensitively, the same way
    # ProfileMatcher._extract_looking_for treats it.
    looking_for = next(
        (
            item.get("content", "")
            for item in profile.infoItems
            if (item.get("title") or "").lower() == "looking for"
        ),
        "",
    )

    summary = f"""
    Name: {profile.fullName}
    Industry: {profile.industry}
    Skills: {', '.join(profile.skills)}
    Bio: {profile.bio}
    Education: {education_str}
    Certifications: {cert_str}
    Innovations: {innovation_str}
    Looking for: {looking_for}
    """
    return summary

def get_available_models():
    """Return the names of the models reported by the API that contain "gemini".

    The check on the name is case-insensitive. Any error raised while listing
    is printed and an empty list is returned instead.
    """
    try:
        return [
            candidate.name
            for candidate in genai.list_models()
            if "gemini" in candidate.name.lower()
        ]
    except Exception as err:
        print(f"Error listing models: {err}")
        return []

def _extract_json_payload(response_text: str) -> str:
    """Return the JSON part of a model reply, removing a Markdown code fence if present."""
    if "```json" in response_text:
        return response_text.split("```json")[1].split("```")[0].strip()
    if "```" in response_text:
        return response_text.split("```")[1].split("```")[0].strip()
    return response_text


def _coerce_match_score(raw_score: Any, default: float = 0) -> float:
    """Turn a model-supplied score into a number clamped to the range 0-100.

    Numbers are clamped; other values are searched for their first numeric
    substring (so "85" and "85/100" both give 85). ``default`` is returned when
    no usable number is found (None, booleans, NaN, free text).
    """
    import re  # function-scope import: `re` is not imported at module level

    if raw_score is None or isinstance(raw_score, bool):
        return default
    if isinstance(raw_score, (int, float)):
        score = raw_score
    else:
        number = re.search(r"\d+(?:\.\d+)?", str(raw_score))
        if number is None:
            return default
        score = float(number.group())
    if score != score:  # NaN compares unequal to itself and cannot be ordered
        return default
    score = max(0, min(100, score))
    if isinstance(score, float) and score.is_integer():
        score = int(score)
    return score


def _build_match_result(profile_id: str, profile_data: Any, match_score: Any,
                        match_reasoning: Any, collaboration_opportunities: Any,
                        key_match_points: Any) -> Dict[str, Any]:
    """Assemble one match entry in the shape shared by every result path."""
    return {
        "profile_id": profile_id,
        "name": profile_data.fullName,
        "industry": profile_data.industry,
        "match_score": match_score,
        "match_reasoning": match_reasoning,
        "collaboration_opportunities": collaboration_opportunities,
        "key_match_points": key_match_points,
    }


def find_matches(user_id: str, user_profile: ProfileData, all_profiles: Dict[str, ProfileData],
                model_name: str = "gemini-2.0-pro-exp-02-05", num_matches: int = 3) -> List[Dict[str, Any]]:
    """Find the closest matching profiles for a given user using Gemini AI.

    Every other profile in ``all_profiles`` is compared with ``user_profile``
    through one ``generate_content`` call per pair. Quota errors
    (``ResourceExhausted``) are retried with a growing delay; any other error,
    or an unusable reply, produces a placeholder entry with a score of 50 so
    that every candidate still appears in the ranking.

    Parameters:
    user_id (str): Key of the user in ``all_profiles``; that entry is skipped.
    user_profile (ProfileData): Profile to find matches for.
    all_profiles (dict): Candidate profiles keyed by user ID.
    model_name (str): Name passed to ``genai.GenerativeModel``.
    num_matches (int): Maximum number of entries to return.

    Returns:
    list: Up to ``num_matches`` result dicts sorted by ``match_score``
    (descending). An empty list is returned when the model instance cannot be
    created. Scores are always numeric and within 0-100, so sorting cannot
    fail on a malformed model reply.
    """
    import re  # function-scope import: `re` is not imported at module level

    user_summary = extract_profile_summary(user_profile)

    # Create a model instance with the specified model
    try:
        model = genai.GenerativeModel(model_name)
    except Exception as e:
        print(f"Error creating model instance with {model_name}: {e}")
        # Return empty matches if model can't be created
        return []

    potential_matches = [
        (profile_id, profile_data, extract_profile_summary(profile_data))
        for profile_id, profile_data in all_profiles.items()
        if profile_id != user_id  # Skip the same user
    ]

    max_retries = 4
    results = []
    for match_idx, (profile_id, profile_data, profile_summary) in enumerate(potential_matches):
        print(f"Processing match {match_idx+1}/{len(potential_matches)}: {user_profile.fullName} with {profile_data.fullName}")

        # Create a prompt for Gemini to analyze the match
        prompt = f"""
        You are an AI-powered networking algorithm for an entrepreneurial platform called iNNov8.
        Your task is to analyze how well two profiles match in terms of potential collaboration,
        mentorship, or investment opportunities.

        Profile 1:
        {user_summary}

        Profile 2:
        {profile_summary}

        Rate the match on a scale of 0-100 based on:
        1. Skill complementarity (similar or complementary skills)
        2. Industry alignment
        3. Potential collaboration opportunities
        4. Mentor/mentee relationship potential
        5. Innovation synergies

        Return your response in JSON format with these fields:
        - match_score: number between 0-100
        - match_reasoning: detailed explanation of why these profiles match
        - collaboration_opportunities: specific ways these individuals could work together
        - key_match_points: list of the strongest matching points
        """

        # The delay restarts for every pair and triples after each quota error.
        retry_delay = 5

        for retry_attempt in range(max_retries + 1):
            try:
                # Generate match analysis using Gemini
                response = model.generate_content(prompt)
                response_text = response.text

                try:
                    match_data = json.loads(_extract_json_payload(response_text))
                    # A JSON array or scalar has no fields to read; treat it
                    # like unparseable text so the regex fallback still runs.
                    if not isinstance(match_data, dict):
                        raise ValueError("model reply is not a JSON object")
                except ValueError:  # json.JSONDecodeError is a ValueError subclass
                    print(f"Response is not valid JSON for {user_id} - {profile_id}. Attempting to parse manually.")
                    print(f"Response: {response_text[:100]}...")  # Print the first 100 chars for debugging

                    # Salvage just the score from the raw text; default to 50.
                    score_match = re.search(r'match_score"?\s*:\s*(\d+)', response_text)
                    match_score = _coerce_match_score(score_match.group(1), default=50) if score_match else 50

                    result = _build_match_result(
                        profile_id, profile_data, match_score,
                        "Unable to generate structured analysis. The profiles appear to have potential for collaboration based on their background.",
                        "Consider exploring mutual interests in their respective domains.",
                        ["Industry knowledge", "Skill complementarity"],
                    )
                else:
                    result = _build_match_result(
                        profile_id, profile_data,
                        # The model may return the score as a string or null;
                        # normalise it so the final sort compares numbers only.
                        _coerce_match_score(match_data.get("match_score", 0)),
                        match_data.get("match_reasoning", ""),
                        match_data.get("collaboration_opportunities", ""),
                        match_data.get("key_match_points", []),
                    )
                # A reply was received (structured or salvaged): stop retrying.
                break

            except ResourceExhausted as quota_error:
                if retry_attempt < max_retries:
                    # Back off with jitter before the next attempt
                    jitter = random.uniform(0, 0.3 * retry_delay)
                    sleep_time = retry_delay + jitter
                    print(f"Quota exceeded. Retrying in {sleep_time:.2f}s... (Attempt {retry_attempt + 1}/{max_retries})")
                    time.sleep(sleep_time)
                    retry_delay *= 3
                else:
                    print(f"Error after {max_retries} retries for match between {user_id} and {profile_id}: {quota_error}")
                    # Retries exhausted: record a neutral placeholder
                    result = _build_match_result(
                        profile_id, profile_data, 50,
                        "Analysis not available due to API quota limits.",
                        "Consider exploring potential synergies manually.",
                        ["Manual review recommended due to API limitations"],
                    )
            except Exception as e:
                print(f"Error generating content for match between {user_id} and {profile_id}: {e}")
                result = _build_match_result(
                    profile_id, profile_data, 50,
                    "Unable to generate detailed match analysis due to an error.",
                    "Consider exploring potential synergies manually.",
                    ["Manual review recommended"],
                )
                break  # Exit retry loop on non-quota errors

        results.append(result)

        # Add delay between processing individual matches within a profile
        if match_idx < len(potential_matches) - 1:
            match_delay = random.uniform(5.0, 10.0)  # 5-10 second delay between matches
            print(f"Waiting {match_delay:.2f}s before processing next match pair...")
            time.sleep(match_delay)

    # Sort by match score (descending) and take top matches
    results.sort(key=lambda x: x["match_score"], reverse=True)
    return results[:num_matches]

def process_all_profiles(profiles: Dict[str, Dict[str, Any]], api_key: str,
                        num_matches: int = 3) -> Dict[str, List[Dict[str, Any]]]:
    """Run Gemini-based matching for every profile and collect the results.

    Parameters:
    profiles (dict): Raw profile dictionaries keyed by user ID.
    api_key (str): Key used to configure the Gemini client.
    num_matches (int): Maximum number of matches kept per user.

    Returns:
    dict: User ID mapped to the list returned by ``find_matches`` for that user.

    A random pause is taken before the first request and between consecutive
    profiles (none after the last one).
    """
    setup_gemini_api(api_key)

    # The model is fixed here rather than chosen from the available models.
    model_to_use = "gemini-2.0-pro-exp-02-05"
    print(f"Using model: {model_to_use}")

    # Wrap each raw dictionary in a ProfileData object.
    profile_objects = {}
    for user_id, raw_profile in profiles.items():
        profile_objects[user_id] = ProfileData(raw_profile)

    # Pause once before issuing any request.
    initial_wait = random.uniform(5.0, 10.0)
    print(f"Waiting {initial_wait:.2f}s before starting API requests...")
    time.sleep(initial_wait)

    all_matches = {}
    last_position = len(profile_objects) - 1
    for position, user_id in enumerate(profile_objects):
        current_profile = profile_objects[user_id]
        print(f"Finding matches for {current_profile.fullName}...")
        all_matches[user_id] = find_matches(
            user_id, current_profile, profile_objects, model_to_use, num_matches
        )

        if position == last_position:
            # Nothing follows the final profile, so no pause is needed.
            continue

        # Longer random pause (10-20 seconds) between profiles.
        delay = random.uniform(10.0, 20.0)
        print(f"Waiting {delay:.2f}s before processing next profile to avoid quota issues...")
        time.sleep(delay)

    return all_matches

def _degree_levels(degree: Any) -> set:
    """Return the academic levels ('phd', 'master', 'bachelor') named in a degree title.

    The title is split into words and each word is tested on its own, so an
    abbreviation only counts when it starts a word: "MBA" is a Master's-level
    degree and is no longer mistaken for a "BA".
    """
    if not isinstance(degree, str):
        return set()
    # Drop periods so "Ph.D." / "M.S." / "B.A." collapse to "PhD" / "MS" / "BA",
    # then turn every other non-alphanumeric character into a word break.
    cleaned = "".join(ch if ch.isalnum() else " " for ch in degree.replace(".", ""))
    levels = set()
    for token in cleaned.split():
        lowered = token.lower()
        if lowered.startswith("phd"):
            levels.add("phd")
        elif lowered.startswith("master") or token.startswith(("MS", "MA", "MBA", "MEng")):
            levels.add("master")
        elif lowered.startswith("bachelor") or token.startswith(("BS", "BA", "BEng")):
            levels.add("bachelor")
    return levels


def compare_profiles_without_gemini(user_profile: "ProfileData", other_profile: "ProfileData") -> Dict[str, Any]:
    """Calculate a match score without using Gemini API (fallback method).

    The score is a weighted sum of three parts:
    - skills (40%): share of the user's skills that the other profile also
      lists, compared ignoring case and surrounding whitespace;
    - industry (40%): 100 when the industry strings are equal, otherwise 50;
    - education (20%): based on the first education entry of each profile
      (treated as the highest degree — the data is assumed to be ordered that
      way): 100 when both are doctorates, 90 for two Master's-level degrees,
      80 for two Bachelor's-level degrees, 60 for any other pair, and 0 when
      either profile has no education entries.

    Returns:
    dict: ``match_score`` (rounded int), ``match_reasoning``,
    ``collaboration_opportunities`` and ``key_match_points``.
    """
    def normalized(skills):
        # Case/whitespace-insensitive skill identity; non-string items are ignored.
        return {s.strip().casefold() for s in skills if isinstance(s, str) and s.strip()}

    # Calculate skill overlap
    user_skills_set = normalized(user_profile.skills)
    other_skills_set = normalized(other_profile.skills)
    skill_overlap = len(user_skills_set & other_skills_set)
    skill_score = min(100, (skill_overlap / max(1, len(user_skills_set))) * 100)

    # Industry match
    industry_score = 100 if user_profile.industry == other_profile.industry else 50

    # Education level similarity (rough approximation)
    education_score = 0
    if user_profile.education and other_profile.education:
        user_levels = _degree_levels(user_profile.education[0].get("degree", ""))
        other_levels = _degree_levels(other_profile.education[0].get("degree", ""))
        shared_levels = user_levels & other_levels
        # Highest shared level wins; unrelated or unrecognised levels score 60.
        education_score = 60
        for level, level_score in (("phd", 100), ("master", 90), ("bachelor", 80)):
            if level in shared_levels:
                education_score = level_score
                break

    # Calculate overall score with weights
    overall_score = (skill_score * 0.4) + (industry_score * 0.4) + (education_score * 0.2)

    return {
        "match_score": round(overall_score),
        "match_reasoning": f"Match based on {skill_overlap} shared skills and industry alignment.",
        "collaboration_opportunities": "Consider exploring potential collaborations based on shared skills and interests.",
        "key_match_points": [
            f"Skill overlap: {skill_overlap} shared skills" if skill_overlap > 0 else "Complementary skills",
            f"Industry alignment: {'Same industry' if industry_score == 100 else 'Different but related industries'}"
        ]
    }

def fallback_matching(profiles: Dict[str, Dict[str, Any]], num_matches: int = 3) -> Dict[str, List[Dict[str, Any]]]:
    """Rank every profile against all the others without calling Gemini.

    Parameters:
    profiles (dict): Raw profile dictionaries keyed by user ID. Only the
        profiles in this dictionary are considered as candidates.
    num_matches (int): Maximum number of matches kept per user.

    Returns:
    dict: User ID mapped to that user's best matches, highest score first.
    """
    print("Using fallback matching algorithm without Gemini...")

    # Wrap each raw dictionary in a ProfileData object.
    profile_objects = {}
    for user_id, raw_profile in profiles.items():
        profile_objects[user_id] = ProfileData(raw_profile)

    # Fields copied verbatim from the comparison result into each entry.
    scored_fields = (
        "match_score",
        "match_reasoning",
        "collaboration_opportunities",
        "key_match_points",
    )

    all_matches = {}
    for user_id, user_profile in profile_objects.items():
        candidates = []
        for other_id, other_profile in profile_objects.items():
            if other_id != user_id:
                match_data = compare_profiles_without_gemini(user_profile, other_profile)
                entry = {
                    "profile_id": other_id,
                    "name": other_profile.fullName,
                    "industry": other_profile.industry,
                }
                for field_name in scored_fields:
                    entry[field_name] = match_data[field_name]
                candidates.append(entry)

        # Best score first, then keep only the requested number of entries.
        ranked = sorted(candidates, key=lambda entry: entry["match_score"], reverse=True)
        all_matches[user_id] = ranked[:num_matches]

    return all_matches

def save_matches_to_json(matches: Dict[str, List[Dict[str, Any]]], output_file: str = "profile_matches.json"):
    """Write ``matches`` to ``output_file`` as JSON indented by two spaces.

    The file is created or overwritten, and a confirmation line is printed
    once writing has finished.
    """
    with open(output_file, mode='w') as sink:
        json.dump(matches, sink, indent=2)
    confirmation = f"Match results saved to {output_file}"
    print(confirmation)

def display_matches(matches: Dict[str, List[Dict[str, Any]]]):
    """Print every user's matches to stdout as a numbered, human-readable report.

    Each user gets a header framed by '=' rules; each match shows its name,
    industry, score, reasoning, collaboration opportunities and key match
    points, and is closed by a '-' rule.
    """
    heavy_rule = '=' * 80
    light_rule = '-' * 80

    for user_id in matches:
        print("\n" + heavy_rule)
        print(f"Matches for {user_id}:")
        print(heavy_rule)

        rank = 0
        for match in matches[user_id]:
            rank += 1  # matches are numbered from 1
            print(f"\n{rank}. {match['name']} ({match['industry']})")
            print(f"   Match Score: {match['match_score']}")
            print(f"   Reasoning: {match['match_reasoning']}")
            print("\n   Collaboration Opportunities:")
            print(f"   {match['collaboration_opportunities']}")
            print("\n   Key Match Points:")
            for point in match['key_match_points']:
                print(f"   - {point}")
            print(light_rule)

def process_single_profile(profile_id: str, profile_data: Dict[str, Any], all_profiles: Dict[str, Dict[str, Any]],
                     api_key: str, num_matches: int = 3) -> Dict[str, List[Dict[str, Any]]]:
    """Find Gemini-based matches for one profile only (useful under tight quotas).

    Parameters:
    profile_id (str): Key of the target profile in ``all_profiles``.
    profile_data (dict): Raw data of the target profile. It is not read here;
        the profile is looked up in ``all_profiles`` instead.
    all_profiles (dict): Raw profile dictionaries keyed by user ID.
    api_key (str): Key used to configure the Gemini client.
    num_matches (int): Maximum number of matches to return.

    Returns:
    dict: ``{profile_id: matches}``, the same shape ``process_all_profiles`` uses.
    """
    setup_gemini_api(api_key)

    # The model is fixed here rather than chosen from the available models.
    model_to_use = "gemini-2.0-pro-exp-02-05"
    print(f"Using model: {model_to_use}")

    # Wrap each raw dictionary in a ProfileData object.
    profile_objects = {}
    for user_id, raw_profile in all_profiles.items():
        profile_objects[user_id] = ProfileData(raw_profile)
    user_profile = profile_objects[profile_id]

    print(f"Finding matches for {user_profile.fullName}...")
    return {
        profile_id: find_matches(
            profile_id, user_profile, profile_objects, model_to_use, num_matches
        )
    }

def main():
    """Run the matching demo on three example profiles.

    The Gemini API key is read from the ``GEMINI_API_KEY`` environment
    variable. When it is not set, or when the Gemini path raises, the non-AI
    fallback scorer is used instead. Results are saved to JSON and printed.
    """
    # Load example profiles (in a real application, this might come from a database)
    # This uses the profiles from the provided schema
    profiles = {
        "johnsmith": {
            "fullName": "Alex Chen",
            "industry": "AI & Robotics Innovation",
            "skills": ["Machine Learning", "Robotics", "Computer Vision", "Python", "ROS"],
            "bio": "AI and robotics innovator specializing in autonomous systems. Founder of RoboAI Solutions, developing next-generation healthcare and manufacturing robotics. Previously led R&D at Boston Dynamics with focus on human-robot interaction.",
            "education": [
                {
                    "institution": "Massachusetts Institute of Technology",
                    "degree": "Ph.D. in Robotics Engineering",
                    "startDate": "2015",
                    "endDate": "2019"
                },
                {
                    "institution": "Stanford University",
                    "degree": "MS in Computer Science",
                    "startDate": "2013",
                    "endDate": "2015"
                }
            ],
            "certifications": [
                {"name": "TensorFlow Advanced Certification", "issuer": "Google", "year": "2022"},
                {"name": "AWS Machine Learning Specialty", "issuer": "Amazon", "year": "2021"},
                {"name": "Robotics System Architecture", "issuer": "ROS Industrial", "year": "2020"}
            ],
            "infoItems": [
                {"title": "Looking for", "content": "Collaboration with researchers and engineers in AI robotics"}
            ],
            "innovations": [
                {
                    "id": "innovation-1",
                    "title": "HealthBot Assistant",
                    "description": "An AI-powered robotic assistant for hospitals that automates routine tasks and enhances patient care. Implemented in 5 major hospitals, reducing nurse workload by 30%.",
                    "status": "In Progress",
                    "technologies": ["ROS", "Python", "TensorFlow", "Computer Vision", "Sensor Fusion"]
                },
                {
                    "id": "innovation-2",
                    "title": "Smart Manufacturing Automation",
                    "description": "Developed an intelligent robotic system for flexible manufacturing, featuring real-time adaptation and collaborative robot-human workflows.",
                    "status": "Completed",
                    "technologies": ["Industrial IoT", "Machine Learning", "Robot Operating System", "C++"]
                }
            ],
            "packages": []
        },
        "janedoe": {
            "fullName": "Jane Doe",
            "industry": "UX Design",
            "skills": ["Figma", "User Research", "Prototyping"],
            "bio": "UX designer focused on creating intuitive and accessible digital experiences.",
            "education": [{
                "institution": "Design Academy",
                "degree": "BA Design",
                "startDate": "2019",
                "endDate": "2019"
            }],
            "certifications": [{"name": "Certified UX Professional", "issuer": "UX Alliance", "year": "2020"}],
            "infoItems": [
                {"title": "Looking for", "content": "Collaboration with developers and product managers"},
            ],
            "innovations": [
                {
                    "id": "innovation-1",
                    "title": "Accessibility Design System",
                    "description": "A comprehensive design system focused on making web applications more accessible to all users.",
                    "status": "In Progress",
                    "technologies": ["Figma", "WCAG", "Design Systems"]
                }
            ],
            "packages": []
        },
        "sarahwilliams": {
            "fullName": "Sarah Williams",
            "industry": "Sustainable Energy",
            "skills": ["Solar Technology", "Energy Storage", "Project Management", "Electrical Engineering"],
            "bio": "Clean energy entrepreneur focused on affordable solar solutions for developing regions. Founded SunLink, bringing renewable energy to 50+ communities across Africa and Southeast Asia.",
            "education": [
                {
                    "institution": "University of California, Berkeley",
                    "degree": "MS in Energy Engineering",
                    "startDate": "2017",
                    "endDate": "2019"
                }
            ],
            "certifications": [
                {"name": "PMP Certification", "issuer": "PMI", "year": "2021"},
                {"name": "Solar Energy International - Advanced Solar Training", "issuer": "SEI", "year": "2020"}
            ],
            "infoItems": [
                {"title": "Looking for", "content": "Impact investors and technical partners for scaling renewable energy projects"}
            ],
            "innovations": [
                {
                    "id": "innovation-1",
                    "title": "MicroGrid-in-a-Box",
                    "description": "Portable, rapidly deployable solar microgrid system for disaster relief and remote communities.",
                    "status": "In Progress",
                    "technologies": ["Solar PV", "Battery Storage", "IoT Monitoring", "Mobile Payment Integration"]
                }
            ],
            "packages": []
        }
    }

    # SECURITY: the key comes from the environment only. A credential must
    # never be embedded in source as a default: it leaks through version
    # control and shared notebooks. Any key that was previously committed
    # here should be revoked.
    api_key = os.environ.get("GEMINI_API_KEY")

    # Set the number of matches to find per profile
    num_matches = 2

    # Flag to process only one profile at a time (helps with quota issues)
    process_single = True  # Set to False to process all profiles
    selected_profile_id = "johnsmith"  # Change this to the profile you want to process

    target_ids = [selected_profile_id] if process_single else list(profiles)

    def fallback_for(user_ids):
        # Score against the FULL profile pool and then keep only the requested
        # users. Passing just the requested users to fallback_matching would
        # leave them with no candidates and therefore no matches.
        pool_matches = fallback_matching(profiles, num_matches=num_matches)
        return {user_id: pool_matches[user_id] for user_id in user_ids}

    if not api_key:
        print("GEMINI_API_KEY is not set; skipping Gemini.")
        print("Falling back to non-AI matching algorithm for all profiles...")
        matches = fallback_for(target_ids)
    else:
        # Try to use Gemini API for matching
        try:
            if process_single:
                print(f"Processing single profile: {profiles[selected_profile_id]['fullName']}...")
                matches = process_single_profile(
                    selected_profile_id,
                    profiles[selected_profile_id],
                    profiles,
                    api_key,
                    num_matches
                )
            else:
                print("Starting profile matching using Gemini AI for all profiles...")
                matches = process_all_profiles(profiles, api_key, num_matches=num_matches)

            # Check if any profiles have empty matches and use fallback for those
            empty_match_profiles = [user_id for user_id, user_matches in matches.items() if not user_matches]

            if empty_match_profiles:
                print(f"\nFound {len(empty_match_profiles)} profiles with no matches. Using fallback for these profiles...")
                fallback_matches = fallback_for(empty_match_profiles)

                # Merge the fallback matches with the AI matches
                for user_id, user_matches in fallback_matches.items():
                    matches[user_id] = user_matches
                    print(f"Added fallback matches for {profiles[user_id]['fullName']}")

        except Exception as e:
            print(f"Error using Gemini API with model gemini-2.0-pro-exp-02-05: {e}")
            print("Falling back to non-AI matching algorithm for all profiles...")
            matches = fallback_for(target_ids)

    # Save matches to JSON file
    save_matches_to_json(matches)

    # Display matches in console
    display_matches(matches)

    print("\nMatching process completed!")
    print("Note: If you experienced quota errors, consider:")
    print("  1. Setting process_single=True to process only one profile at a time")
    print("  2. Waiting several minutes or hours before trying again")
    print("  3. Using a different API key with higher quota limits")
    print("  4. Experimenting with the gemini-1.5-pro model which may have different quota limits")

if __name__ == "__main__":
    main()

Processing single profile: Alex Chen...
Using model: gemini-2.0-pro-exp-02-05
Finding matches for Alex Chen...
Processing match 1/2: Alex Chen with Jane Doe
Waiting 9.31s before processing next match pair...
Processing match 2/2: Alex Chen with Sarah Williams
Match results saved to profile_matches.json

Matches for johnsmith:

1. Jane Doe (UX Design)
   Match Score: 65
   Reasoning: Alex Chen and Jane Doe have a moderate match potential. While their primary industries (AI & Robotics vs. UX Design) are different, there's significant potential for collaboration where robotics and user experience intersect, particularly in the area of human-robot interaction.  Alex's focus on autonomous systems and robotics, combined with Jane's expertise in creating intuitive and accessible digital experiences, could lead to innovative solutions.  The match isn't perfect due to the lack of direct skill overlap, but the potential for synergistic innovation is present.

   Collaboration Opportunities:
   [