In [13]:
import re
import json
import requests
import pdfplumber
import pandas as pd
import snowflake.connector  # Import Snowflake connector
from snowflake.connector import DictCursor
from collections import Counter
from datetime import datetime

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pdf_text = ''
        for page in pdf.pages:
            pdf_text += page.extract_text()
    return pdf_text

# Function to parse resume content
def parse_resume(ocr_text):
    lines = ocr_text.split('\n')
    name = ''
    email = ''
    phone = ''
    dob = ''
    experience = ''
    current_company = ''
    college = ''
    skills = []
    in_skills_section = False
    phone_pattern = r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]'
    dob_pattern = r'\d{1,2}-\d{1,2}-\d{4}'
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    for line in lines:
        line = line.strip()

        if not phone:
            match = re.search(phone_pattern, line)
            if match:
                phone = match.group(0)

        if not email:
            email_match = re.search(email_pattern, line)
            if email_match:
                email = email_match.group(0)

        if not dob and ('DOB' in line or 'Date of Birth' in line):
            dob_match = re.search(dob_pattern, line)
            if dob_match:
                dob_value = dob_match.group(0)
                try:
                    dob = datetime.strptime(dob_value, '%d-%m-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    dob = ''

        if 'Experience' in line or 'Years of Experience' in line:
            experience = line

        if 'Current Company' in line or 'Company' in line:
            current_company = line

        if 'College' in line or 'University' in line:
            college = line

        if 'Skills' in line or 'Technical Skills' in line:
            in_skills_section = True
            continue

        if line and not name:
            name = line

        if in_skills_section:
            if line:
                skills.append(line.strip())

    return name, email, phone, dob, experience, current_company, college, skills


def count_words(ocr_text):
    words = ocr_text.split()
    return len(words)


def most_common_words(ocr_text, num_common=2):
    words = re.findall(r'\b\w+\b', ocr_text.lower())
    common_words = Counter(words).most_common(num_common)
    return common_words

# Function to send request to Gemini API
def send_request_to_gemini(prompt):
    gemini_url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent?key=AIzaSyDdQoG3AzuDPkO1o-rxAGmLmQbpIOqo-As'
    headers = {
        'Content-Type': 'application/json',
    }
    try:
        response = requests.post(gemini_url, headers=headers, json=prompt)
        if response.status_code == 200:
            parsed_data = response.json()
            if 'text' in parsed_data and parsed_data['text']:
                generated_text = parsed_data['text']
            else:
                generated_text = None
            return generated_text
        else:
            print(f"Request failed with status code {response.status_code}")
            print(response.text)
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error with API request: {e}")
        return None


def insert_into_snowflake(df):
    
    conn_params = {
        'account': 'akehzhr-rt23734',
        'user': 'JERISH',
        'password': 'Mahendrasingh@7',
        'warehouse': 'COMPUTE_WH',
        'database': 'EMPLOYEDB',
        'schema': 'EMPLOYESCHEMA'
    }

    conn = None
    try:
        
        conn = snowflake.connector.connect(
            user=conn_params['user'],
            password=conn_params['password'],
            account=conn_params['account'],
            warehouse=conn_params['warehouse'],
            database=conn_params['database'],
            schema=conn_params['schema']
        )

        cur = conn.cursor()

       
        for index, row in df.iterrows():
            insert_query = """
                INSERT INTO resumes (name, email, phone, dob, college)
                VALUES (%s, %s, %s, %s, %s)
            """
            dob_value = row['DOB'] if row['DOB'] else None
            cur.execute(insert_query, (
                row['Name'],
                row['Email'],
                row['Phone'],
                dob_value,
                # row['Skills'],
                row['College']
            ))

        conn.commit()
        print("Data inserted successfully into Snowflake")

    except snowflake.connector.Error as e:
        print(f"Error inserting data into Snowflake: {e}")

    finally:
        if conn:
            conn.close()

# Main function
def main():
    pdf_file_path = 'Resume.pdf'
    ocr_text = extract_text_from_pdf(pdf_file_path)
    name, email, phone, dob, experience, current_company, college, skills = parse_resume(ocr_text)
    total_words = count_words(ocr_text)
    common_words = most_common_words(ocr_text)

    prompt = {
        "contents": [
            {
                "parts": [
                    {
                        "text": f"Given the resume, fetch the name: {name}, email: {email}, phone: {phone}, dob: {dob}, experience: {experience}, current company: {current_company}, college: {college}, top 5 skills: {', '.join(skills)}, vertica as one of Full stack, Data Engineering, Dev Ops, Manual Testing, Automation."
                    }
                ]
            }
        ]
    }

    prompt_json = json.dumps(prompt)
    generated_text = send_request_to_gemini(prompt)
    df = pd.DataFrame({
        'Name': [name],
        'Email': [email],
        'Phone': [phone],
        'DOB': [dob],
        'Experience': [experience],
        'Current Company': [current_company],
        'College': [college],
        'Skills': [skills],
        'Generated Text': [generated_text],
        'Total Words': [total_words],
        'Most Common Words': [common_words]
    })

    print("DataFrame created:")
    print(df)

    insert_into_snowflake(df)

if __name__ == "__main__":
    main()

DataFrame created:
           Name                     Email           Phone DOB  \
0  Rajesh Kumar  rajesh.kumar@example.com  +91 9876543210       

                                          Experience Current Company  \
0  • Experienced in using industry-standard tools...                   

                                 College  \
0  ABC College of Engineering, Bangalore   

                                              Skills Generated Text  \
0  [• Proficient in test planning, test case desi...           None   

   Total Words     Most Common Words  
0          211  [(in, 10), (and, 8)]  
Data inserted successfully into Snowflake


In [1]:
!pip install snowflake-connector-python

Collecting snowflake-connector-python
  Downloading snowflake_connector_python-3.11.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.6/63.6 kB[0m [31m643.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting asn1crypto<2.0.0,>0.24.0 (from snowflake-connector-python)
  Downloading asn1crypto-1.5.1-py2.py3-none-any.whl.metadata (13 kB)
Collecting filelock<4,>=3.5 (from snowflake-connector-python)
  Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting sortedcontainers>=2.4.0 (from snowflake-connector-python)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting tomlkit (from snowflake-connector-python)
  Downloading tomlkit-0.12.5-py3-none-any.whl.metadata (2.7 kB)
Downloading snowflake_connector_python-3.11.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━