In [28]:
import json
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [29]:
import json

file_path = "Entity Recognition in Resumes.json"

# The file is read as JSON Lines: every non-empty line is parsed as one JSON
# document. Blank lines (e.g. a trailing newline at the end of the file) are
# skipped, because json.loads() raises JSONDecodeError on an empty string.
with open(file_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

len(data)



220

In [30]:
def clean_text(text):
    """Normalise raw text for TF-IDF: lower-case ASCII letters separated by single spaces.

    Newlines and every character that is not an ASCII letter or a space are
    replaced by a space, then runs of whitespace collapse to one space.
    Leading/trailing spaces are not stripped.
    """
    normalized = text.lower()
    # Applied in order: newlines, then non-letters, then whitespace runs.
    for pattern in (r'\n', r'[^a-zA-Z ]', r'\s+'):
        normalized = re.sub(pattern, ' ', normalized)
    return normalized

# Pull the raw resume text out of every loaded record, then normalise each one
# with clean_text so all documents share the same character set.
resumes = [record["content"] for record in data]
clean_resumes = list(map(clean_text, resumes))

len(clean_resumes)



220

In [31]:
# Target role description, cleaned with the same routine as the resumes so both
# sides are compared on identically normalised text.
job_description = clean_text("""
User Acquisition Intern with skills in:
social media marketing, instagram growth, community building,
basic data analysis, excel or sql, communication skills,
startup mindset and growth hacking.
""")



In [32]:
# The job description is appended as the final document so that it is embedded
# in the same TF-IDF vocabulary as the resumes.
documents = [*clean_resumes, job_description]

vectorizer_sim = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix_sim = vectorizer_sim.fit_transform(documents)

# Rows [:-1] are the resumes; row [-1] is the job description.
similarity_scores = cosine_similarity(tfidf_matrix_sim[:-1], tfidf_matrix_sim[-1]).flatten()

len(similarity_scores)

220

In [33]:
# Pseudo-labels derived from the similarity scores: a resume at or above the
# 70th percentile of similarity to the job description is class 1, otherwise 0.
threshold = np.percentile(similarity_scores, 70)

labels = (similarity_scores >= threshold).astype(int)

np.unique(labels, return_counts=True)


(array([0, 1]), array([154,  66]))

In [34]:
# Sanity check that the variables produced by the earlier cells exist in this kernel.
required_names = ("clean_resumes", "job_description", "similarity_scores")
print(" | ".join(
    f"{name}: {'yes' if name in globals() else 'no'}" for name in required_names
))


clean_resumes: yes | job_description: yes | similarity_scores: yes


In [35]:
from sklearn.model_selection import train_test_split

X, y = clean_resumes, labels

# 80/20 hold-out split, stratified on the labels and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

len(X_train), len(X_test)


(176, 44)

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, capped at 8000 terms, English stop words removed.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=8000, stop_words="english")

# Fit on the training split only; the test split is transformed with the
# vocabulary learned from training data.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [37]:
from sklearn.linear_model import LogisticRegression

# class_weight="balanced" is used because the labels are roughly 70/30 by construction.
model = LogisticRegression(class_weight="balanced", max_iter=1000)

model.fit(X_train_vec, y_train)


In [38]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

y_pred = model.predict(X_test_vec)

# Each metric is computed on the held-out split and printed under its heading.
held_out_metrics = (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
)
for heading, metric in held_out_metrics:
    print(heading, metric(y_test, y_pred))


Accuracy: 0.75

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.90      0.84        31
           1       0.62      0.38      0.48        13

    accuracy                           0.75        44
   macro avg       0.70      0.64      0.66        44
weighted avg       0.73      0.75      0.73        44


Confusion Matrix:
 [[28  3]
 [ 8  5]]


In [39]:
def predict_resume_fit(resume_text):
    """Return the model's class-1 ("fit") probability for one raw resume string.

    The text is cleaned with clean_text and transformed with the already
    fitted `vectorizer` before being scored by `model`.
    """
    features = vectorizer.transform([clean_text(resume_text)])
    class_probabilities = model.predict_proba(features)[0]
    return class_probabilities[1]


In [40]:
# Demo: score the first resume of the dataset (raw text; cleaning happens inside).
first_resume = resumes[0]
score = predict_resume_fit(first_resume)
print(f"Jinino Fit Probability: {score:.2f}")


Jinino Fit Probability: 0.58


In [41]:
from sklearn.metrics import accuracy_score

# Recompute the hold-out predictions and show plain accuracy as the cell output.
y_pred = model.predict(X_test_vec)

accuracy_score(y_true=y_test, y_pred=y_pred)


0.75

In [42]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the held-out split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.78      0.90      0.84        31
           1       0.62      0.38      0.48        13

    accuracy                           0.75        44
   macro avg       0.70      0.64      0.66        44
weighted avg       0.73      0.75      0.73        44



In [43]:
# Eyeball the first three held-out resumes: predicted class, class-1
# probability, and the first 300 characters of the cleaned text.
for row in range(3):
    sample = X_test_vec[row:row + 1]
    print("Predicted:", model.predict(sample)[0])
    print("Probability:", model.predict_proba(sample)[0][1])
    print(X_test[row][:300])
    print("-" * 50)


Predicted: 0
Probability: 0.4183492761431814
koushik katta devops hyderabad telangana email me on indeed indeed com r koushik katta a b ec devops administrator with an experience of years working in a challenging agile environment looking forward for a position where i can use my knowledge pursuing my domain interests i m more aligned to work 
--------------------------------------------------
Predicted: 0
Probability: 0.4367910010793647
puran mal jaipur rajasthan email me on indeed indeed com r puran mal ea b b be work experience admin assistant at infosys limited front office work education bachelor s skills front office executive year additional information technical skill basic knowledge of computer operating ms office ms excel 
--------------------------------------------------
Predicted: 0
Probability: 0.4143451997270892
madhava konjeti hr executive bengaluru karnataka email me on indeed indeed com r madhava konjeti a f ace c to contribute to the team success by working in a posi

In [44]:
!pip install ipywidgets PyPDF2





In [45]:
import ipywidgets as widgets
from IPython.display import display
import PyPDF2


In [46]:
from io import BytesIO
import PyPDF2

def predict_uploaded_pdf(uploaded_file):
    """Score a PDF resume selected through an ipywidgets ``FileUpload`` widget.

    Args:
        uploaded_file: Widget-like object whose ``value`` holds the upload(s),
            either as a dict of entries or as a tuple/list of entries. The
            first entry is used and must expose the raw PDF bytes under the
            ``'content'`` key.

    Returns:
        The class-1 probability from ``model.predict_proba``, or ``None``
        (after printing a message) when nothing was uploaded or no text could
        be extracted from the PDF.
    """
    uploads = uploaded_file.value
    if not uploads:
        print("❌ No file detected")
        return None

    # Handle BOTH the dict and the tuple format of `value`.
    if isinstance(uploads, dict):
        uploaded_data = next(iter(uploads.values()))
    else:
        uploaded_data = uploads[0]

    # Read the PDF straight from the uploaded bytes.
    reader = PyPDF2.PdfReader(BytesIO(uploaded_data['content']))

    # Join pages with a newline so the last word of one page and the first
    # word of the next are not fused into a single token. Pages that yield no
    # text are dropped.
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(chunk for chunk in page_texts if chunk)

    if not text.strip():
        print("❌ Could not extract text from PDF")
        return None

    # Vectorize with the already fitted vectorizer (DO NOT FIT again).
    vec = vectorizer.transform([clean_text(text)])

    return model.predict_proba(vec)[0][1]


In [47]:
# Single-file upload control restricted to PDFs; pick a file before running the next cell.
upload = widgets.FileUpload(multiple=False, accept='.pdf')
display(upload)


FileUpload(value=(), accept='.pdf', description='Upload')

In [48]:
# Score whatever is currently selected in the widget; the result is None when
# no file has been chosen yet.
score = predict_uploaded_pdf(uploaded_file=upload)
score


❌ No file detected


In [49]:
def jinino_decision(prob, threshold=0.75):
    """Turn a fit probability into the verdict text shown to the user.

    Args:
        prob: Probability of the "fit" class, or None when no resume could be
            scored.
        threshold: Minimum probability (inclusive) that counts as a fit.
            Defaults to 0.75.

    Returns:
        str: A "no resume" message for None, otherwise the FIT / NOT A FIT verdict.
    """
    if prob is None:
        return "❌ No resume uploaded"
    if prob >= threshold:
        return "✅ FIT FOR JININO"
    return "❌ NOT A FIT FOR JININO"


## Making the website (Streamlit app)

In [50]:
import pickle

# Persist the trained classifier and its fitted vectorizer, one pickle file each,
# so the web app can load them without retraining.
for artifact, target in ((model, "model.pkl"), (vectorizer, "vectorizer.pkl")):
    with open(target, "wb") as f:
        pickle.dump(artifact, f)


In [51]:
pip install streamlit PyPDF2 scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [58]:
import streamlit as st
import pickle
import re
from io import BytesIO
import PyPDF2

# ---------- Load ML artifacts ----------
# Both files are pickles; only load artifacts that were written by a trusted
# source, because unpickling can execute arbitrary code.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

with open("vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# ---------- Text cleaning ----------
def clean_text(text):
    """Lower-case the text and reduce it to ASCII letters separated by single spaces.

    Newlines and all characters other than ASCII letters/spaces become spaces;
    whitespace runs are then collapsed. The result is not stripped.
    """
    without_newlines = re.sub(r'\n', ' ', text.lower())
    letters_and_spaces = re.sub(r'[^a-zA-Z ]', ' ', without_newlines)
    return re.sub(r'\s+', ' ', letters_and_spaces)

# ---------- Decision logic ----------
def jinino_decision(prob, threshold=0.75):
    """Turn a fit probability into the verdict text shown in the app.

    Args:
        prob: Probability of the "fit" class, or None when no resume could be
            scored.
        threshold: Minimum probability (inclusive) that counts as a fit.
            Defaults to 0.75.

    Returns:
        str: A "no resume" message for None, otherwise the FIT / NOT A FIT verdict.
    """
    if prob is None:
        return "❌ No resume uploaded"
    if prob >= threshold:
        return "✅ FIT FOR JININO"
    return "❌ NOT A FIT FOR JININO"

# ---------- UI ----------
st.set_page_config(page_title="Jinino Resume Screener", page_icon="📄")

st.title("📄 Jinino Resume Screening System")
st.write("Upload a resume PDF to check if the candidate is a good fit for Jinino.")

uploaded_file = st.file_uploader("Upload Resume (PDF only)", type=["pdf"])

if uploaded_file is not None:
    if st.button("Predict Fit"):
        try:
            # getvalue() returns the complete upload regardless of the current
            # read position of the file object.
            reader = PyPDF2.PdfReader(BytesIO(uploaded_file.getvalue()))
            page_texts = [page.extract_text() for page in reader.pages]
        except Exception as e:
            # Unreadable PDFs are reported in the page instead of surfacing
            # as a raw traceback.
            st.error(f"Could not read the PDF: {e}")
        else:
            # Join pages with a newline so words at page boundaries are not
            # fused into one token; pages without text are dropped.
            text = "\n".join(chunk for chunk in page_texts if chunk)

            if not text.strip():
                st.error("Could not extract text from PDF.")
            else:
                cleaned = clean_text(text)
                vec = vectorizer.transform([cleaned])
                prob = model.predict_proba(vec)[0][1]
                verdict = jinino_decision(prob)

                st.subheader("🔍 Result")
                st.write(f"**Fit Score:** `{prob:.2f}`")
                # jinino_decision only returns text, so the banner colour is
                # picked from it: a rejection should not be shown in a green
                # success box.
                show_verdict = st.error if "NOT A FIT" in verdict else st.success
                show_verdict(verdict)




In [53]:
streamlit run app.py


SyntaxError: invalid syntax (507122745.py, line 1)

In [None]:
import pickle

# Write the classifier first, then the vectorizer, each to its own pickle file.
with open("model.pkl", mode="wb") as model_out:
    pickle.dump(model, model_out)

with open("vectorizer.pkl", mode="wb") as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)


In [None]:
# Re-save only the classifier; relies on `pickle` and `model` from earlier cells.
with open("model.pkl", mode="wb") as model_out:
    pickle.dump(model, model_out)


In [54]:
import os
# Show the kernel's working directory, i.e. where relative paths such as
# "model.pkl" are written.
os.getcwd()


'/Users/user'

In [55]:
import pickle

# Same export as before: classifier and vectorizer pickled to the working directory.
for artifact, target in ((model, "model.pkl"), (vectorizer, "vectorizer.pkl")):
    with open(target, "wb") as f:
        pickle.dump(artifact, f)



In [56]:
import os
# List the working directory to check that the pickle files were written.
# NOTE(review): the saved output exposes the contents of a home directory;
# consider clearing it before sharing the notebook.
os.listdir()


['speech recognition.ipynb',
 'Untitled10.ipynb',
 'Untitled7.ipynb',
 'Hrishikesh Resume.pdf',
 '.config',
 'Music',
 'Expanded_data_with_more_features.csv',
 '.condarc',
 '.docker',
 '.gemini',
 'Untitled12.ipynb',
 'Mall_Customers.csv',
 'Untitled5.ipynb',
 'python basics.ipynb',
 'Sonar Rock vs mine prediction.ipynb',
 'Films.csv',
 'Untitled1.ipynb',
 'Book 2(Sheet1).csv',
 'Untitled16.ipynb',
 '.DS_Store',
 'intro to matplotlib.ipynb',
 'Creating Data Frames.ipynb',
 '.CFUserTextEncoding',
 'Project 13. Customer Segmentation using K-Means Clustering with Python | Machine Learning Projects.ipynb',
 'dogs_only',
 'Car Prediction Model.ipynb',
 'gold price prediction.ipynb',
 '.xonshrc',
 'indexing.ipynb',
 'anaconda_projects',
 'Untitled3.ipynb',
 'Untitled14.ipynb',
 'Untitled.ipynb',
 'car data.csv',
 'heart_disease_data.csv',
 '.zshrc',
 'archive (4).zip',
 'Music taste prediction model.ipynb',
 'Untitled4.ipynb',
 '.streamlit',
 'Untitled13.ipynb',
 'Untitled6.ipynb',
 'Picture

In [57]:
import streamlit as st

# Minimal Streamlit page used to check that Streamlit renders at all.
# NOTE(review): presumably meant to live in a script launched with
# `streamlit run` rather than in a notebook cell - confirm.
st.title("Jinino Resume Screening App")
st.write("If you can see this, Streamlit is working.")






In [59]:
import streamlit as st
import pickle
import re
from io import BytesIO
import PyPDF2

st.set_page_config(page_title="Jinino Resume Screener", layout="centered")

# ---------- UI HEADER ----------
st.title("📄 Jinino Resume Screening System")
st.write("Upload a resume PDF to check if the candidate is a fit for Jinino.")

# ---------- SAFE MODEL LOADING ----------
@st.cache_resource
def load_model():
    """Load the pickled classifier and vectorizer and return them as ``(model, vectorizer)``.

    The files are read in that order from the working directory. Only load
    pickles from a trusted source: unpickling can execute arbitrary code.
    """
    artifacts = []
    for filename in ("model.pkl", "vectorizer.pkl"):
        with open(filename, "rb") as handle:
            artifacts.append(pickle.load(handle))
    return tuple(artifacts)

try:
    model, vectorizer = load_model()
    st.success("Model loaded successfully")
except Exception as e:
    # Nothing below can work without the artifacts, so report the failure in
    # the page and stop the script here.
    st.error(f"❌ Model loading failed: {e}")
    st.stop()

# ---------- TEXT CLEANING ----------
def clean_text(text):
    """Return `text` lower-cased, with only ASCII letters kept and single spaces between them.

    Every newline and every character outside a-z/A-Z/space is turned into a
    space before whitespace runs are collapsed; no stripping is performed.
    """
    cleaned = text.lower()
    substitutions = (
        r'\n',          # newlines
        r'[^a-zA-Z ]',  # digits, punctuation, non-ASCII characters, ...
        r'\s+',         # collapse the resulting whitespace runs
    )
    for pattern in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

# ---------- DECISION ----------
def jinino_decision(prob, threshold=0.75):
    """Turn a fit probability into the verdict text shown in the app.

    Args:
        prob: Probability of the "fit" class, or None when no resume could be
            scored.
        threshold: Minimum probability (inclusive) that counts as a fit.
            Defaults to 0.75.

    Returns:
        str: A "no resume" message for None, otherwise the FIT / NOT A FIT verdict.
    """
    if prob is None:
        return "❌ No resume uploaded"
    return "✅ FIT FOR JININO" if prob >= threshold else "❌ NOT A FIT FOR JININO"

# ---------- FILE UPLOAD ----------
uploaded_file = st.file_uploader("Upload Resume (PDF only)", type=["pdf"])

if uploaded_file is not None:
    if st.button("Predict Fit"):
        try:
            # getvalue() returns the complete upload regardless of the current
            # read position of the file object.
            reader = PyPDF2.PdfReader(BytesIO(uploaded_file.getvalue()))

            # extract_text() is called once per page. Pages are joined with a
            # newline so words at page boundaries are not fused into one
            # token; pages without text are dropped.
            page_texts = [page.extract_text() for page in reader.pages]
            text = "\n".join(chunk for chunk in page_texts if chunk)

            if not text.strip():
                st.error("Could not extract text from the PDF.")
                st.stop()

            cleaned = clean_text(text)
            vec = vectorizer.transform([cleaned])
            prob = model.predict_proba(vec)[0][1]
            verdict = jinino_decision(prob)

            st.subheader("🔍 Result")
            st.write(f"**Fit Score:** `{prob:.2f}`")
            # jinino_decision only returns text, so the banner colour is picked
            # from it: a rejection should not be shown in a green success box.
            show_verdict = st.error if "NOT A FIT" in verdict else st.success
            show_verdict(verdict)

        except Exception as e:
            st.error(f"Prediction error: {e}")


