In [5]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import os

# Fetch the NLTK packages this cell relies on, one after another in a fixed order.
for _package in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_package)

# Lookup sets consulted by the token filter in preprocess_text.
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Return a lower-cased, space-joined version of *text* without noise tokens.

    The text is lower-cased and split with ``word_tokenize``; tokens found in
    the module-level ``stop_words`` set and tokens made up entirely of
    characters from the module-level ``punctuation`` set are discarded.

    Args:
        text: The raw text to clean. Any non-string value (for example a
            float NaN coming from a DataFrame cell) is treated as empty.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when *text*
        is not a string or no token survives the filters.
    """
    if not isinstance(text, str):
        return ""

    kept = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        # Compare character by character rather than the whole token:
        # the tokenizer emits multi-character punctuation tokens such as
        # "``", "''", "..." and "--", which a plain `token in punctuation`
        # test lets through. Mixed tokens like "3+" or "full-stack" are kept.
        if all(char in punctuation for char in token):
            continue
        kept.append(token)
    return " ".join(kept)

# Load raw data
input_path = r"D:\Python\ResuMetrics\data\training_data\resume_training.csv"
output_path = r"D:\Python\ResuMetrics\data\training_data\resume_training_processed.csv"
df = pd.read_csv(input_path)

# Remove rows that have a missing value in any column
df = df.dropna()

# Preprocess text
text_columns = ['resume_text', 'job_description']
for column in text_columns:
    df[column] = df[column].apply(preprocess_text)

# Drop rows whose text is empty after preprocessing. preprocess_text returns ""
# for non-string cells and for text with no surviving tokens; to_csv writes ""
# as an empty field, which read_csv loads back as NaN by default, so keeping
# such rows would put missing values into the saved file.
has_text = df[text_columns].ne("").all(axis=1)
df = df[has_text]

# Validate labels: keep only rows labelled 0 or 1
df = df[df['label'].isin([0, 1])]

# Save processed data (create the target directory first if it is missing)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

# Verify
print("Processed Data:")
print(df.head())
print("\nLabel Distribution:")
print(df['label'].value_counts())
print("\nMissing Values:")
print(df.isnull().sum())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Processed Data:
                                         resume_text  \
0  senior software engineer 6 years java cloud pl...   
1  data analyst 2 years experience sql tableau pr...   
2  full-stack engineer 5 years react nodejs mongo...   
3  finance manager 7 years corporate finance expe...   
4  sales data analyst 4 years sql looker salesfor...   

                                     job_description  label  
0  sr. software engineer cloud services ncr voyix...      1  
1  senior data engineer quantexa requiring 3+ yea...      0  
2  senior software engineer fullstack zeal requir...      1  
3  finance business analyst apex fintech solution...      0  
4  sales data analyst madhive requiring 3+ years ...      1  

Label Distribution:
label
1    121
0     45
Name: count, dtype: int64

Missing Values:
resume_text        0
job_description    0
label              0
dtype: int64
