In [None]:

import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
import keras_tuner as kt  # Updated import for Keras Tuner
from tensorflow.keras.callbacks import EarlyStopping



In [None]:
# Load the resume dataset; a missing file is reported and the error re-raised
try:
    data = pd.read_csv('resumes_dataset.csv', encoding='utf-8')
except FileNotFoundError:
    print("File not found. Please check the file path.")
    raise
else:
    # Only reached when read_csv returned without raising
    print("Data loaded successfully.")



In [None]:
# Inspect data: first rows, column summary and descriptive statistics
print("Data overview:")
print(data.head())
print("\nData info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
data.info()
print("\nData description:")
print(data.describe())

# Check for missing values (count of nulls per column)
print("\nMissing values:")
print(data.isnull().sum())

# Check for duplicates (number of fully duplicated rows)
print("\nDuplicate records:")
print(data.duplicated().sum())

# Remove duplicates
data.drop_duplicates(inplace=True)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of resumes per category; each bar is annotated
# with its own height (the count) slightly above and to the right of its origin.
plt.figure(figsize=(20,5))
plt.xticks(rotation=90)
ax = sns.countplot(x="Category", data=data)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() * 1.01, bar_height * 1.01)
    ax.annotate(str(bar_height), label_position)
plt.grid()



In [None]:
from matplotlib.gridspec import GridSpec

# value_counts() returns the categories ordered by frequency, whereas
# unique() returns them in order of first appearance. Taking the labels
# from the counts' own index guarantees every slice gets the right name.
targetCounts = data['Category'].value_counts()
targetLabels = targetCounts.index
# Make square figures and axes
plt.figure(1, figsize=(22,22))
the_grid = GridSpec(2, 2)


cmap = plt.get_cmap('coolwarm')
# Draw the pie in the top-right cell of the 2x2 grid with a 1:1 aspect ratio
plt.subplot(the_grid[0, 1], aspect=1, title='CATEGORY DISTRIBUTION')

source_pie = plt.pie(targetCounts, labels=targetLabels, autopct='%1.1f%%', shadow=True)
plt.show()




In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import string

# Data cleaning function
def clean_text(text):
    """Normalise one resume string before it is fed to the tokenizer.

    Digits are removed, every run of non-word characters is replaced by a
    single space, the text is lower-cased, split with NLTK's
    ``word_tokenize`` and English stop words are dropped.

    Args:
        text: Raw resume text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Load the stop-word list once per call instead of once per token, and
    # keep it in a set so each membership test is a constant-time lookup.
    english_stopwords = set(stopwords.words('english'))

    text = re.sub(r'\d+', '', text)       # Remove numbers
    text = re.sub(r'\W+', ' ', text)      # Remove special characters
    text = text.lower()                   # Convert to lowercase
    words = word_tokenize(text)           # Tokenize the text
    filtered_words = [word for word in words if word not in english_stopwords]
    return ' '.join(filtered_words)

# Apply cleaning to the "Resume" column
data['cleaned_resume'] = data['Resume'].apply(clean_text)

# WordCloud.generate() expects one string, so concatenate every cleaned
# resume into a single corpus (the name used here was previously undefined).
cleaned_sentences = ' '.join(data['cleaned_resume'])

# Generate and display the word cloud
wc = WordCloud(width=800, height=400, background_color='white').generate(cleaned_sentences)
plt.figure(figsize=(15, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


In [None]:
# Turn the cleaned resumes into fixed-length integer sequences and
# the category column into a one-hot target matrix.
max_sequence_length = 200

# Vocabulary is capped at the 5000 most frequent words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['cleaned_resume'])

# Encode, then pad/truncate every sequence to max_sequence_length
X = pad_sequences(
    tokenizer.texts_to_sequences(data['cleaned_resume']),
    maxlen=max_sequence_length,
)

# One column per category
y = pd.get_dummies(data['Category']).to_numpy()



In [None]:
# Model factory handed to Keras Tuner: one call builds one trial's model
def build_model(hp):
    """Build and compile the stacked BiLSTM -> LSTM classifier for a trial.

    Args:
        hp: Hyperparameter container exposing ``Int`` and ``Float``; it
            supplies the embedding size, LSTM width, dropout rate and
            learning rate for this trial.

    Returns:
        A compiled ``Sequential`` model whose softmax output has
        ``y.shape[1]`` units.
    """
    # Resolve every tunable value up front; both recurrent layers share
    # 'lstm_units' and both dropout layers share 'dropout_rate'.
    embedding_dim = hp.Int('embedding_dim', min_value=50, max_value=300, step=50)
    lstm_units = hp.Int('lstm_units', min_value=32, max_value=128, step=32)
    dropout_rate = hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)
    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')

    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=embedding_dim, input_length=max_sequence_length))
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(y.shape[1], activation='softmax'))

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Configure the Hyperband search that will drive build_model
hyperband_settings = dict(
    objective='val_accuracy',
    max_epochs=10,
    hyperband_iterations=2,
    directory='kt_dir',
    project_name='resume_classifier',
    overwrite=True,
)
tuner = kt.Hyperband(build_model, **hyperband_settings)


# Split data into train and test sets. Nothing earlier in run order defines
# these names, so without this the search below fails with a NameError on a
# fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform the hyperparameter search.
# Model selection is scored on a slice held out from the *training* data
# (validation_split), so the test set is not used to pick hyperparameters
# and the accuracy reported below is not inflated by selection on it.
tuner.search(X_train, y_train, epochs=10, validation_split=0.2)

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Evaluate the best model on the untouched test set
loss, accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
# Define the model building function using the best hyperparameters
def build_best_model(hp):
    """Build and compile the classifier from a fixed set of hyperparameters.

    Args:
        hp: Hyperparameter container whose ``get(name)`` returns the chosen
            value for 'embedding_dim', 'lstm_units', 'dropout_rate' and
            'learning_rate' (e.g. the object returned by
            ``tuner.get_best_hyperparameters``).

    Returns:
        A compiled, *untrained* ``Sequential`` model whose softmax output
        has ``y.shape[1]`` units.
    """
    model = Sequential([
        Embedding(input_dim=5000, output_dim=hp.get('embedding_dim'), input_length=max_sequence_length),
        Bidirectional(LSTM(hp.get('lstm_units'), return_sequences=True)),
        Dropout(hp.get('dropout_rate')),
        LSTM(hp.get('lstm_units')),
        Dropout(hp.get('dropout_rate')),
        Dense(y.shape[1], activation='softmax')  # Adjust the output dimension based on the number of classes
    ])

    model.compile(optimizer=Adam(learning_rate=hp.get('learning_rate')),
                  loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Fetch the winning hyperparameters from the finished search and
# instantiate the model with them (not with a model object).
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = build_best_model(best_hps)

# The rebuilt model starts from random weights, so train it before the
# following cells evaluate and save it. Validation uses a slice of the
# training data; early stopping keeps the weights of the best epoch.
best_model.fit(
    X_train, y_train,
    epochs=10,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
)


In [None]:
# Evaluate the best model: score whatever `best_model` currently refers to on
# the (X_test, y_test) split and print the accuracy to four decimal places.
# evaluate() is unpacked as (loss, accuracy), matching the single 'accuracy'
# metric the model is compiled with.
loss, accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")


In [None]:

# Save the best model
import os
import joblib

# Create the output directory explicitly: joblib.dump() does not create
# missing parent directories, so neither artifact should depend on a
# side effect of the other write.
os.makedirs('models', exist_ok=True)
best_model.save('models/deep_learning_model_with_embeddings.h5')

# Save the tokenizer (needed to encode new text the same way at inference time)
joblib.dump(tokenizer, 'models/tokenizer.pkl')


In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
import keras_tuner as kt  # Updated import for Keras Tuner

# Load the resume dataset; a missing file is reported and the error re-raised
try:
    data = pd.read_csv('resumes_dataset.csv', encoding='utf-8')
except FileNotFoundError:
    print("File not found. Please check the file path.")
    raise
else:
    # Only reached when read_csv returned without raising
    print("Data loaded successfully.")

# Data cleaning function
def clean_text(text):
    """Normalise one resume string before it is fed to the tokenizer.

    Digits are removed, every run of non-word characters is replaced by a
    single space, the text is lower-cased, split with NLTK's
    ``word_tokenize`` and English stop words are dropped.

    Args:
        text: Raw resume text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Load the stop-word list once per call instead of once per token, and
    # keep it in a set so each membership test is a constant-time lookup.
    english_stopwords = set(stopwords.words('english'))

    text = re.sub(r'\d+', '', text)       # Remove numbers
    text = re.sub(r'\W+', ' ', text)      # Remove special characters
    text = text.lower()                   # Convert to lowercase
    words = word_tokenize(text)           # Tokenize the text
    filtered_words = [word for word in words if word not in english_stopwords]
    return ' '.join(filtered_words)

# Clean every resume and keep the result alongside the raw text
data['cleaned_resume'] = data['Resume'].apply(clean_text)

# Turn the cleaned resumes into fixed-length integer sequences and
# the category column into a one-hot target matrix.
max_sequence_length = 200

# Vocabulary is capped at the 5000 most frequent words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['cleaned_resume'])

# Encode, then pad/truncate every sequence to max_sequence_length
X = pad_sequences(
    tokenizer.texts_to_sequences(data['cleaned_resume']),
    maxlen=max_sequence_length,
)

# One column per category
y = pd.get_dummies(data['Category']).to_numpy()

# Hold out 20% of the rows as the test set (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model factory handed to Keras Tuner: one call builds one trial's model
def build_model(hp):
    """Build and compile the stacked BiLSTM -> LSTM classifier for a trial.

    Args:
        hp: Hyperparameter container exposing ``Int`` and ``Float``; it
            supplies the embedding size, LSTM width, dropout rate and
            learning rate for this trial.

    Returns:
        A compiled ``Sequential`` model whose softmax output has
        ``y.shape[1]`` units.
    """
    # Resolve every tunable value up front; both recurrent layers share
    # 'lstm_units' and both dropout layers share 'dropout_rate'.
    embedding_dim = hp.Int('embedding_dim', min_value=50, max_value=300, step=50)
    lstm_units = hp.Int('lstm_units', min_value=32, max_value=128, step=32)
    dropout_rate = hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)
    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')

    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=embedding_dim, input_length=max_sequence_length))
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(y.shape[1], activation='softmax'))

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Configure the Hyperband search that will drive build_model
hyperband_settings = dict(
    objective='val_accuracy',
    max_epochs=10,
    hyperband_iterations=2,
    directory='kt_dir',
    project_name='resume_classifier',
    overwrite=True,
)
tuner = kt.Hyperband(build_model, **hyperband_settings)

# Perform the hyperparameter search.
# Model selection is scored on a slice held out from the *training* data
# (validation_split), so the test set is not used to pick hyperparameters
# and the accuracy reported below is not inflated by selection on it.
tuner.search(X_train, y_train, epochs=10, validation_split=0.2)

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Evaluate the best model on the untouched test set
loss, accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Save the best model
import os
import joblib

# Create the output directory explicitly: joblib.dump() does not create
# missing parent directories, so neither artifact should depend on a
# side effect of the other write.
os.makedirs('models', exist_ok=True)
best_model.save('models/deep_learning_model_with_embeddings.h5')

# Save the tokenizer (needed to encode new text the same way at inference time)
joblib.dump(tokenizer, 'models/tokenizer.pkl')
