In [3]:
import pandas as pd
import numpy as np


In [4]:
# Read the job-postings table from the CSV next to the notebook, then show
# its column labels as the cell's output.
dt = pd.read_csv(filepath_or_buffer='final_data.csv')
dt.columns

Index(['Experience', 'Qualifications', 'Salary Range', 'location', 'Country',
       'Work Type', 'Company Size', 'Preference', 'Job Title', 'Role',
       'Job Description', 'Benefits', 'skills', 'Responsibilities', 'Company',
       'Company Profile'],
      dtype='object')

In [5]:

import re
def clean_text(text):
    """Strip HTML tags and punctuation from a string and tidy its whitespace.

    Parameters
    ----------
    text : object
        Value to clean. Only ``str`` instances are processed.

    Returns
    -------
    object
        For a string: the text with ``<...>`` tags removed, every character
        that is neither a word character nor whitespace removed, runs of
        whitespace collapsed to one space, and both ends trimmed.
        Anything that is not a ``str`` is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Raw strings hand the backslashes in \w and \s to the regex engine
    # untouched; in a plain literal they are invalid string escapes, which
    # newer Python versions warn about.
    text = re.sub(r'<.*?>', '', text)    # drop HTML tags (non-greedy match)
    text = re.sub(r'[^\w\s]', '', text)  # drop punctuation (underscore counts as a word character)
    text = re.sub(r'\s+', ' ', text)     # collapse any whitespace run to one space
    return text.strip()

# Free-text columns that are normalised with clean_text
text_columns = ['Job Description', 'Qualifications', 'skills', 'Responsibilities']

# Run clean_text over every value of each listed column and store the result
# back into dt under the same column name
for column in text_columns:
    dt[column] = dt[column].map(clean_text)


In [6]:

# Remove every row that has at least one missing value in any column.
# axis=0 / how='any' are the dropna defaults, written out for the reader;
# dt itself is modified (inplace=True).
dt.dropna(axis=0, how='any', inplace=True)


In [7]:
# Preview the first rows of the frame after cleaning and dropping NaNs.
print(dt.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
dt.info()


      Experience Qualifications Salary Range              location  \
0  4 to 10 Years            MBA    $57K-$81K                Gitega   
1  4 to 12 Years            BCA   $58K-$108K             Road Town   
2   5 to 8 Years           BCom   $64K-$115K  The City of Hamilton   
3  3 to 13 Years            PhD   $61K-$121K                 Sofia   
4  4 to 15 Years           BCom    $58K-$95K            San Marino   

                  Country  Work Type  Company Size Preference  \
0                 Burundi   Contract         95178       Both   
1  British Virgin Islands  Temporary         67705       Both   
2                 Bermuda  Part-Time        128626     Female   
3                Bulgaria  Temporary        118645       Male   
4              San Marino  Full-Time         22136       Both   

             Job Title                                    Role  \
0  Electrical Engineer           Electronics Hardware Engineer   
1      Sales Associate                  Retail Sales Ass

In [17]:
# Build a single text field per row: 'skills' followed by 'Experience',
# 'Preference' and 'Qualifications', each separated by one space.
combined = dt['skills']
for extra in ('Experience', 'Preference', 'Qualifications'):
    combined = combined + ' ' + dt[extra]
dt['combined_text'] = combined



In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, English stop words removed, vocabulary
# capped at 1000 features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=1000,
)

# Learn the vocabulary from the combined text column and vectorise it in one pass.
X_features = tfidf_vectorizer.fit_transform(dt['combined_text'])


In [19]:
from sklearn.preprocessing import LabelEncoder

# Encode each distinct job title as an integer class id; label_encoder keeps
# the mapping so predictions can be turned back into titles later.
job_titles = dt['Job Title']
label_encoder = LabelEncoder()
y_labels = label_encoder.fit_transform(job_titles)


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_features,
    y_labels,
    random_state=42,
    test_size=0.2,
)


In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Feed-forward classifier over the TF-IDF features: two ReLU hidden layers,
# each followed by 50% dropout, and a softmax output with one unit per
# encoded job title.
model = Sequential([
    # Declare the input width with an explicit Input object. Passing
    # `input_dim` to the first Dense layer is deprecated and is what makes
    # Keras emit a warning when this cell runs.
    tf.keras.Input(shape=(X_features.shape[1],)),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax'),  # Output layer for multi-class
])

# Compile the model. The sparse loss variant is used because y_labels holds
# integer class ids from LabelEncoder rather than one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
# Train for 20 epochs in mini-batches of 32, scoring the held-out split after
# every epoch; the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=20,
)


Epoch 1/20
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.4209 - loss: 2.8246 - val_accuracy: 0.9965 - val_loss: 0.0311
Epoch 2/20
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.9601 - loss: 0.1770 - val_accuracy: 1.0000 - val_loss: 0.0015
Epoch 3/20
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.9871 - loss: 0.0647 - val_accuracy: 1.0000 - val_loss: 3.6440e-04
Epoch 4/20
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.9926 - loss: 0.0344 - val_accuracy: 1.0000 - val_loss: 7.2243e-05
Epoch 5/20
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.9938 - loss: 0.0258 - val_accuracy: 1.0000 - val_loss: 1.5211e-05
Epoch 6/20
[1m1311/1311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.9948 - loss: 0.0200 - val_accuracy: 1.0000 - val_loss: 1.0368e-05


In [23]:
# Score the trained model on the held-out split. With the single 'accuracy'
# metric compiled in, evaluate() yields the loss followed by the accuracy.
validation_scores = model.evaluate(x=X_val, y=y_val)
val_loss, val_acc = validation_scores
print(f"Validation Accuracy: {val_acc}")


[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 1.0000 - loss: 2.1216e-09
Validation Accuracy: 1.0


In [25]:
def recommend_job_title(skills, experience, preferences, qualifications=""):
    """Predict a job title for a candidate profile.

    The fields are joined in the same order as the training ``combined_text``
    column (skills, experience, preference, qualifications), vectorised with
    the fitted ``tfidf_vectorizer`` and classified by ``model``.

    Parameters
    ----------
    skills : str
        Free-text skills. Passed through ``clean_text`` because the training
        'skills' column was cleaned the same way before the vectorizer was fit.
    experience : str
        Experience description. Used as given.
    preferences : str
        Preference text. Used as given.
    qualifications : str, optional
        Qualifications. Passed through ``clean_text`` like the training
        'Qualifications' column. Defaults to an empty string, which leaves
        the field out.

    Returns
    -------
    The job title that ``label_encoder`` maps the highest-scoring class to.
    """
    # Apply clean_text to the same two fields it was applied to in training,
    # so tokens such as "data-driven" are normalised identically at
    # prediction time.
    fields = (clean_text(skills), experience, preferences, clean_text(qualifications))

    # Skip None and blank fields so a missing value is not turned into the
    # literal token "None".
    combined_input = ' '.join(
        str(field) for field in fields
        if field is not None and str(field).strip()
    )

    # Transform the combined input using the pre-trained TF-IDF vectorizer
    input_vector = tfidf_vectorizer.transform([combined_input])

    # Use the trained neural network to make a prediction
    prediction = model.predict(input_vector)

    # Take the best class along the class axis (one row per input) rather
    # than over the flattened array, then map the id back to its job title.
    best_class = np.argmax(prediction, axis=1)
    predicted_job_title = label_encoder.inverse_transform(best_class)[0]

    return predicted_job_title


In [36]:
# Sample candidate profile to run through the recommender.
# NOTE(review): the 'Preference' values visible in the data preview are
# Both / Female / Male, so free text like the one below may not overlap with
# what the vectorizer learned for that field — confirm.
user_skills = "writing"
user_experience = ""
user_preferences = "remote work, flexible hours"

# Ask the model for a title and report it
recommended_job = recommend_job_title(
    skills=user_skills,
    experience=user_experience,
    preferences=user_preferences,
)
print(f"Recommended Job Title: {recommended_job}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
Recommended Job Title: Copywriter
