In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model

dataset_path = '/content/drive/MyDrive/datasets/sgp.csv'

# Load the job-postings CSV from the mounted Google Drive folder.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0. Rows whose field count does not match the
# header are still dropped, but without printing one "Skipping line ..." message
# per row (use on_bad_lines='warn' to get the messages back).
# NOTE(review): quoting=3 is csv.QUOTE_NONE, so a comma inside a quoted field is
# treated as a separator; this is probably why so many rows are rejected and why
# stray quote characters survive in the values. Confirm against the raw file.
df = pd.read_csv(dataset_path, delimiter=',', quoting=3, on_bad_lines='skip')

# Columns used by this notebook: three feature columns followed by 'company',
# which is the prediction target.
selected_columns = ['jobdescription', 'skills', 'jobtitle', 'company']

# Replace missing values with empty strings.
# `df[selected_columns]` builds a new DataFrame, so calling
# `.fillna(..., inplace=True)` on it only changed that temporary copy and left
# `df` untouched. Assigning the filled frame back makes the fill take effect.
df[selected_columns] = df[selected_columns].fillna("")

# Encode the categorical feature columns as integer ids, keeping one fitted
# encoder per column in `label_encoders`.
# 'jobdescription' is skipped on purpose: it is tokenized as free text further
# down, and label-encoding it would replace every description with a single
# integer id, leaving the tokenizer nothing but digit strings to learn from.
# 'company' (the last entry) is excluded by the slice; it is encoded separately
# right before training.
label_encoders = {}
for col in selected_columns[:-1]:
    if col == 'jobdescription':
        continue
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Features are every selected column except the last; the target is 'company'.
feature_columns = selected_columns[:-1]
X, y = df[feature_columns], df['company']

# Every distinct company label present in the full dataset.
all_labels = set(y.unique())

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Labels that occur in the test split but never in the training split.
y_train_labels = set(y_train.unique())
y_test_labels = set(y_test.unique())
unseen_labels = y_test_labels.difference(y_train_labels)

# Drop rows carrying an unseen label from both splits. By construction no
# training row can match, so only the test split can actually shrink.
keep_train = ~y_train.isin(unseen_labels)
keep_test = ~y_test.isin(unseen_labels)
X_train, y_train = X_train[keep_train], y_train[keep_train]
X_test, y_test = X_test[keep_test], y_test[keep_test]

# Text-pipeline settings; adjust both to the corpus.
max_words = 10000            # vocabulary cap handed to the tokenizer
max_sequence_length = 200    # fixed length every sequence is padded to

# Cast the description column to str so the tokenizer always receives text.
X_train_text, X_test_text = (
    split['jobdescription'].astype(str) for split in (X_train, X_test)
)

# Build the vocabulary from every description in `df`.
# NOTE(review): this includes the test rows; fitting on the training text only
# would keep the held-out set fully unseen. Left as-is to preserve behavior.
tokenizer = Tokenizer(
    num_words=max_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
)
tokenizer.fit_on_texts(df['jobdescription'].astype(str))

# Turn each text into a word-index sequence, then pad to the fixed length.
X_train_sequences, X_test_sequences = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_sequence_length)
    for texts in (X_train_text, X_test_text)
)

# Network: token ids -> 128-d embedding -> 128-unit LSTM -> softmax with one
# output unit per company label found in the training split.
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(input_dim=max_words, output_dim=128)(input_layer)
lstm_layer = LSTM(128)(embedding_layer)

num_classes = len(y_train_labels)
output_layer = Dense(num_classes, activation='softmax')(lstm_layer)

# Targets are integer class ids, hence the sparse cross-entropy loss.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Map company labels to integer class ids. The encoder is fitted on the training
# labels only and then reused for the test labels.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Train for a single epoch, holding back 20% of the training rows for validation.
model.fit(
    X_train_sequences,
    y_train_encoded,
    batch_size=32,
    epochs=1,
    validation_split=0.2,
)

# Score the model on the filtered test split and report loss and accuracy.
test_metrics = model.evaluate(X_test_sequences, y_test_encoded)
loss, accuracy = test_metrics
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%")




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Skipping line 1200: expected 40 fields, saw 49
Skipping line 1201: expected 40 fields, saw 47
Skipping line 1202: expected 40 fields, saw 41
Skipping line 1205: expected 40 fields, saw 47
Skipping line 1206: expected 40 fields, saw 46
Skipping line 1207: expected 40 fields, saw 43
Skipping line 1209: expected 40 fields, saw 68
Skipping line 1210: expected 40 fields, saw 53
Skipping line 1211: expected 40 fields, saw 66
Skipping line 1212: expected 40 fields, saw 43
Skipping line 1214: expected 40 fields, saw 47
Skipping line 1218: expected 40 fields, saw 47
Skipping line 1219: expected 40 fields, saw 41
Skipping line 1223: expected 40 fields, saw 44
Skipping line 1226: expected 40 fields, saw 67
Skipping line 1229: expected 40 fields, saw 62
Skipping line 1230: expected 40 fields, saw 44
Skipping line 1231: expected 40 fields, saw 48
Skipping line 1232: expected 40 fields, saw 43
Skipping line 1235: expected 40 fields, sa

Test Loss: 0.2113, Test Accuracy: 97.66%


In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# NOTE(review): the training cell earlier in this notebook already reads
# /content/drive/MyDrive/datasets/sgp.csv, so on a fresh runtime this mount has
# to be executed first. Consider moving this cell to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist the trained model to Drive as an .h5 file.
# Only the network is written here; `tokenizer` and `label_encoder` are not
# saved, so a new session must rebuild them before it can preprocess or decode.
model.save('/content/drive/MyDrive/datasets/RNN2.h5')

  saving_api.save_model(


In [None]:
from keras.models import load_model

# Reload the model that was just saved to Drive, as a check that the file loads.
loaded_model = load_model('/content/drive/MyDrive/datasets/RNN2.h5')

In [None]:
import pandas as pd
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# Load the trained RNN model.
# Uses RNN2.h5, the file this notebook actually saves, instead of RNN.h5, so the
# network's output units line up with the classes of the in-memory
# `label_encoder` that is used to decode predictions.
loaded_model = load_model('/content/drive/MyDrive/datasets/RNN2.h5')

def predict_company(jobdescription, jobtitle, skills):
    """Return the company label the loaded model scores highest for a posting.

    The three fields are joined with single spaces, converted to a padded
    word-index sequence with the notebook-level `tokenizer`, scored by
    `loaded_model`, and the arg-max index is decoded with `label_encoder`.

    Args:
        jobdescription: Free-text description of the job.
        jobtitle: Title of the job.
        skills: Skills text for the job.

    Returns:
        The decoded company label for the highest-scoring class.

    NOTE(review): the model in this notebook is trained on 'jobdescription'
    sequences only, while this function also feeds the title and skills.
    Confirm that the extra text is intended.
    """
    # One combined string, in the order description, title, skills.
    combined_text = "{} {} {}".format(jobdescription, jobtitle, skills)

    # Same preprocessing as training: word indices, then fixed-length padding.
    token_ids = tokenizer.texts_to_sequences([combined_text])
    padded_ids = pad_sequences(token_ids, maxlen=max_sequence_length)

    # Index of the largest score in the prediction for this single-row batch.
    class_scores = loaded_model.predict(padded_ids)
    best_index = class_scores.argmax()

    # Translate the class index back into the original company label.
    return label_encoder.inverse_transform([best_index])[0]

# Example usage: score one hand-written posting and show the predicted company.
jobdescription, jobtitle, skills = (
    "Software Engineer with experience in Python",
    "Software Engineer",
    "Python, Java, SQL",
)
predicted_company = predict_company(jobdescription, jobtitle, skills)
print("Predicted Company:", predicted_company)


Predicted Company:  XMAL"
