In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
import networkx as nx
from node2vec import Node2Vec
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise import accuracy as surprise_accuracy
from sklearn.ensemble import RandomForestClassifier  # Added import


In [2]:
# Load the CSV file
file_path = 'employee_skills_ratings_V3.csv'  # Change this to your local file path
df = pd.read_csv(file_path)

# Encode the department labels to integers
label_encoder = LabelEncoder()
df['Department'] = label_encoder.fit_transform(df['Department'])

# Split the data into features and target
X = df.drop(columns=['Employee ID', 'Department'])
y = df['Department']


In [3]:

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:

### Knowledge Graph Approach
# Create a knowledge graph
G = nx.Graph()

# Add nodes and edges based on the CSV data
for index, row in df.iterrows():
    employee_id = row['Employee ID']
    department = row['Department']
    skills = row.drop(['Employee ID', 'Department'])

    # Add employee node
    G.add_node(employee_id, type='employee')

    # Add department node and edge
    G.add_node(department, type='department')
    G.add_edge(employee_id, department, weight=1)

    # Add skill nodes and edges
    for skill, rating in skills.items():
        if rating > 0:
            G.add_node(skill, type='skill')
            G.add_edge(employee_id, skill, weight=rating)

# Generate embeddings using Node2Vec
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# Create feature vectors for employees based on embeddings
X_train_emb = np.array([model.wv[str(id)] for id in X_train.index])
X_test_emb = np.array([model.wv[str(id)] for id in X_test.index])

# Train a classifier on the embeddings
rf_clf = RandomForestClassifier()  # RandomForestClassifier import added here
rf_clf.fit(X_train_emb, y_train)

# Predict and evaluate
y_pred_emb = rf_clf.predict(X_test_emb)
kg_accuracy = accuracy_score(y_test, y_pred_emb)
print(f"Knowledge Graph Accuracy: {kg_accuracy:.2f}")


Computing transition probabilities:   0%|          | 0/180 [00:00<?, ?it/s]

Knowledge Graph Accuracy: 0.23


In [5]:

### Decision Tree
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

# Predict and evaluate
y_pred_dt = dt_clf.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {dt_accuracy:.2f}")


Decision Tree Accuracy: 1.00


In [6]:

### Gradient Boosting Machine (GBM)
gbm_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
gbm_clf.fit(X_train, y_train)

# Predict and evaluate
y_pred_gbm = gbm_clf.predict(X_test)
gbm_accuracy = accuracy_score(y_test, y_pred_gbm)
print(f"GBM Accuracy: {gbm_accuracy:.2f}")


GBM Accuracy: 1.00


In [7]:
### Collaborative Filtering
# Prepare the data for Surprise
melted_df = df.melt(id_vars=['Employee ID'], value_vars=[col for col in df.columns if col not in ['Employee ID', 'Department']],
                    var_name='Skill', value_name='Rating')

display(melted_df)
# Filter out rows where the rating is 0 (assuming ratings of 0 should not be included in the collaborative filtering)
melted_df = melted_df[melted_df['Rating'] > 0]

# Define the reader object with appropriate rating scale
reader = Reader(rating_scale=(1, 10))  # Adjust the rating scale if your data has different range

# Load the data into Surprise's Dataset format
data = Dataset.load_from_df(melted_df[['Employee ID', 'Skill', 'Rating']], reader)

# Split the data into training and testing sets
trainset, testset = surprise_train_test_split(data, test_size=0.2)

# Train a collaborative filtering model (SVD)
svd = SVD()
svd.fit(trainset)

# Predict and evaluate
predictions = svd.test(testset)
cf_rmse = surprise_accuracy.rmse(predictions)
print(f"Collaborative Filtering RMSE: {cf_rmse:.2f}")

Unnamed: 0,Employee ID,Skill,Rating
0,1,ML,0
1,2,ML,0
2,3,ML,0
3,4,ML,0
4,5,ML,0
...,...,...,...
4345,146,Investment Analysis,0
4346,147,Investment Analysis,0
4347,148,Investment Analysis,0
4348,149,Investment Analysis,0


RMSE: 2.8587
Collaborative Filtering RMSE: 2.86


In [8]:

### Content-Based Filtering
similarity = cosine_similarity(X_train)

# Train a KNN classifier based on similarity
knn_clf = KNeighborsClassifier(n_neighbors=5, metric='cosine')
knn_clf.fit(X_train, y_train)

# Predict and evaluate
y_pred_knn = knn_clf.predict(X_test)
cb_accuracy = accuracy_score(y_test, y_pred_knn)
print(f"Content-Based Filtering Accuracy: {cb_accuracy:.2f}")


Content-Based Filtering Accuracy: 1.00


In [9]:

### Performance Comparison
print("\nPerformance Comparison:")
print(f"Knowledge Graph Accuracy: {kg_accuracy:.2f}")
print(f"Decision Tree Accuracy: {dt_accuracy:.2f}")
print(f"GBM Accuracy: {gbm_accuracy:.2f}")
print(f"Collaborative Filtering RMSE: {cf_rmse:.2f}")
print(f"Content-Based Filtering Accuracy: {cb_accuracy:.2f}")



Performance Comparison:
Knowledge Graph Accuracy: 0.23
Decision Tree Accuracy: 1.00
GBM Accuracy: 1.00
Collaborative Filtering RMSE: 2.86
Content-Based Filtering Accuracy: 1.00
