In [1]:
pip install pandas scikit-learn joblib

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import joblib

# Read the labelled dataset; the 'UTF-8-SIG' codec drops a leading byte-order mark if present
df = pd.read_csv('data.csv', encoding='UTF-8-SIG')

# Model input comes from the 'Code.1' column, the prediction target from 'Label'
# NOTE(review): the '.1' suffix looks like pandas' renaming of a duplicated 'Code' header -
# confirm this is the intended text column
X, y = df['Code.1'], df['Label']

# Reserve 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training text only, then apply that same
# fitted vectorizer to the held-out text
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Candidate classifiers, keyed by the name used in all printed output below
models = {
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    # Seeded (same seed as the train/test split) so the forest's grid-search score,
    # test report and saved pickle are reproducible from run to run
    'RandomForest': RandomForestClassifier(random_state=42)
}

# Hyper-parameter grids searched by GridSearchCV; keys must match `models`
params = {
    'LogisticRegression': {'C': [0.1, 1, 10]},
    'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'RandomForest': {'n_estimators': [50, 100, 200]}
}

best_models = {}
cv_scores = {}  # best mean cross-validated accuracy per model, used for model selection

# Perform GridSearchCV for each model
# NOTE(review): the TF-IDF features were fit on the whole training split before this
# search, so the CV folds share vocabulary/IDF statistics; wrap the vectorizer and
# classifier in a Pipeline if stricter fold isolation is wanted.
for model_name, estimator in models.items():
    grid = GridSearchCV(estimator, params[model_name], cv=5, scoring='accuracy')
    grid.fit(X_train_tfidf, y_train)
    best_models[model_name] = grid.best_estimator_
    cv_scores[model_name] = grid.best_score_
    print(f'{model_name} best params: {grid.best_params_}')
    print(f'{model_name} best score: {grid.best_score_}')

# Evaluate the best models on the test set, keeping each model's predictions so the
# later analysis does not depend on whichever model the loop happened to visit last
test_predictions = {}
for model_name, fitted_model in best_models.items():
    test_predictions[model_name] = fitted_model.predict(X_test_tfidf)
    print(model_name + ' classification report:')
    print(classification_report(y_test, test_predictions[model_name]))

# Select the best model by cross-validated accuracy rather than hard-coding one;
# the test set is used for reporting only, not for selection
best_model_name = max(cv_scores, key=cv_scores.get)
best_model = best_models[best_model_name]
y_pred = test_predictions[best_model_name]
print(f'Selected best model: {best_model_name} (CV accuracy {cv_scores[best_model_name]})')

# Save the best model; the file name follows the selected model, so it is
# 'best_model_random_forest.pkl' only when RandomForest actually wins
file_slugs = {
    'LogisticRegression': 'logistic_regression',
    'SVC': 'svc',
    'RandomForest': 'random_forest',
}
model_path = 'best_model_' + file_slugs.get(best_model_name, best_model_name.lower()) + '.pkl'
joblib.dump(best_model, model_path)
print('Best model saved as ' + model_path)

# The classifier expects TF-IDF features, so persist the fitted vectorizer as well;
# without it the saved model cannot be applied to raw text
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print('Vectorizer saved as tfidf_vectorizer.pkl')

# Analyze misclassifications of the selected model
is_wrong = (y_test != y_pred).to_numpy()
misclassified = X_test[is_wrong]
misclassified_labels = y_test[is_wrong]
misclassified_preds = y_pred[is_wrong]

# Display misclassified examples
for text, true_label, predicted_label in zip(misclassified, misclassified_labels, misclassified_preds):
    print('Job Description:', text)
    print('True Label:', true_label)
    print('Predicted Label:', predicted_label)
    print('---')



LogisticRegression best params: {'C': 10}
LogisticRegression best score: 0.8943961352657004
SVC best params: {'C': 10, 'kernel': 'linear'}
SVC best score: 0.9121739130434783




RandomForest best params: {'n_estimators': 50}
RandomForest best score: 0.8679227053140097
LogisticRegression classification report:
              precision    recall  f1-score   support

  Response B       0.73      0.89      0.80        18
  Response C       0.93      0.87      0.90        30
  Response D       1.00      0.78      0.88         9

    accuracy                           0.86        57
   macro avg       0.89      0.84      0.86        57
weighted avg       0.88      0.86      0.86        57

SVC classification report:
              precision    recall  f1-score   support

  Response B       0.73      0.89      0.80        18
  Response C       0.92      0.80      0.86        30
  Response D       0.78      0.78      0.78         9

    accuracy                           0.82        57
   macro avg       0.81      0.82      0.81        57
weighted avg       0.84      0.82      0.83        57

RandomForest classification report:
              precision    recall  f1-scor