In [1]:
pip install pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the CSV data
file_path = 'pastoral.csv'  
pastoral_data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Split the data into features (X) and target (y)
X = pastoral_data['job description']
y = pastoral_data['label']

# Vectorize the job descriptions using TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Output the classification report to evaluate performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Function to predict the label for a given job description
def predict_label(job_description):
    # Vectorize the input job description
    job_description_tfidf = vectorizer.transform([job_description])
    
    # Predict the label using the trained model
    predicted_label = model.predict(job_description_tfidf)[0]
    
    return predicted_label

# Example usage
example_job_description = "Manage and maintain sports fields to appropriate standards"
predicted_label = predict_label(example_job_description)

print(f"Predicted Label: {predicted_label}")


Classification Report:
              precision    recall  f1-score   support

  Response A       0.83      0.45      0.59        11
  Response B       0.74      0.94      0.83        18

    accuracy                           0.76        29
   macro avg       0.79      0.70      0.71        29
weighted avg       0.77      0.76      0.74        29

Predicted Label: Response B


In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the CSV data
file_path = 'pastoral.csv' 
pastoral_data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Split the data into features (X) and target (y)
X = pastoral_data['job description']
y = pastoral_data['label']

# Vectorize the job descriptions using TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Output the classification report to evaluate performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Function to predict the label for a given job description and display original and predicted labels
def predict_label(job_description, original_label):
    # Vectorize the input job description
    job_description_tfidf = vectorizer.transform([job_description])
    
    # Predict the label using the trained model
    predicted_label = model.predict(job_description_tfidf)[0]
    
    # Output the formatted result
    print(f"Example Job Description: \"{job_description}\"")
    print(f"Original Label: {original_label}")
    print(f"Predicted Label: {predicted_label}")

# Example usage with an example job description
example_job_description = "Manage and maintain sports fields to appropriate standards"
original_label = "Response B"  # Replace this with the actual original label if available

predict_label(example_job_description, original_label)


Classification Report:
              precision    recall  f1-score   support

  Response A       0.83      0.45      0.59        11
  Response B       0.74      0.94      0.83        18

    accuracy                           0.76        29
   macro avg       0.79      0.70      0.71        29
weighted avg       0.77      0.76      0.74        29

Example Job Description: "Manage and maintain sports fields to appropriate standards"
Original Label: Response B
Predicted Label: Response B


In [4]:
pip install pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the CSV data
file_path = 'pastoral.csv'  # Replace with the correct path
pastoral_data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Split the data into features (X) and target (y)
X = pastoral_data['job description']
y = pastoral_data['label']

# Vectorize the job descriptions using TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Set up the hyperparameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20],     # Maximum depth of the trees
    'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],   # Minimum number of samples required at a leaf node
}

# Initialize a GridSearchCV object with cross-validation
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,        # 5-fold cross-validation
                           n_jobs=-1,   # Use all available cores
                           verbose=1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Best hyperparameters found by GridSearchCV
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the RandomForestClassifier with the best hyperparameters
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Evaluate the model using cross-validation to get a more reliable score
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)
print(f"Cross-validation mean score: {cv_scores.mean()}")

# Predict on the test set
y_pred_best = best_model.predict(X_test)

# Output the classification report
print("Classification Report for Best Model:")
print(classification_report(y_test, y_pred_best))


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Cross-validation mean score: 0.6869565217391305
Classification Report for Best Model:
              precision    recall  f1-score   support

  Response A       0.67      0.18      0.29        11
  Response B       0.65      0.94      0.77        18

    accuracy                           0.66        29
   macro avg       0.66      0.56      0.53        29
weighted avg       0.66      0.66      0.59        29



In [7]:
pip install xgboost


Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.1
Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Sample job descriptions (use your real dataset here)
job_descriptions = ["Manage and maintain sports fields to appropriate standards",
                    "Provide support and advice on student well-being",
                    "Handle the welfare of patients in a medical setting"]

# Sample corresponding labels for training (use actual labels from your dataset)
labels = ["Response A", "Response B", "Response A"]

# Convert categorical labels to numeric using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

# Vectorize the job descriptions using TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(job_descriptions)

# Train a simple XGBoost classifier (assuming a small demo)
model = XGBClassifier(random_state=42)
model.fit(X_tfidf, y_encoded)

# New job description to predict
new_job_description = ["Supervise and ensure proper use of all sports facilities"]

# Vectorize the new job description
new_job_tfidf = vectorizer.transform(new_job_description)

# Predict the label
predicted_label_numeric = model.predict(new_job_tfidf)

# Convert numeric prediction back to original label
predicted_label = label_encoder.inverse_transform(predicted_label_numeric)

# Display the result
print(f"Example Job Description: \"{new_job_description[0]}\"")
print(f"Predicted Label: {predicted_label[0]}")


Example Job Description: "Supervise and ensure proper use of all sports facilities"
Predicted Label: Response A


In [7]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Load the CSV data
file_path = 'pastoral.csv' 
pastoral_data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Split the data into features (X) and target (y)
X = pastoral_data['job description']
y = pastoral_data['label']

# Convert categorical labels to numeric using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Vectorize the job descriptions using TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Train a simple XGBoost classifier (you may already have a trained model)
model = XGBClassifier(random_state=42)
model.fit(X_tfidf, y_encoded)

# New job description input (from user)
new_job_description = "Providing support, advice, and information to graduates, including workshops and one-on-one sessions"

# Find the original label from the dataset (if exists)
original_label = pastoral_data[pastoral_data['job description'] == new_job_description]['label'].values
if len(original_label) > 0:
    original_label = original_label[0]
else:
    original_label = "(Original label not found in dataset)"

# Vectorize the new job description
new_job_tfidf = vectorizer.transform([new_job_description])

# Predict the label
predicted_label_numeric = model.predict(new_job_tfidf)

# Convert numeric prediction back to original label
predicted_label = label_encoder.inverse_transform(predicted_label_numeric)

# Display the result in the desired format
print(f"Example Job Description: \"{new_job_description}\"")
print(f"Original Label: {original_label}")
print(f"Predicted Label: {predicted_label[0]}")


Example Job Description: "Providing support, advice, and information to graduates, including workshops and one-on-one sessions"
Original Label: Response B
Predicted Label: Response B
