In [1]:
pip install pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the dataset
data = pd.read_csv('team.csv', encoding='ISO-8859-1')

# Clean column names by stripping whitespaces
data.columns = data.columns.str.strip()

# Prepare the data
X = data['Job Description']  # Features (Job Descriptions)
y = data['label']  # Target (Labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Convert text data into numerical features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize the RandomForestClassifier
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Display classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Function to predict the label for a new job description
def predict_job_label(description):
    description_tfidf = vectorizer.transform([description])  # Convert input text to TF-IDF features
    prediction = model.predict(description_tfidf)  # Make prediction
    return prediction[0]  # Return the predicted label

# Example usage:
new_description = input("Enter a job description: ")
predicted_label = predict_job_label(new_description)
print(f"Predicted label for the given job description: {predicted_label}")


Classification Report:
               precision    recall  f1-score   support

  Response A       0.74      0.91      0.82        34
  Response B       0.72      0.62      0.67        21
  Response C       0.67      0.25      0.36         8

    accuracy                           0.73        63
   macro avg       0.71      0.59      0.62        63
weighted avg       0.72      0.73      0.71        63

Predicted label for the given job description: Response B


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the dataset
data = pd.read_csv('team.csv', encoding='ISO-8859-1')

# Clean column names by stripping whitespaces
data.columns = data.columns.str.strip()

# Prepare the data
X = data['Job Description']  # Features (Job Descriptions)
y = data['label']  # Target (Labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Convert text data into numerical features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize the RandomForestClassifier
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Display classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Combine the test set results (Job Description, Original Label, and Predicted Label)
results_df = pd.DataFrame({
    'Job Description': X_test,
    'Original Label': y_test,
    'Predicted Label': y_pred
})

# Display the output (Job Description, Original Label, Predicted Label)
print("\nTest Set Results:\n")
print(results_df)

# Save the results to a CSV file
results_df.to_csv('test_set_predictions.csv', index=False)

# Example usage to predict a label for a new job description:
def predict_job_label(description):
    description_tfidf = vectorizer.transform([description])  # Convert input text to TF-IDF features
    prediction = model.predict(description_tfidf)  # Make prediction
    return prediction[0]  # Return the predicted label

# Predict a new job description (optional)
new_description = input("\nEnter a job description: ")
predicted_label = predict_job_label(new_description)
print(f"\nPredicted label for the given job description: {predicted_label}")


Classification Report:
               precision    recall  f1-score   support

  Response A       0.74      0.91      0.82        34
  Response B       0.72      0.62      0.67        21
  Response C       0.67      0.25      0.36         8

    accuracy                           0.73        63
   macro avg       0.71      0.59      0.62        63
weighted avg       0.72      0.73      0.71        63


Test Set Results:

                                       Job Description Original Label  \
30     Develop interactive data reports and dashboards     Response B   
172  Provide guidance on campaign strategies and co...     Response A   
84     Manage contracts for campus facilities services     Response B   
199  Occasionally assist with training staff on AV ...     Response C   
60                Provide financial advice to students     Response B   
..                                                 ...            ...   
65      Ensure tools and materials are well maintained     Respo

In [4]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Uncomment these lines if you haven't downloaded NLTK resources yet
# nltk.download('stopwords')
# nltk.download('wordnet')

# Load the dataset
data = pd.read_csv('team.csv', encoding='ISO-8859-1')

# Clean column names by stripping whitespaces
data.columns = data.columns.str.strip()

# Preprocessing function to clean and lemmatize text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Lemmatize and remove stopwords
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Apply the preprocessing to job descriptions
data['Job Description'] = data['Job Description'].apply(preprocess_text)

# Prepare the data
X = data['Job Description']  # Features (Job Descriptions)
y = data['label']  # Target (Labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a pipeline that includes both TF-IDF Vectorizer and RandomForestClassifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=42))
])

# Hyperparameter tuning using RandomizedSearchCV
param_distributions = {
    'rf__n_estimators': [50, 100, 200, 300],
    'rf__max_depth': [10, 20, 30, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}

# Perform RandomizedSearchCV to find the best hyperparameters
random_search = RandomizedSearchCV(pipeline, param_distributions, n_iter=20, cv=5, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_search.predict(X_test)

# Display classification report
print("Best Parameters Found:\n", random_search.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Combine the test set results (Job Description, Original Label, and Predicted Label)
results_df = pd.DataFrame({
    'Job Description': X_test,
    'Original Label': y_test,
    'Predicted Label': y_pred
})

# Display the output (Job Description, Original Label, Predicted Label)
print("\nTest Set Results:\n")
print(results_df)

# Save the results to a CSV file
results_df.to_csv('test_set_predictions_with_hyperparameters.csv', index=False)

# Example usage to predict a label for a new job description:
def predict_job_label(description):
    description = preprocess_text(description)  # Preprocess the input
    prediction = random_search.predict([description])  # Make prediction
    return prediction[0]  # Return the predicted label

# Predict a new job description (optional)
new_description = input("\nEnter a job description: ")
predicted_label = predict_job_label(new_description)
print(f"\nPredicted label for the given job description: {predicted_label}")

Best Parameters Found:
 {'rf__n_estimators': 100, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1, 'rf__max_depth': 30}

Classification Report:
               precision    recall  f1-score   support

  Response A       0.88      0.85      0.87        34
  Response B       0.65      0.81      0.72        21
  Response C       0.75      0.38      0.50         8

    accuracy                           0.78        63
   macro avg       0.76      0.68      0.70        63
weighted avg       0.79      0.78      0.77        63


Test Set Results:

                                       Job Description Original Label  \
30           develop interactive data report dashboard     Response B   
172   provide guidance campaign strategy communication     Response A   
84             manage contract campus facility service     Response B   
199  occasionally assist training staff av equipmen...     Response C   
60                    provide financial advice student     Response B   
..        

In [5]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Uncomment these lines if you haven't downloaded NLTK resources yet
# nltk.download('stopwords')
# nltk.download('wordnet')

# Load the dataset
data = pd.read_csv('team.csv', encoding='ISO-8859-1')

# Clean column names by stripping whitespaces
data.columns = data.columns.str.strip()

# Preprocessing function to clean and lemmatize text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Lemmatize and remove stopwords
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Apply the preprocessing to job descriptions
data['Job Description'] = data['Job Description'].apply(preprocess_text)

# Prepare the data
X = data['Job Description']  # Features (Job Descriptions)
y = data['label']  # Target (Labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a pipeline that includes both TF-IDF Vectorizer and RandomForestClassifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=42))
])

# Hyperparameter tuning using RandomizedSearchCV
param_distributions = {
    'rf__n_estimators': [50, 100, 200, 300],
    'rf__max_depth': [10, 20, 30, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}

# Perform RandomizedSearchCV to find the best hyperparameters
random_search = RandomizedSearchCV(pipeline, param_distributions, n_iter=20, cv=5, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_search.predict(X_test)

# Display classification report
print("Best Parameters Found:\n", random_search.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Combine the test set results (Job Description, Original Label, and Predicted Label)
results_df = pd.DataFrame({
    'Job Description': X_test,
    'Original Label': y_test,
    'Predicted Label': y_pred
})

# Display the output (Job Description, Original Label, Predicted Label)
print("\nTest Set Results:\n")
print(results_df)

# Save the results to a CSV file
results_df.to_csv('test_set_predictions_with_hyperparameters.csv', index=False)

# Function to predict the label for a new job description
def predict_job_label(description, data, random_search):
    # Preprocess the input description
    processed_description = preprocess_text(description)
    
    # Predict the label using the model
    predicted_label = random_search.predict([processed_description])[0]
    
    # Try to find the original label from the dataset (if it exists)
    original_label = data[data['Job Description'] == processed_description]['label']
    
    if not original_label.empty:
        original_label = original_label.values[0]
    else:
        original_label = "Not available in the dataset"
    
    return original_label, predicted_label

# Example usage:
new_description = input("\nEnter a job description: ")
original_label, predicted_label = predict_job_label(new_description, data, random_search)
print(f"\nExample Job Description: \"{new_description}\"")
print(f"Original Label: {original_label}")
print(f"Predicted Label: {predicted_label}")


Best Parameters Found:
 {'rf__n_estimators': 100, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1, 'rf__max_depth': 30}

Classification Report:
               precision    recall  f1-score   support

  Response A       0.88      0.85      0.87        34
  Response B       0.65      0.81      0.72        21
  Response C       0.75      0.38      0.50         8

    accuracy                           0.78        63
   macro avg       0.76      0.68      0.70        63
weighted avg       0.79      0.78      0.77        63


Test Set Results:

                                       Job Description Original Label  \
30           develop interactive data report dashboard     Response B   
172   provide guidance campaign strategy communication     Response A   
84             manage contract campus facility service     Response B   
199  occasionally assist training staff av equipmen...     Response C   
60                    provide financial advice student     Response B   
..        

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Uncomment if NLTK resources need to be downloaded
# nltk.download('stopwords')
# nltk.download('wordnet')

# Load the dataset
data = pd.read_csv('team.csv', encoding='ISO-8859-1')
data.columns = data.columns.str.strip()  # Clean column names

# Preprocessing function to clean text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])

# Preprocess job descriptions
data['Job Description'] = data['Job Description'].apply(preprocess_text)

# Train-Test Split
X = data['Job Description']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a TF-IDF + Logistic Regression pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=1000, random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Function to predict and compare original and predicted labels
def predict_job_label(description, data, model):
    processed_description = preprocess_text(description)
    predicted_label = model.predict([processed_description])[0]
    original_label = data[data['Job Description'] == processed_description]['label']
    original_label = original_label.values[0] if not original_label.empty else "Not available in dataset"
    return original_label, predicted_label

# Example usage
new_description = input("Enter a job description: ")
original_label, predicted_label = predict_job_label(new_description, data, pipeline)
print(f"\nExample Job Description: \"{new_description}\"")
print(f"Original Label: {original_label}")
print(f"Predicted Label: {predicted_label}")



Example Job Description: "Occasionally guide team members on digital marketing strategies"
Original Label: Response B
Predicted Label: Response B
