In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score

In [2]:
# step 1: load the dataset
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


data = pd.read_csv('dataset.csv')

df_filtered = data[['normalized_salary', 'formatted_work_type']].dropna().copy()

# Encode categorical columns
column_transformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), ['formatted_work_type'])  # Replace with your categorical column names
    ],
    remainder='passthrough'  # Keep other columns as is
)

# Transform the data
df_encoded = column_transformer.fit_transform(df_filtered)

df_encoded = df_encoded.toarray()

# Convert to DataFrame if needed
df_encoded = pd.DataFrame(df_encoded)

print(df_filtered.head())
print(df_encoded.head())    



   normalized_salary formatted_work_type
0            38480.0           Full-time
1            55000.0           Full-time
2           157500.0           Full-time
3            35360.0          Internship
4           180000.0            Contract
     0    1    2    3    4    5    6         7
0  0.0  1.0  0.0  0.0  0.0  0.0  0.0   38480.0
1  0.0  1.0  0.0  0.0  0.0  0.0  0.0   55000.0
2  0.0  1.0  0.0  0.0  0.0  0.0  0.0  157500.0
3  0.0  0.0  1.0  0.0  0.0  0.0  0.0   35360.0
4  1.0  0.0  0.0  0.0  0.0  0.0  0.0  180000.0


In [None]:
# Step 2: apply supervised learning decision tree classifier
from sklearn.metrics import classification_report

features = 'formatted_work_type'  # Replace with your feature column names
target = 'normalize_salary'  # Replace with your target column name

X = df_filtered[[features]]  # Features (all columns except the last one)
y = df_filtered[target]  # Target (last column)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.values.reshape(-1, 1))
X_test = scaler.transform(X_test.values.reshape(-1, 1))

# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)

# Fit the model to the training data
clf.fit(X_train, y_train)

# make predictions on the test set
y_pred = clf.predict(X_test)

# evaluate the model

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_report(y_test, y_pred))

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [54]:
# Step 2: apply supervised learning SVM classifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

features = 'normalized_salary'  # Replace with your feature column names
target = 'formatted_work_type'  # Replace with your target column name

X = df_filtered[[features]]  # Features (all columns except the last one)
y = df_filtered[target]  # Target (last column)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.values.reshape(-1, 1))
X_test = scaler.transform(X_test.values.reshape(-1, 1))

# Initialize the Decision Tree Classifier
clf = SGDClassifier(random_state=42)

# Fit the model to the training data
clf.fit(X_train, y_train)

# make predictions on the test set
y_pred = clf.predict(X_test)

# evaluate the model

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8249829584185412
Precision: 0.6805968816810085
Recall: 0.8249829584185412
F1 Score: 0.745866561154946
Confusion Matrix:
 [[   0  618    0    0    0    0]
 [   0 4841    0    0    0    0]
 [   0   27    0    0    0    0]
 [   0   17    0    0    0    0]
 [   0  309    0    0    0    0]
 [   0   56    0    0    0    0]]
Classification Report:
               precision    recall  f1-score   support

    Contract       0.00      0.00      0.00       618
   Full-time       0.82      1.00      0.90      4841
  Internship       0.00      0.00      0.00        27
       Other       0.00      0.00      0.00        17
   Part-time       0.00      0.00      0.00       309
   Temporary       0.00      0.00      0.00        56

    accuracy                           0.82      5868
   macro avg       0.14      0.17      0.15      5868
weighted avg       0.68      0.82      0.75      5868



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [53]:
# Step 2: Apply supervised learning with Ensemble Learning (Random Forest Classifier)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define features and target
features = 'normalized_salary'  # Replace with your feature column names
target = 'formatted_work_type'  # Replace with your target column name

X = df_filtered[[features]]  # Features (ensure it's a DataFrame)
y = df_filtered[target]  # Target (last column)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
confusion = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8350374914792092
Precision: 0.8082050395546809
Recall: 0.8350374914792092
F1 Score: 0.8184952283954067
Confusion Matrix:
 [[ 308  300    0    0    9    1]
 [ 203 4534    4    8   83    9]
 [   2   23    2    0    0    0]
 [   0   13    0    3    1    0]
 [  18  237    0    0   52    2]
 [  17   34    0    0    4    1]]
Classification Report:
               precision    recall  f1-score   support

    Contract       0.56      0.50      0.53       618
   Full-time       0.88      0.94      0.91      4841
  Internship       0.33      0.07      0.12        27
       Other       0.27      0.18      0.21        17
   Part-time       0.35      0.17      0.23       309
   Temporary       0.08      0.02      0.03        56

    accuracy                           0.84      5868
   macro avg       0.41      0.31      0.34      5868
weighted avg       0.81      0.84      0.82      5868

