In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score

In [60]:
# Read the entire dataset
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


data = pd.read_csv('dataset.csv')
df_filtered = data[['normalized_salary', 'formatted_work_type']].dropna().copy()

column_transformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), ['formatted_work_type'])  # Replace with your categorical column names
    ],
    remainder='passthrough'  # Keep other columns as is
)

# Transform the data
df_encoded = column_transformer.fit_transform(df_filtered)

# Convert sparse matrix to dense
df_encoded = df_encoded.toarray()

# Convert to DataFrame if needed
df_encoded = pd.DataFrame(df_encoded)

print(df_encoded.head())


     0    1    2    3    4    5    6         7
0  0.0  1.0  0.0  0.0  0.0  0.0  0.0   38480.0
1  0.0  1.0  0.0  0.0  0.0  0.0  0.0   55000.0
2  0.0  1.0  0.0  0.0  0.0  0.0  0.0  157500.0
3  0.0  0.0  1.0  0.0  0.0  0.0  0.0   35360.0
4  1.0  0.0  0.0  0.0  0.0  0.0  0.0  180000.0


In [62]:
# Step 2: reduce demensions using PCA
pca = PCA(n_components=0.95)  # Keep 95% of variance
data = pca.fit_transform(df_encoded)


In [64]:
# step 3.1: unsupervised learning k-means clustering return cluster labels
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=42)  # Choose the number of clusters
kmeans.fit(data)

# Add cluster labels to the original DataFrame
df_filtered['cluster'] = kmeans.labels_

print(df_filtered.head())


   normalized_salary formatted_work_type  cluster
0            38480.0           Full-time        0
1            55000.0           Full-time        0
2           157500.0           Full-time        0
3            35360.0          Internship        0
4           180000.0            Contract        0


In [56]:
# step 3.2. unsupervised learning DBSCAN clustering return cluster labels, outliers flag
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.5, min_samples=5)  # Adjust parameters as needed

dbscan.fit(data)

# Add cluster labels and outliers flag to the original DataFrame

df_filtered['dbscan_cluster'] = dbscan.labels_
df_filtered['outlier'] = df_filtered['dbscan_cluster'] == -1  # Mark outliers as True


In [57]:
# Step 3.3. unsupervised learning gaussian mixture models return probability features

from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=3, random_state=42)  # Choose the number of components

gmm.fit(data)

# Get the probabilities for each component
probs = gmm.predict_proba(data)

# Add probabilities to the original DataFrame
for i in range(probs.shape[1]):
    df_filtered[f'gmm_prob_{i}'] = probs[:, i]
    
# Print the first few rows of the DataFrame with GMM probabilities
print(df_filtered.head())




   normalized_salary formatted_work_type  cluster  dbscan_cluster  outlier  \
0            38480.0           Full-time        0               0    False   
1            55000.0           Full-time        0               1    False   
2           157500.0           Full-time        0               2    False   
3            35360.0          Internship        0               3    False   
4           180000.0            Contract        0               4    False   

   gmm_prob_0  gmm_prob_1  gmm_prob_2  
0    0.999988    0.000012         0.0  
1    0.999991    0.000009         0.0  
2    0.999989    0.000011         0.0  
3    0.999988    0.000012         0.0  
4    0.999982    0.000018         0.0  


In [None]:
# step 4: augment (Dữ liệu đã giảm chiều + cluster info (đặc trưng mới))

df_augmented = df_filtered.copy()
df_augmented = df_augmented.drop(columns=['normalized_salary', 'formatted_work_type'])  # Drop original columns if needed
print(df_augmented.head())


Accuracy: 0.9996591683708248
Precision: 0.9993184529078492
Recall: 0.9996591683708248
F1 Score: 0.9994887816027371
Confusion Matrix:
 [[5866    0    0]
 [   1    0    0]
 [   1    0    0]]
   cluster  dbscan_cluster  outlier  gmm_prob_0  gmm_prob_1  gmm_prob_2
0        0               0    False    0.999988    0.000012         0.0
1        0               1    False    0.999991    0.000009         0.0
2        0               2    False    0.999989    0.000011         0.0
3        0               3    False    0.999988    0.000012         0.0
4        0               4    False    0.999982    0.000018         0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [70]:
# step 5: supervised learning (train-test split, scale, train, predict, evaluate)
from sklearn.metrics import classification_report


X = df_augmented.drop(columns=['cluster'])  # Features
y = df_augmented['cluster']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_scaled, y_train)

# Make predictions
y_pred = clf.predict(X_test_scaled)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Print the first few rows of the augmented DataFrame

Accuracy: 0.9996591683708248
Precision: 0.9993184529078492
Recall: 0.9996591683708248
F1 Score: 0.9994887816027371
Confusion Matrix:
 [[5866    0    0]
 [   1    0    0]
 [   1    0    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
