In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
import numpy as np

In [2]:
# Load the raw postings table; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='postings.csv')

In [3]:
# Restrict to the two clustering features and discard any row missing either
# one. The explicit copy lets later cells add columns without touching `df`.
feature_columns = ['normalized_salary', 'formatted_work_type']
df_filtered = df.loc[:, feature_columns].dropna().copy()

In [4]:
# Feature preparation: standardise the numeric salary column and one-hot
# encode the categorical work-type column, each via its own named step.
salary_step = ('num', StandardScaler(), ['normalized_salary'])
work_type_step = ('cat', OneHotEncoder(), ['formatted_work_type'])
preprocessor = ColumnTransformer([salary_step, work_type_step])

In [7]:
# Fit the preprocessing steps and build the feature matrix in one pass.
X_processed = preprocessor.fit_transform(df_filtered)

# The transformer can hand back a sparse matrix (it exposes `toarray`);
# convert it to a dense array in that case, otherwise keep the result as-is.
densify = getattr(X_processed, "toarray", None)
if densify is not None:
    X_processed = densify()

In [8]:
# Partition the preprocessed feature matrix into three K-Means clusters and
# store the label of each row next to its features.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and emits a FutureWarning in 1.2/1.3), so leaving it
# unset makes the resulting labels depend on the installed library version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df_filtered['Cluster_KMeans'] = kmeans.fit_predict(X_processed)

In [9]:
# Density-based clustering on the same feature matrix.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_processed)
df_filtered['Cluster_DBSCAN'] = dbscan_labels

# DBSCAN labels noise points as -1; expose that as a 0/1 outlier indicator.
is_noise = df_filtered['Cluster_DBSCAN'].eq(-1)
df_filtered['Outlier_Flag'] = is_noise.astype(int)

In [10]:
# Fit a three-component Gaussian mixture (fit() returns the estimator itself).
gmm = GaussianMixture(n_components=3, random_state=42).fit(X_processed)

# Hard assignment: the component each row is assigned to.
gmm_labels = gmm.predict(X_processed)
df_filtered['Cluster_GMM'] = gmm_labels

# Soft assignment: keep only the largest per-row membership probability.
gmm_membership = gmm.predict_proba(X_processed)
df_filtered['Cluster_Prob_GMM'] = gmm_membership.max(axis=1)

In [11]:
# Plain-text preview of the first five rows, including the cluster columns.
print(df_filtered.head(n=5))


   normalized_salary formatted_work_type  Cluster_KMeans  Cluster_DBSCAN  \
0            38480.0           Full-time               0               0   
1            55000.0           Full-time               0               0   
2           157500.0           Full-time               0               0   
3            35360.0          Internship               2               1   
4           180000.0            Contract               2               2   

   Outlier_Flag  Cluster_GMM  Cluster_Prob_GMM  
0             0            0          0.999991  
1             0            0          0.999993  
2             0            0          0.999993  
3             0            2          1.000000  
4             0            2          1.000000  


In [12]:
# 1. Work on a copy of the original postings so `df` itself stays unchanged.
df_augmented = df.copy()

# 2. Attach each clustering result to the copy. Column assignment aligns on
#    the row index, so rows that were dropped before clustering (missing
#    salary or work type) receive NaN in these columns.
cluster_columns = [
    'Cluster_KMeans',
    'Cluster_DBSCAN',
    'Outlier_Flag',
    'Cluster_GMM',
    'Cluster_Prob_GMM',
]
for column in cluster_columns:
    df_augmented[column] = df_filtered[column]

# 3. Sanity-check the combined frame by printing its first ten rows.
print(df_augmented.head(10))

      job_id                                  company_name  \
0     921716                         Corcoran Sawyer Smith   
1   10998357                        The National Exemplar    
2   23221523                        Abrams Fensterman, LLP   
3   91700727                     Downtown Raleigh Alliance   
4  103254301                                    Raw Cereal   
5  266566927                            Revesco Properties   
6  266825034                            Recruitment Design   
7  280496925                   Washington State University   
8  445337908                           Food Bank of Alaska   
9  606178500  Jung & Vassar PC Attorneys At Law A Law Corp   

                                               title  max_salary pay_period  \
0                              Marketing Coordinator        20.0     HOURLY   
1                        Assitant Restaurant Manager     65000.0     YEARLY   
2  Senior Elder Law / Trusts and Estates Associat...    175000.0     YEARLY   
3