# NLP Baseline Training – Work Quality Index (WQI)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import cohen_kappa_score, mean_absolute_error
import numpy as np
import joblib
import os

# Load the labelled sample set; the path is relative to the working directory.
path = os.path.join('..','data','sample_ckp_labeled.csv')
df = pd.read_csv(path)

# Bucket WQI into five ordinal classes. pd.cut intervals are right-closed, so
# the lower edge is -1 to make a score of exactly 0 fall into bucket 0:
# (-1,20] -> 0, (20,40] -> 1, (40,60] -> 2, (60,80] -> 3, (80,100] -> 4.
wqi_bucket = pd.cut(df['WQI'], bins=[-1,20,40,60,80,100], labels=[0,1,2,3,4])

# pd.cut yields NaN for missing or out-of-range scores, and NaN cannot be cast
# to int. Rows without a usable label cannot be trained on, so drop them
# (and say so) instead of crashing on the cast.
has_label = wqi_bucket.notna()
if not has_label.all():
    print('Dropping', int((~has_label).sum()), 'rows with missing or out-of-range WQI.')
df = df.loc[has_label].copy()
df['label_wqi_bucket'] = wqi_bucket[has_label].astype(int)

# Model input is the free-text column; missing text becomes an empty document.
X_text = df['uraian_teks'].fillna('')
y = df['label_wqi_bucket']

# Split the raw text BEFORE fitting the vectorizer. Fitting TF-IDF on the full
# corpus would let test documents influence the vocabulary and IDF weights
# (data leakage) and make the held-out score optimistic.
text_tr, text_te, y_tr, y_te = train_test_split(
    X_text, y, test_size=0.25, random_state=42, stratify=y
)

# Unigram + bigram TF-IDF features, learned from the training fold only.
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=1)
X_tr = tfidf.fit_transform(text_tr)
X_te = tfidf.transform(text_te)
# Full-corpus feature matrix, kept so any later cell that references X still works.
X = tfidf.transform(X_text)

clf = LogisticRegression(max_iter=200)
clf.fit(X_tr, y_tr)
pred = clf.predict(X_te)

# Quadratic weighting penalises predictions more the further they land from
# the true bucket, which suits ordinal labels.
qwk = cohen_kappa_score(y_te, pred, weights='quadratic')
print('Quadratic Weighted Kappa:', round(qwk, 3))
# Score reported for each predicted bucket: the midpoint of its 20-point
# range (10 for 0-20, 30 for 20-40, ..., 90 for 80-100).
bucket_to_score = {0:10,1:30,2:50,3:70,4:90}

# joblib.dump does not create parent directories; make sure the target exists
# so a fresh checkout does not fail with FileNotFoundError after training.
model_dir = os.path.join('..','backend','models')
os.makedirs(model_dir, exist_ok=True)

# Persist the vectorizer, classifier and score mapping together in one file.
joblib.dump(
    {'vectorizer': tfidf, 'model': clf, 'bucket_to_score': bucket_to_score},
    os.path.join(model_dir, 'nlp_wqi_baseline.joblib'),
)
print('Model saved.')
