In [3]:
!pip install pandas openpyxl scikit-learn -q

In [4]:
# Upload Dataset
from google.colab import files
import pandas as pd

print("üìÅ Upload file 'DataFinal-Kelompok 2.xlsx'")
uploaded = files.upload()

# Load data
df = pd.read_excel('DataFinal-Kelompok 2.xlsx')
print(f"\n‚úÖ Data loaded: {len(df)} rows, {len(df.columns)} columns")
print(f"Columns: {list(df.columns)}")
print(f"\nDistribusi KBK:\n{df['KBK'].value_counts()}")

üìÅ Upload file 'DataFinal-Kelompok 2.xlsx'


Saving DataFinal-Kelompok 2.xlsx to DataFinal-Kelompok 2 (2).xlsx

‚úÖ Data loaded: 160 rows, 4 columns
Columns: ['No', 'Program Studi', 'Judul TA Bersih', 'KBK']

Distribusi KBK:
KBK
Software                 40
Jaringan                 40
AI / Machine Learning    40
Animasi                  40
Name: count, dtype: int64


In [5]:
# Train-Test Split & Evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import re

def preprocess_text(text):
    """Preprocessing minimal untuk konsistensi"""
    text = text.lower()
    text = re.sub(r'na√£¬Øv\s*baiy?', 'naive bayes', text)
    text = re.sub(r'augment\s*realiti', 'augmented reality', text)
    text = re.sub(r'virtual\s*realiti', 'virtual reality', text)
    return text

X = df['Judul TA Bersih'].apply(preprocess_text)
y = df['KBK']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorization
vectorizer = TfidfVectorizer(
    max_features=500,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.8,
    sublinear_tf=True
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train model
model = MultinomialNB(alpha=0.1)
model.fit(X_train_vec, y_train)

# Evaluation
train_score = model.score(X_train_vec, y_train)
test_score = model.score(X_test_vec, y_test)

print("=" * 60)
print("üìä EVALUASI MODEL")
print("=" * 60)
print(f"Train Accuracy: {train_score*100:.2f}%")
print(f"Test Accuracy: {test_score*100:.2f}%")
print(f"Overfitting Gap: {(train_score-test_score)*100:.2f}%")

y_pred = model.predict(X_test_vec)
print(f"\nüìã Classification Report:\n")
print(classification_report(y_test, y_pred))

# Cross-validation
X_all_vec = vectorizer.fit_transform(X)
cv_scores = cross_val_score(model, X_all_vec, y, cv=5, scoring='accuracy')
print(f"\nüîÑ Cross-Validation (5-fold):")
print(f"Mean: {cv_scores.mean()*100:.2f}% (¬±{cv_scores.std()*100:.2f}%)")

üìä EVALUASI MODEL
Train Accuracy: 89.06%
Test Accuracy: 50.00%
Overfitting Gap: 39.06%

üìã Classification Report:

                       precision    recall  f1-score   support

AI / Machine Learning       0.80      0.50      0.62         8
              Animasi       0.40      0.50      0.44         8
             Jaringan       0.40      0.50      0.44         8
             Software       0.57      0.50      0.53         8

             accuracy                           0.50        32
            macro avg       0.54      0.50      0.51        32
         weighted avg       0.54      0.50      0.51        32


üîÑ Cross-Validation (5-fold):
Mean: 53.12% (¬±6.56%)


In [6]:
# Train Final Model (100% Data)
print("TRAINING FINAL MODEL (100% DATA)")

# Retrain dengan semua data
X_all_vec = vectorizer.fit_transform(X)
model_final = MultinomialNB(alpha=0.1)
model_final.fit(X_all_vec, y)

final_score = model_final.score(X_all_vec, y)
print(f"Final Model Accuracy: {final_score*100:.2f}%")


üöÄ TRAINING FINAL MODEL (100% DATA)
‚úÖ Final Model Accuracy: 88.75%


In [7]:
# Prediction Function with Keyword Boosting
def predict_kbk(judul):
    """Prediksi KBK dengan keyword boosting"""

    # Keywords per kategori
    keywords = {
        'AI / Machine Learning': ['naive bayes', 'machine learning', 'neural', 'prediksi',
                                   'klasifikasi', 'algoritma', 'knn', 'decision', 'clustering',
                                   'data mining', 'deep learning', 'ai', 'saw', 'ahp', 'smart',
                                   'topsis', 'spk', 'keputusan', 'rekomendasi'],
        'Jaringan': ['jaringan', 'network', 'server', 'mikrotik', 'router', 'firewall',
                     'monitoring', 'iot', 'sensor', 'esp', 'nodemcu', 'mqtt', 'wireless',
                     'wifi', 'keamanan jaringan'],
        'Animasi': ['augmented reality', 'virtual reality', 'ar', 'vr', '3d', 'animasi',
                    'visualisasi', 'ui ux', 'design', 'markerless', 'unity', 'blender',
                    'interaktif', 'media pembelajaran'],
        'Software': ['android', 'mobile', 'web', 'api', 'rest', 'cloud', 'aws', 'docker',
                     'laravel', 'react', 'flutter', 'codeigniter', 'framework', 'database',
                     'crud', 'aplikasi']
    }

    # Preprocess
    judul_clean = preprocess_text(judul)

    # Predict
    X_vec = vectorizer.transform([judul_clean])
    probabilities = model_final.predict_proba(X_vec)[0]

    # Keyword boosting
    keyword_scores = {cat: sum(1 for kw in kws if kw in judul_clean)
                      for cat, kws in keywords.items()}

    boosted_probs = []
    classes = model_final.classes_
    for i, cls in enumerate(classes):
        boost = 1 + (keyword_scores.get(cls, 0) * 0.15)
        boosted_probs.append(probabilities[i] * boost)

    # Normalize
    total = sum(boosted_probs)
    boosted_probs = [p / total for p in boosted_probs]

    # Result
    prediction = classes[np.argmax(boosted_probs)]
    prob_dict = {classes[i]: boosted_probs[i] for i in range(len(classes))}

    return prediction, prob_dict

In [8]:
# Test Predictions
print("TEST PREDIKSI")

test_cases = [
    "prediksi kelulusan mahasiswa menggunakan algoritma naive bayes",
    "monitoring jaringan dengan mikrotik dan firewall",
    "media pembelajaran augmented reality pengenalan komponen komputer",
    "aplikasi mobile android untuk sistem informasi"
]

for judul in test_cases:
    pred, probs = predict_kbk(judul)
    print(f"\nJudul: {judul}")
    print(f"Prediksi: {pred}")
    print("Probabilitas:")
    for cat, prob in sorted(probs.items(), key=lambda x: x[1], reverse=True):
        print(f"   {cat}: {prob*100:.1f}%")


üß™ TEST PREDIKSI

üìù Judul: prediksi kelulusan mahasiswa menggunakan algoritma naive bayes
‚úÖ Prediksi: AI / Machine Learning
üìä Probabilitas:
   AI / Machine Learning: 92.3%
   Software: 4.1%
   Animasi: 2.7%
   Jaringan: 0.9%

üìù Judul: monitoring jaringan dengan mikrotik dan firewall
‚úÖ Prediksi: Jaringan
üìä Probabilitas:
   Jaringan: 67.8%
   Animasi: 20.8%
   Software: 6.2%
   AI / Machine Learning: 5.2%

üìù Judul: media pembelajaran augmented reality pengenalan komponen komputer
‚úÖ Prediksi: Animasi
üìä Probabilitas:
   Animasi: 96.7%
   Software: 3.1%
   AI / Machine Learning: 0.1%
   Jaringan: 0.1%

üìù Judul: aplikasi mobile android untuk sistem informasi
‚úÖ Prediksi: Software
üìä Probabilitas:
   Software: 66.2%
   Animasi: 26.5%
   Jaringan: 3.9%
   AI / Machine Learning: 3.4%


In [9]:
# Save Model (Optional)
import pickle

# Save model dan vectorizer
with open('model.pkl', 'wb') as f:
    pickle.dump(model_final, f)
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print("\n Model dan vectorizer disimpan!")
print("Download file:")

# Download files
files.download('model.pkl')
files.download('vectorizer.pkl')

print("\n SELESAI! Model siap digunakan.")


‚úÖ Model dan vectorizer disimpan!
üì• Download file:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


üéâ SELESAI! Model siap digunakan.
