In [1]:
import logging

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

# Configure logging: write records at INFO level and above to a file,
# each line prefixed with a timestamp.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
    filename='svm_training.log',
)

# Read the raw spreadsheet (path is resolved relative to the working directory)
df = pd.read_excel("data_dummy_BLT_DD.xlsx")

# Columns whose values are replaced by integer codes below
categorical_columns = [
    'Jenis Pekerjaan',
    'Riwayat Penyakit Kronis',
    'Tingkat Pendidikan',
    'Status Kepemilikan Rumah',
    'Kehilangan Mata Pencaharian',
    'Anggota Keluarga Rentan',
    'Tidak Menerima Bantuan Sosial',
    'Rumah Tangga Lansia Tunggal',
    'Perempuan Kepala Keluarga',
]

# One LabelEncoder per column, kept in a dict keyed by column name so the
# fitted encoders stay available after the columns have been overwritten.
label_encoders = {column_name: LabelEncoder() for column_name in categorical_columns}
for column_name, encoder in label_encoders.items():
    df[column_name] = encoder.fit_transform(df[column_name])

# Prepare features and target.
# The target column must be excluded from the feature matrix: if it stays in X
# the classifier can read the label directly (target leakage) and the reported
# train/test accuracy no longer measures anything.
target_column = 'Perempuan Kepala Keluarga'
identifier_columns = ['Nama Keluarga', 'Alamat', 'Kabupaten', 'Kecamatan']

X = df.drop(columns=identifier_columns + [target_column])
y = df[target_column]

# Split data into training and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features: the scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a linear-kernel support vector classifier (verbose solver output enabled)
model = SVC(kernel='linear', verbose=True).fit(X_train, y_train)

# Write the hyper-parameters and the evaluation metrics to the log
logging.info(f"Model training complete with parameters: {model.get_params()}")

train_accuracy = accuracy_score(y_train, model.predict(X_train))
logging.info(f"Training accuracy: {train_accuracy}")

# Predict once on the held-out split and reuse the result for both metrics
test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
logging.info(f"Test accuracy: {test_accuracy}")

test_report = classification_report(y_test, test_predictions)
logging.info(f"Classification report:\n {test_report}")

# Persist the fitted classifier and the fitted scaler, one file each
for artifact, destination in ((model, 'svm_model.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, destination)

# Visualize a linear SVM decision boundary in two dimensions.
# A separate classifier (model_2d) is fitted on the first two columns of the
# standardized training matrix only, so the figure illustrates that 2-D model,
# not the full-dimensional `model`.
# Relies on numpy being imported as `np` in the notebook's import block.
X_train_2d = X_train[:, :2]
model_2d = SVC(kernel='linear')
model_2d.fit(X_train_2d, y_train)

fig, ax = plt.subplots(figsize=(10, 6))

# Plot the training points, coloured by their class label
ax.scatter(X_train_2d[:, 0], X_train_2d[:, 1], c=y_train, cmap='winter', s=50)

# Evaluate the decision function on a 30x30 grid spanning the axes limits
# that the scatter plot just established.
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
# NOTE(review): the reshape assumes decision_function returns one score per
# grid point, i.e. a binary target — confirm against the data.
Z = model_2d.decision_function(xy).reshape(XX.shape)

# Solid contour at 0 is the decision boundary; dashed contours at -1 and +1
# are the margins.
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])
# Circle the support vectors of the 2-D model
ax.scatter(model_2d.support_vectors_[:, 0], model_2d.support_vectors_[:, 1], s=100, linewidth=1, facecolors='none', edgecolors='k')

ax.set_title('SVM Decision Boundary with Support Vectors')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
fig.savefig('svm_decision_boundary.png')
plt.show()

# Locations of the artifacts written by this notebook: the training log,
# the pickled model, the pickled scaler and the saved figure.
(
    log_file_path,
    model_file_path,
    scaler_file_path,
    visualization_file_path,
) = (
    'svm_training.log',
    'svm_model.pkl',
    'scaler.pkl',
    'svm_decision_boundary.png',
)

# Bare trailing expression so the notebook echoes the four paths as cell output
(log_file_path, model_file_path, scaler_file_path, visualization_file_path)


FileNotFoundError: [Errno 2] No such file or directory: 'data_dummy_BLT_DD.xlsx'