In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------------------------------------------
# Train, evaluate and persist an SVM classifier for 'Status Gizi'.
# Reads data.xlsx, writes confusion_matrix.png, svm_model.pkl and (for a
# linear kernel only) feature_importance.png.
# ---------------------------------------------------------------------------

# Load the data from the provided Excel file
data_path = "data.xlsx"
data_df = pd.read_excel(data_path)

# Encode 'Jenis Kelamin' as integers. LabelEncoder numbers the sorted unique
# values, so with the two expected values: Laki-laki = 0, Perempuan = 1
gender_encoder = LabelEncoder()
data_df['Jenis Kelamin'] = gender_encoder.fit_transform(data_df['Jenis Kelamin'])

# Encode 'Status Gizi' as the target variable for multi-class classification
status_encoder = LabelEncoder()
data_df['Status Gizi'] = status_encoder.fit_transform(data_df['Status Gizi'])

# Store the original labels for later use (position i names encoded class i)
original_labels = status_encoder.classes_

# Filter the relevant columns for features and labels
X = data_df[['Jenis Kelamin', 'Berat Badan Saat Lahir (kg)', 'Tinggi Badan Saat Lahir (cm)',
             'Berat Badan Saat Ini (kg)', 'Tinggi Badan Saat Ini (cm)', 'Usia (bulan)',
             'Z-Score Berat Badan', 'Z-Score Tinggi Badan']]
y = data_df['Status Gizi']

# Split the data into training and testing sets (stratified so both sets keep
# the class proportions of the full data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline that scales the features and then applies SVM.
# The scaler lives inside the pipeline, so it is re-fitted on the training
# folds only during cross-validation and is saved together with the model.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=42))
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': ['scale', 'auto', 0.1, 1],
    'svm__kernel': ['linear', 'rbf', 'poly']
}

# Use StratifiedKFold with n_splits=3 (or any number <= the smallest class size)
cv_strategy = StratifiedKFold(n_splits=3)

# Perform GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=cv_strategy, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model (the pipeline refitted on the whole training set)
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Report on every class known to the encoder, even if a class does not appear
# in the test set or in the predictions. The label list and the name list must
# have the same length and order, otherwise rows of the report and the axes of
# the confusion matrix end up with the wrong class names.
unique_labels = np.arange(len(original_labels))
actual_target_names = [str(label) for label in original_labels]

# Build the classification report; zero_division=0 reports 0 (without a
# warning) for classes that have no predicted or no true samples
classification_rep = classification_report(
    y_test,
    y_pred,
    labels=unique_labels,
    target_names=actual_target_names,
    zero_division=0,
)

# Confusion Matrix (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred, labels=unique_labels)

# Print results
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
print(f"\nTest accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_rep)
print("\nConfusion Matrix:")
print(conf_matrix)

# Plot the confusion matrix and write it to disk
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=actual_target_names, yticklabels=actual_target_names)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.close()

# Save the best model and encoders to a file using pickle
model_path = "svm_model.pkl"
with open(model_path, 'wb') as model_file:
    pickle.dump({
        'model': best_model,
        'gender_encoder': gender_encoder,
        'status_encoder': status_encoder
    }, model_file)

print(f"\nBest model and encoders have been saved to {model_path}")

# Feature importance (for SVM, we'll use the absolute values of the coefficients for linear kernel)
svm_step = best_model.named_steps['svm']
if svm_step.kernel == 'linear':
    # For multi-class problems coef_ has one row per pair of classes
    # (one-vs-one), so average the absolute weights over all rows instead of
    # reading only the first pair. With two classes there is a single row and
    # the result equals that row. The weights refer to the standardized features.
    feature_importance = np.abs(svm_step.coef_).mean(axis=0)
    feature_names = X.columns
    feature_importance_dict = dict(zip(feature_names, feature_importance))
    sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

    print("\nFeature Importance:")
    for feature, importance in sorted_features:
        print(f"{feature}: {importance:.4f}")

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.bar(feature_names, feature_importance)
    plt.title('Feature Importance')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
else:
    print("\nFeature importance is not available for non-linear kernels.")

print("\nTraining and evaluation completed. Check the output files for detailed results.")


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from IPython.display import display
import pickle

# Define function to calculate Z-score
def calculate_z_score(value, mean, std_dev):
    """Return the number of standard deviations ``value`` lies from ``mean``.

    Args:
        value: Observed measurement.
        mean: Reference mean for the measurement.
        std_dev: Reference standard deviation for the measurement.

    Returns:
        ``(value - mean) / std_dev``.
    """
    deviation = value - mean
    return deviation / std_dev

# Function to classify Z-score for height
def classify_z_score_height(z_score):
    """Map a height Z-score to its category name.

    Bands (checked from lowest to highest):
        z < -3        -> "Sangat Pendek"
        -3 <= z < -2  -> "Pendek"
        -2 <= z <= 2  -> "Normal"
        otherwise     -> "Tinggi"
    """
    # Each guard only runs when all lower bands have already been ruled out,
    # so testing the upper bound alone is enough.
    if z_score < -3:
        return "Sangat Pendek"
    if z_score < -2:
        return "Pendek"
    if z_score <= 2:
        return "Normal"
    return "Tinggi"

# Function to classify Z-score for weight
def classify_z_score_weight(z_score):
    """Map a weight Z-score to its category name.

    Bands (checked from lowest to highest):
        z < -3        -> "Gizi Buruk"
        -3 <= z < -2  -> "Gizi Kurang"
        -2 <= z <= 1  -> "Gizi Baik"
        1 < z <= 2    -> "Berpotensi Berlebihan"
        2 < z <= 3    -> "Gizi Lebih"
        otherwise     -> "Obesitas"
    """
    # Each guard only runs when all lower bands have already been ruled out,
    # so testing the upper bound alone is enough.
    if z_score < -3:
        return "Gizi Buruk"
    if z_score < -2:
        return "Gizi Kurang"
    if z_score <= 1:
        return "Gizi Baik"
    if z_score <= 2:
        return "Berpotensi Berlebihan"
    if z_score <= 3:
        return "Gizi Lebih"
    return "Obesitas"

# Function to generate dummy data with the specified columns
def generate_modified_dummy_data(num_samples, seed=0):
    """Generate a synthetic table of child growth measurements.

    Ages are drawn uniformly from 0-59 months; current weight and height are
    drawn from normal distributions whose mean and standard deviation are
    looked up by age in the tables below. Z-scores and their categories are
    derived from those same tables.

    Args:
        num_samples: Number of rows to generate.
        seed: Seed for the random generator. The default (0) reproduces the
            values this function has always produced. The generator is local
            to the call, so NumPy's global random state is left untouched.

    Returns:
        pandas.DataFrame with the columns "Nama", "Jenis Kelamin",
        "Berat Badan Saat Lahir (kg)", "Tinggi Badan Saat Lahir (cm)",
        "Berat Badan Saat Ini (kg)", "Tinggi Badan Saat Ini (cm)",
        "Usia (bulan)", "Z-Score Berat Badan", "Z-Score Tinggi Badan",
        "Klasifikasi Z score-TB" and "Klasifikasi Z score-BB".
        No 'Status Gizi' column is produced.
    """
    # Private generator: same number stream as np.random.seed(seed) followed by
    # the module-level np.random functions, without reseeding the global state.
    rng = np.random.RandomState(seed)

    # Generate age data (months, 0-59) and the per-age reference tables
    ages = rng.randint(0, 60, size=num_samples)
    mean_height = [49.9, 54.7, 58.4, 61.4, 63.9, 65.9, 67.6, 68.6, 69.6, 70.6, 71.6, 72.6, 73.6, 74.6, 75.6, 76.6, 77.6, 78.6, 79.6, 80.6, 81.6, 82.6, 83.6, 84.6, 85.6, 86.6, 87.6, 88.6, 89.6, 90.6, 91.6, 92.6, 93.6, 94.6, 95.6, 96.6, 97.6, 98.6, 99.6, 100.6, 101.6, 102.6, 103.6, 104.6, 105.6, 106.6, 107.6, 108.6, 109.6, 110.6, 111.6, 112.6, 113.6, 114.6, 115.6, 116.6, 117.6, 118.6, 119.6, 120.6, 121.6, 122.6]
    std_dev_height = [3.8, 4.4, 4.9, 5.3, 5.6, 5.9, 6.2, 6.4, 6.7, 6.9, 7.1, 7.3, 7.5, 7.7, 7.9, 8.1, 8.3, 8.5, 8.7, 8.9, 9.1, 9.3, 9.5, 9.7, 9.9, 10.1, 10.3, 10.5, 10.7, 10.9, 11.1, 11.3, 11.5, 11.7, 11.9, 12.1, 12.3, 12.5, 12.7, 12.9, 13.1, 13.3, 13.5, 13.7, 13.9, 14.1, 14.3, 14.5, 14.7, 14.9, 15.1, 15.3, 15.5, 15.7, 15.9, 16.1, 16.3, 16.5, 16.7, 16.9, 17.1, 17.3]

    mean_weight = [3.3, 4.5, 5.6, 6.4, 7.0, 7.5, 7.9, 8.3, 8.6, 8.9, 9.2, 9.4, 9.6, 9.9, 10.1, 10.3, 10.5, 10.7, 10.9, 11.1, 11.3, 11.5, 11.7, 11.9, 12.2, 12.4, 12.5, 12.7, 12.9, 13.1, 13.3, 13.5, 13.7, 13.9, 14.1, 14.3, 14.5, 14.7, 14.9, 15.1, 15.3, 15.5, 15.7, 15.9, 16.1, 16.3, 16.5, 16.7, 16.9, 17.1, 17.3, 17.5, 17.7, 17.9, 18.1, 18.3, 18.5, 18.7, 18.9, 19.1, 19.3, 19.5]
    std_dev_weight = [0.5, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7]

    # Generate weight data with variations for underweight. The draws are made
    # one sample at a time (uniform, then normal) so the random stream keeps
    # its original order.
    weights = []
    for age in ages:
        # rand() > 0.43 holds about 57% of the time: those samples are drawn
        # around a mean shifted down by 1.5 standard deviations
        if rng.rand() > 0.43:
            weight = rng.normal(mean_weight[age] - 1.5 * std_dev_weight[age], std_dev_weight[age])
        else:
            weight = rng.normal(mean_weight[age], std_dev_weight[age])
        weights.append(weight)

    # Generate height data
    heights = [rng.normal(mean_height[age], std_dev_height[age]) for age in ages]

    # Calculate Z-scores for height and weight against the same per-age tables
    z_scores_height = [calculate_z_score(height, mean_height[age], std_dev_height[age]) for height, age in zip(heights, ages)]
    z_scores_weight = [calculate_z_score(weight, mean_weight[age], std_dev_weight[age]) for weight, age in zip(weights, ages)]

    # Generate gender data
    genders = rng.choice(['Laki-laki', 'Perempuan'], num_samples)

    # Generate birth weight and height data
    birth_weights = rng.normal(3.5, 0.5, num_samples)
    birth_heights = rng.normal(50, 2, num_samples)

    # Create a DataFrame
    data = {
        "Nama": [f'Child_{i+1}' for i in range(num_samples)],
        "Jenis Kelamin": genders,
        "Berat Badan Saat Lahir (kg)": birth_weights,
        "Tinggi Badan Saat Lahir (cm)": birth_heights,
        "Berat Badan Saat Ini (kg)": weights,
        "Tinggi Badan Saat Ini (cm)": heights,
        "Usia (bulan)": ages,
        "Z-Score Berat Badan": z_scores_weight,
        "Z-Score Tinggi Badan": z_scores_height,
        "Klasifikasi Z score-TB": [classify_z_score_height(z) for z in z_scores_height],
        "Klasifikasi Z score-BB": [classify_z_score_weight(z) for z in z_scores_weight]
    }

    return pd.DataFrame(data)

# Function to validate predictions
def validate_predictions(predictions, valid_classes):
    """Return a copy of ``predictions`` in which invalid entries are replaced.

    An entry is invalid when it is not a member of ``valid_classes``. Each
    invalid entry is replaced by its own randomly drawn valid class, and a
    warning with the number of replacements is printed.

    Args:
        predictions: Array-like of predicted class ids (array or list).
        valid_classes: Array-like of the class ids that are allowed.

    Returns:
        numpy.ndarray with the same length as ``predictions``. The input is
        never modified.
    """
    # Copy first: the caller's array must not change behind its back, and this
    # also lets plain Python lists be passed in.
    checked = np.array(predictions, copy=True)
    invalid_mask = ~np.isin(checked, valid_classes)
    invalid_count = int(invalid_mask.sum())
    if invalid_count > 0:
        print(f"Warning: Found {invalid_count} invalid predictions.")
        # One independent draw per invalid entry rather than a single value
        # broadcast to all of them
        checked[invalid_mask] = np.random.choice(valid_classes, size=invalid_count)
    return checked

# ---------------------------------------------------------------------------
# Predict 'Status Gizi' for generated dummy data with the saved model and
# write the result to prediksi_status_gizi.xlsx.
# ---------------------------------------------------------------------------

# Load the saved pipeline and encoders written by the training cell.
# The pickle holds a dict with the keys 'model', 'gender_encoder' and
# 'status_encoder'. The model is a pipeline that already contains its fitted
# StandardScaler, so no separate scaler file is needed (and scaling the data
# again before predict() would scale it twice).
# NOTE(security): pickle.load can execute arbitrary code - only load files
# that were produced by this notebook.
model_path = "svm_model.pkl"
with open(model_path, 'rb') as model_file:
    saved_data = pickle.load(model_file)  # Load the dictionary
model = saved_data['model']  # Access the model from the dictionary
loaded_gender_encoder = saved_data['gender_encoder']
loaded_status_encoder = saved_data['status_encoder']

# Define valid classes: the integer codes the status encoder was fitted with
valid_classes = np.arange(len(loaded_status_encoder.classes_))

# Generate the testing data
test_data_df = generate_modified_dummy_data(200)  # Generate 200 samples for testing

# Display the testing data
display(test_data_df)

# Encode 'Jenis Kelamin' with the encoder fitted on the training data so the
# codes mean the same thing as during training. transform() raises a
# ValueError if a value was never seen in training.
test_data_df['Jenis Kelamin'] = loaded_gender_encoder.transform(test_data_df['Jenis Kelamin'])

# Select the same feature columns, in the same order, as the training data.
# Kept as a DataFrame so the column names match what the pipeline was fitted on.
test_X = test_data_df[['Jenis Kelamin', 'Berat Badan Saat Lahir (kg)',
                       'Tinggi Badan Saat Lahir (cm)', 'Berat Badan Saat Ini (kg)',
                       'Tinggi Badan Saat Ini (cm)', 'Usia (bulan)',
                       'Z-Score Berat Badan', 'Z-Score Tinggi Badan']]

# Make predictions on the testing data (the pipeline scales internally)
test_predictions = model.predict(test_X)

# Convert predictions to integer type
test_predictions = test_predictions.astype(int)

# Validate predictions to ensure they only contain valid classes
test_predictions = validate_predictions(test_predictions, valid_classes)

# Map predictions back to class labels with the encoder used for training.
# LabelEncoder orders its classes by sorted value, so a hand-written list of
# names would not necessarily line up with the encoded integers.
test_data_df['Prediksi Status Gizi'] = loaded_status_encoder.inverse_transform(test_predictions)

# Display the prediction results
display(test_data_df)

# Save the predictions to a new Excel file
test_data_df.to_excel('prediksi_status_gizi.xlsx', index=False)


In [None]:
import plotly.express as px
import pandas as pd

# ---------------------------------------------------------------------------
# Percentage of each predicted nutritional status within every age group.
# Runs after the test-set prediction: `y_pred` holds the predictions and
# `y_test` the true labels for the rows of `X_test`.
# ---------------------------------------------------------------------------

# Combine the predictions with the original test features
test_results_df = X_test.copy()
test_results_df['Actual Status Gizi'] = status_encoder.inverse_transform(y_test)
test_results_df['Predicted Status Gizi'] = status_encoder.inverse_transform(y_pred)

# Define the age ranges (months)
age_bins = [0, 12, 24, 36, 48, 60]
age_labels = ['0-12 bulan', '13-24 bulan', '25-36 bulan', '37-48 bulan', '49-60 bulan']

# Add the age-group column. Right-closed bins with include_lowest give
# [0, 12], (12, 24], (24, 36], (36, 48], (48, 60], which is what the labels
# say: 12 months belongs to '0-12 bulan' and 60 months to '49-60 bulan'.
test_results_df['Age Group'] = pd.cut(
    test_results_df['Usia (bulan)'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)

# Count rows per (age group, predicted status). observed=True keeps only the
# age groups that actually occur, so no empty group produces a 0/0 percentage.
status_gizi_distribution = (
    test_results_df
    .groupby(['Age Group', 'Predicted Status Gizi'], observed=True)
    .size()
    .reset_index(name='count')
)

# Convert the counts to a percentage of each age group's total
status_gizi_distribution['percentage'] = (
    status_gizi_distribution
    .groupby('Age Group', observed=True)['count']
    .transform(lambda counts: counts / counts.sum() * 100)
)

# Build the chart with Plotly; category_orders keeps the age groups in
# ascending order on the x axis
fig = px.bar(status_gizi_distribution,
             x='Age Group',
             y='percentage',
             color='Predicted Status Gizi',
             barmode='group',
             category_orders={'Age Group': age_labels},
             labels={'percentage': 'Percentage', 'Age Group': 'Age Group (Months)', 'Predicted Status Gizi': 'Nutritional Status'},
             title='Percentage Distribution of Predicted Nutritional Status by Age Group')

# Show the chart
fig.show()


In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
import pandas as pd

# ---------------------------------------------------------------------------
# Nutritional-status counts split by gender for the training labels, the test
# labels and the test predictions, stacked as three subplots.
# ---------------------------------------------------------------------------

# Prepare the data for visualisation (decode the integer codes back to names)
train_dist = pd.DataFrame({
    'Status Gizi': status_encoder.inverse_transform(y_train),
    'Jenis Kelamin': gender_encoder.inverse_transform(X_train['Jenis Kelamin'])
})

test_dist = pd.DataFrame({
    'Status Gizi': status_encoder.inverse_transform(y_test),
    'Jenis Kelamin': gender_encoder.inverse_transform(X_test['Jenis Kelamin'])
})

pred_dist = pd.DataFrame({
    'Predicted Status Gizi': status_encoder.inverse_transform(y_pred),
    'Jenis Kelamin': gender_encoder.inverse_transform(X_test['Jenis Kelamin'])
})

# Create the subplots
fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=("Training Data Distribution by Gender", "Test Data Distribution by Gender", "Predicted Data Distribution by Gender"),
    shared_xaxes=True
)

# One grouped histogram per data set, all built with the same settings
gender_colors = {'Laki-laki': 'blue', 'Perempuan': 'pink'}
train_fig, test_fig, pred_fig = [
    px.histogram(frame,
                 x=status_column,
                 color='Jenis Kelamin',
                 barmode='group',
                 color_discrete_map=gender_colors)
    for frame, status_column in (
        (train_dist, 'Status Gizi'),
        (test_dist, 'Status Gizi'),
        (pred_dist, 'Predicted Status Gizi'),
    )
]

# Add the traces to the subplots. Every histogram carries one trace per
# gender, so show each gender in the legend only the first time it appears
# instead of repeating it for every subplot.
legend_names_shown = set()
for row_number, panel_fig in enumerate((train_fig, test_fig, pred_fig), start=1):
    for trace in panel_fig['data']:
        trace.showlegend = trace.name not in legend_names_shown
        legend_names_shown.add(trace.name)
        fig.add_trace(trace, row=row_number, col=1)

# Update layout. barmode is a layout setting, so it is set again here because
# only the traces (not the layout) were taken from the px figures.
fig.update_layout(height=900, width=700, title_text="Distribution of Nutritional Status by Gender",
                  barmode='group')

# Axis titles: the x axis is shared, so label it once on the bottom subplot;
# label the y axis of every subplot
fig.update_xaxes(title_text="Status Gizi", row=3, col=1)
fig.update_yaxes(title_text="Count")

# Show the chart
fig.show()


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# ---------------------------------------------------------------------------
# 2-D t-SNE projection of all samples, coloured by nutritional status.
# ---------------------------------------------------------------------------

# Use all features for t-SNE. Dedicated names are used here so the `X` and
# `y` variables of the training cell are not overwritten.
tsne_features = data_df[['Jenis Kelamin', 'Berat Badan Saat Lahir (kg)', 'Tinggi Badan Saat Lahir (cm)',
                         'Berat Badan Saat Ini (kg)', 'Tinggi Badan Saat Ini (cm)', 'Usia (bulan)',
                         'Z-Score Berat Badan', 'Z-Score Tinggi Badan']].values

tsne_labels = data_df['Status Gizi']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(tsne_features)

# Apply t-SNE. The perplexity must be smaller than the number of samples;
# 30 is the library default and is only reduced for very small data sets.
tsne_perplexity = min(30.0, max(1.0, len(X_scaled) - 1))
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Build the DataFrame for visualisation
tsne_df = pd.DataFrame(data=X_tsne, columns=['t-SNE 1', 't-SNE 2'])

# Apply the inverse transform to get the original label names back
tsne_df['Status Gizi'] = status_encoder.inverse_transform(tsne_labels)

# Visualise with Plotly (the legend title defaults to the colour column name)
fig = px.scatter(tsne_df, x='t-SNE 1', y='t-SNE 2', color='Status Gizi',
                 title='t-SNE Visualization of Nutritional Status')

fig.show()
