In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the data
data = pd.read_csv("./csv_result-bodyfat.csv")  # Replace with the actual path to your CSV file

# Check the data to ensure it's loaded correctly
print(data.head())  # Display the first few rows of the dataset
print(data.info())  # Display information about the dataset

# Data Preprocessing
if "class" in data.columns and "id" in data.columns:
    X = data.drop(columns=["class", "id"])  # Features (remove "class" and "id" columns)
    y = data["class"]  # Target variable

    # Check class distribution
    print("Class distribution:")
    print(y.value_counts())

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize the feature data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Model Building
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Model Evaluation
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)

    # Data Visualization
    plt.figure(figsize=(12, 8))
    sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
    plt.title("Correlation Heatmap")
    plt.show()

    # Predict using the model (example)
    sample_data = np.array([1.0708, 23, 154.25, 67.75, 36.2, 93.1, 85.2, 94.5, 59, 37.3, 21.9, 32, 27.4, 17.1, 12.3]).reshape(1, -1)
    sample_data = scaler.transform(sample_data)
    prediction = model.predict(sample_data)
    print(f"Predicted class: {prediction[0]}")
else:
    print("The 'class' and 'id' columns are not present in the dataset. Please check the data file.")


   id  Density  Age  Weight  Height  Neck  Chest  Abdomen    Hip  Thigh  Knee  \
0   1   1.0708   23  154.25   67.75  36.2   93.1     85.2   94.5   59.0  37.3   
1   2   1.0853   22  173.25   72.25  38.5   93.6     83.0   98.7   58.7  37.3   
2   3   1.0414   22  154.00   66.25  34.0   95.8     87.9   99.2   59.6  38.9   
3   4   1.0751   26  184.75   72.25  37.4  101.8     86.4  101.2   60.1  37.3   
4   5   1.0340   24  184.25   71.25  34.4   97.3    100.0  101.9   63.2  42.2   

   Ankle  Biceps  Forearm  Wrist  class  
0   21.9    32.0     27.4   17.1   12.3  
1   23.4    30.5     28.9   18.2    6.1  
2   24.0    28.8     25.2   16.6   25.3  
3   22.8    32.4     29.4   18.2   10.4  
4   24.0    32.2     27.7   17.7   28.7  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       252 non-null    int64  
 1   Density  252 non-null    float64
 2  

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.