# Body Fat Prediction: Data Analysis and Model Building

## 1. Data Loading

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the body-fat dataset from a CSV file (relative path, resolved against
# the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="bodyfat.csv")

# Preview the top five rows of the table
df.head(n=5)
    

## 2. Data Analysis

In [None]:

# Print a structural summary of the frame (columns, non-null counts, dtypes)
# to check for missing values and unexpected column types before modelling
df.info()
    

In [None]:

# Descriptive statistics per column; displayed as the cell's output
df.describe()
    

## 3. Correlation Analysis

In [None]:

# Pairwise correlations between the columns of the dataset
correlation_matrix = df.corr()

# Render the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Feature Correlation Matrix")
plt.show()
    

In [None]:

# Take the BodyFat column of the correlation matrix and rank every column
# by its correlation with the target, largest value first
target_correlations = correlation_matrix["BodyFat"]
correlation_with_target = target_correlations.sort_values(ascending=False)
correlation_with_target
    

## 4. Feature Selection and Data Preparation

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictor columns used for modelling
selected_features = [
    "Abdomen",
    "Chest",
    "Hip",
    "Weight",
    "Thigh",
    "Knee",
    "Biceps",
    "Neck",
]

# Target vector and feature matrix
y = df["BodyFat"]
X = df[selected_features]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise with statistics learned from the training split only, then apply
# the same transform to both splits.
# NOTE(review): the pipelines in the following sections embed their own
# StandardScaler and are fitted on the unscaled X_train / X, so these scaled
# arrays are not consumed by the cells visible here.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
    

## 5. Model Training and Comparison

In [None]:

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Candidate regressors, keyed by the label shown in the comparison table
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
}

# Fit each candidate inside a scaler -> regressor pipeline on the training
# split and score it on the held-out test split
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("regressor", model)])

    # fit() returns the pipeline itself, so training and prediction can be chained
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    results[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R2": r2_score(y_test, y_pred),
    }

# One row per model, one column per metric
results_df = pd.DataFrame(results).T

# Show the comparison table as the cell output
results_df
    

## 6. Training the Final Model

In [None]:

import joblib

# Steps of the final model: standardisation followed by a random forest
final_steps = [
    ("scaler", StandardScaler()),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42)),
]

# Build the pipeline and fit it on the full dataset (X, y), not just the training split
final_pipeline = Pipeline(steps=final_steps).fit(X, y)

# Persist the fitted pipeline (scaler + regressor) to disk
joblib.dump(final_pipeline, "bodyfat_model.pkl")
    

## 7. Deployment Script for Predictions

In [None]:

# Source text of the standalone prediction script that is written to disk below.
# The outer literal is delimited with ''' because the embedded script contains
# a """docstring""": delimiting both with """ ends the literal at the docstring
# and turns this whole cell into a SyntaxError, so the script is never written.
prod_code = '''
import pandas as pd
import joblib

# Load model
model = joblib.load("bodyfat_model.pkl")

def predict_bodyfat(data):
    """Takes a DataFrame with input data and returns predictions."""
    return model.predict(data)

# Example usage
if __name__ == "__main__":
    sample_data = pd.DataFrame([[85, 95, 100, 180, 60, 38, 32, 37]],
                               columns=["Abdomen", "Chest", "Hip", "Weight", "Thigh", "Knee", "Biceps", "Neck"])
    prediction = predict_bodyfat(sample_data)
    print("Predicted body fat percentage:", prediction)
'''

# Save script next to the notebook; UTF-8 is set explicitly so the output does
# not depend on the platform's default encoding
with open("predict_bodyfat.py", "w", encoding="utf-8") as f:
    f.write(prod_code)
    

Now you have a trained model `bodyfat_model.pkl` and a script `predict_bodyfat.py` for making predictions! 🚀