### 1. Load and Explore the Data

In [21]:
import pandas as pd

In [57]:
# Load the dataset
data = pd.read_csv('/content/Obesity.csv')

In [58]:
# Explore the dataset
print(data.head())

   Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0  Female  21.0    1.62    64.0                            yes   no   2.0   
1  Female  21.0    1.52    56.0                            yes   no   3.0   
2    Male  23.0    1.80    77.0                            yes   no   2.0   
3    Male  27.0    1.80    87.0                             no   no   3.0   
4    Male  22.0    1.78    89.8                             no   no   2.0   

   NCP       CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC  \
0  3.0  Sometimes    no   2.0   no  0.0  1.0          no   
1  3.0  Sometimes   yes   3.0  yes  3.0  0.0   Sometimes   
2  3.0  Sometimes    no   2.0   no  2.0  1.0  Frequently   
3  3.0  Sometimes    no   2.0   no  2.0  0.0  Frequently   
4  1.0  Sometimes    no   2.0   no  0.0  0.0   Sometimes   

                  MTRANS           NObeyesdad  
0  Public_Transportation        Normal_Weight  
1  Public_Transportation        Normal_Weight  
2  Public_Transportation        

### 2. Data Preprocessing

In [59]:
# Handle missing values if any
data = data.dropna()


In [60]:
# Encode categorical variables
from sklearn.preprocessing import LabelEncoder

In [26]:
label_encoder = LabelEncoder()
data['Gender'] = label_encoder.fit_transform(data['Gender'])
data['MTRANS'] = label_encoder.fit_transform(data['MTRANS'])


### 3. Define Features and Target Variable



In [27]:
X = data.drop(['Weight'], axis=1)
y = data['Weight']


### 4. Split the dataset into training and testing sets

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 5. Feature Scaling

In [30]:
# Identify non-numeric columns
non_numeric_columns = X_train.select_dtypes(include='object').columns

In [31]:
# Standardize only numeric columns
numeric_columns = X_train.columns.difference(non_numeric_columns)
scaler = StandardScaler()
X_train_scaled = X_train.copy()  # Create a copy to avoid modifying the original DataFrame

In [32]:
# Standardize only numeric columns
X_train_scaled[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])

In [33]:
# Apply the same transformation to the test set
X_test_scaled = X_test.copy()
X_test_scaled[numeric_columns] = scaler.transform(X_test[numeric_columns])

In [35]:
# Identify columns with string values in X_train
non_numeric_columns_train = X_train.select_dtypes(include='object').columns

# Identify columns with string values in X_test
non_numeric_columns_test = X_test.select_dtypes(include='object').columns

# Print the identified columns
print("Columns with string values in X_train:", non_numeric_columns_train)
print("Columns with string values in X_test:", non_numeric_columns_test)

Columns with string values in X_train: Index(['family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC',
       'CALC', 'NObeyesdad'],
      dtype='object')
Columns with string values in X_test: Index(['family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC',
       'CALC', 'NObeyesdad'],
      dtype='object')


### 6. Choose a Model and Train

In [39]:
# Print feature names in the training set
print("Feature names in the training set:", X_train_scaled.columns)

# Print feature names in the test set
print("Feature names in the test set:", X_test_scaled.columns)

Feature names in the training set: Index(['Gender', 'Age', 'Height', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'MTRANS', 'family_history_with_overweight_no',
       'family_history_with_overweight_yes', 'FAVC_no', 'FAVC_yes',
       'CAEC_Always', 'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no',
       'SMOKE_no', 'SMOKE_yes', 'SCC_no', 'SCC_yes', 'CALC_Always',
       'CALC_Frequently', 'CALC_Sometimes', 'CALC_no',
       'NObeyesdad_Insufficient_Weight', 'NObeyesdad_Normal_Weight',
       'NObeyesdad_Obesity_Type_I', 'NObeyesdad_Obesity_Type_II',
       'NObeyesdad_Obesity_Type_III', 'NObeyesdad_Overweight_Level_I',
       'NObeyesdad_Overweight_Level_II'],
      dtype='object')
Feature names in the test set: Index(['Gender', 'Age', 'Height', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'MTRANS', 'family_history_with_overweight_no',
       'family_history_with_overweight_yes', 'FAVC_no', 'FAVC_yes',
       'CAEC_Always', 'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no',
       'SMOKE_n

In [40]:
# Ensure consistent feature names
common_features = set(X_train_scaled.columns) & set(X_test_scaled.columns)

# Use only common features for predictions
X_test_scaled_common = X_test_scaled[common_features]

  X_test_scaled_common = X_test_scaled[common_features]


In [43]:
# Reorder columns in X_test_scaled_common to match the order in X_train_scaled
X_test_scaled_common = X_test_scaled_common[X_train_scaled.columns]

### 7. Make Predictions

In [52]:
# Fit the model
model.fit(X_train_scaled, y_train_encoded)

# Make predictions using the modified and reordered test set
y_pred = model.predict(X_test_scaled_common)

### 8. Evaluate the Model

In [54]:
from sklearn.metrics import mean_absolute_error

# Evaluate regression performance
mae = mean_absolute_error(y_test, y_pred)

# Print or use the MAE as needed
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 567.3924300813397
