### Step 1: Importing Packages

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

### Step 2: Data preparation

In [21]:
data ={
    'Height': [160,  170,  np.nan, 180], 
    'Weight': [55,   65,   70,     np.nan],
    'Gender': ['Male', 'Female', 'Female', 'Male']
}
df= pd.DataFrame(data)
print(df)

   Height  Weight  Gender
0   160.0    55.0    Male
1   170.0    65.0  Female
2     NaN    70.0  Female
3   180.0     NaN    Male


### Step 3: Handing Missing Values

In [41]:
df['Height'].fillna(df['Height'].mean(),inplace=True)
df['Weight'].fillna(df['Weight'].mean(),inplace=True)

print(df)

   Height     Weight  Gender
0   160.0  55.000000    Male
1   170.0  65.000000  Female
2   170.0  70.000000  Female
3   180.0  63.333333    Male


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Weight'].fillna(df['Weight'].mean(),inplace=True)


### Step 4: Encoding

In [48]:
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
print(df['Gender'])

0    1
1    0
2    0
3    1
Name: Gender, dtype: int32


### Step 5: Scaling

In [55]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled, columns=['Height','Weight','Gender'])
print(scaled_df)

   Height    Weight  Gender
0     0.0  0.000000     1.0
1     0.5  0.666667     0.0
2     0.5  1.000000     0.0
3     1.0  0.555556     1.0


## Example 2

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Create dataset
data = {'Hours_Studied': [2, 4, 6, 8, 10],
        'Final_Score': [50, 55, 60, 80, 90]}
df = pd.DataFrame(data)

# Step 2: Split data
X = df[['Hours_Studied']]
y = df['Final_Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Step 3: Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 4: Predict
y_pred = model.predict(X_test)

# Step 5: Evaluate

print("Actual:", y_test.values)
print("Predicted:", y_pred)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

Actual: [60]
Predicted: [68.75]
MAE: 8.75
MSE: 76.5625
R² Score: nan


