In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")


Link to Dataset: https://www.kaggle.com/datasets/vipullrathod/fish-market

In [95]:
# Load the Fish Market dataset from the working directory.
data = pd.read_csv("Fish.csv")

# Analysis And Preprocessing

In [96]:
# Preview the first rows to see the column layout.
data.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [97]:
# (rows, columns) of the raw dataset.
data.shape

(159, 7)

In [98]:
# Column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


#### Step 1: Checking Missing Values.

In [99]:
# Count missing entries per column.
data.isna().sum()

Species    0
Weight     0
Length1    0
Length2    0
Length3    0
Height     0
Width      0
dtype: int64

#### Step 2: Handling Missing Values.

The dataset contains no missing values, so no handling is needed.

#### Step 3: Preparing for Categorical Encoding

In [100]:
# Partition the column names by dtype: numeric (int64/float64) vs. text (object).
numericals = list(data.select_dtypes(include=['int64', 'float64']).columns)
categoricals = list(data.select_dtypes(include=['object']).columns)

In [101]:
# Report both column groups, categorical first.
for label, column_names in (("Categorical Columns:", categoricals),
                            ("Numerical Columns:", numericals)):
    print(label, column_names)

Categorical Columns: ['Species']
Numerical Columns: ['Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width']


In [102]:
# Keep only the rows whose Weight and Height are both strictly positive.
has_positive_measurements = (data['Weight'] > 0) & (data['Height'] > 0)
data = data[has_positive_measurements]

In [103]:
# Shape after filtering out non-positive Weight/Height rows.
data.shape

(158, 7)

In [104]:
# Column names available for modelling.
data.columns

Index(['Species', 'Weight', 'Length1', 'Length2', 'Length3', 'Height',
       'Width'],
      dtype='object')

#### Step 4: Handling Outliers

In [105]:
# Names of the float64/int64 columns of `data`.
numerical = data.select_dtypes(include=['float64', 'int64']).columns
def outliers(data):
    """Flag IQR outliers in every numeric column of ``data``.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` for its column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan. Non-numeric columns are ignored.

    Returns
    -------
    dict
        Maps each numeric column name to the list of index labels of its
        outlying rows (an empty list when the column has none).
    """
    # Derive the numeric columns from the frame that was passed in, so the
    # result does not depend on a module-level variable built from another frame.
    numeric_columns = data.select_dtypes(include="number").columns

    flagged = {}
    for column in numeric_columns:
        q1 = data[column].quantile(0.25)
        q3 = data[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        is_outlier = (data[column] < lower_bound) | (data[column] > upper_bound)
        flagged[column] = data.index[is_outlier].tolist()

    return flagged

In [106]:
# Column name -> index labels of the rows flagged as outliers in that column.
outliers_dict = outliers(data)

In [107]:
# Columns for which at least one outlying row was flagged.
columns_with_outliers = [name for name in outliers_dict if outliers_dict[name]]

if columns_with_outliers:
    print("Outliers exists in: ", columns_with_outliers)
else:
    print("Data have no outliers.")

Outliers exists in:  ['Weight', 'Length1', 'Length2', 'Length3']


In [108]:
# Show which rows were flagged for 'Weight' and the values they hold.
outliers_indices = outliers_dict.get('Weight', [])
index_text = ', '.join(str(label) for label in outliers_indices)
print(f"Indices of outliers in the 'Weight' column: {index_text}")

outliers_weight_values = data.loc[outliers_indices, 'Weight']
if len(outliers_weight_values) > 0:
    print(f"Values of outliers in the 'Weight' column:\n{outliers_weight_values}")

Indices of outliers in the 'Weight' column: 142, 143, 144
Values of outliers in the 'Weight' column:
142    1600.0
143    1550.0
144    1650.0
Name: Weight, dtype: float64


In [109]:
# Same report as for 'Weight', repeated for each of the three length columns.
length_columns = ['Length1', 'Length2', 'Length3']
for column in length_columns:
    outliers_indices = outliers_dict.get(column, [])
    index_text = ', '.join(str(label) for label in outliers_indices)
    print(f"Indices of outliers in the '{column}' column: {index_text}")

    outliers_length_values = data.loc[outliers_indices, column]
    if len(outliers_length_values) > 0:
        print(f"Values of outliers in the '{column}' column:\n{outliers_length_values}")

Indices of outliers in the 'Length1' column: 142, 143, 144
Values of outliers in the 'Length1' column:
142    56.0
143    56.0
144    59.0
Name: Length1, dtype: float64
Indices of outliers in the 'Length2' column: 142, 143, 144
Values of outliers in the 'Length2' column:
142    60.0
143    60.0
144    63.4
Name: Length2, dtype: float64
Indices of outliers in the 'Length3' column: 144
Values of outliers in the 'Length3' column:
144    68.0
Name: Length3, dtype: float64


In [110]:
# Remove the rows reported above as IQR outliers in Weight / Length1 / Length2 / Length3.
# errors="ignore" keeps this cell re-runnable: without it, a second execution raises
# KeyError because the labels have already been dropped.
data = data.drop(index=[142, 143, 144], errors="ignore")

# Applying Model

In [111]:
# Predictors are the five size measurements; the target is the fish weight.
feature_columns = ['Length1', 'Length2', 'Length3', 'Height', 'Width']
X = data[feature_columns]
y = data['Weight']

In [112]:
# Standardize the predictors but keep them in a DataFrame. fit_transform returns a
# bare NumPy array, which would lose the column names and the row index; the
# feature-selection code further down reads X.columns and needs X to share y's index.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
# NOTE(review): the scaler is fit on every row before the train/test split that
# follows, so test-set statistics leak into the scaling. Consider fitting it on the
# training portion only and calling transform() on the test portion.

In [113]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [118]:
def adjusted_r2(r2, n, p):
    """Return the adjusted R² for a model with ``p`` predictors and ``n`` samples.

    Computed as ``1 - (1 - r2) * (n - 1) / (n - p - 1)``.

    Parameters
    ----------
    r2 : float
        Ordinary coefficient of determination.
    n : int
        Number of samples the score was computed on.
    p : int
        Number of predictors in the model.

    Returns
    -------
    float
        The adjusted R², or ``nan`` when ``n - p - 1 <= 0``: the statistic is
        undefined without positive residual degrees of freedom (the raw formula
        would divide by zero or return a value with the wrong sign).
    """
    residual_dof = n - p - 1
    if residual_dof <= 0:
        return float("nan")
    return 1 - (1 - r2) * (n - 1) / residual_dof

def backward_elimination(X, y, significance_level=0.05):
    """Backward stepwise selection driven by OLS p-values.

    Starting from all columns of ``X`` plus an intercept, repeatedly refit an OLS
    model and drop the predictor with the largest p-value, until every remaining
    predictor has a p-value at or below ``significance_level``. The intercept
    ('const') is never a removal candidate.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors; may or may not already contain a 'const' column.
    y : array-like
        Response values for the rows of ``X``.
    significance_level : float, default 0.05
        Largest p-value a predictor may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        The 'const' column together with the retained predictors.
    """
    # Add the intercept only when it is missing: has_constant="add" on a frame that
    # already has a 'const' column yields two columns with the same name.
    if "const" not in X.columns:
        X = sm.add_constant(X, has_constant="add")
    features = list(X.columns)

    while len(features) > 1:
        model = sm.OLS(y, X[features]).fit()
        # Exclude the intercept by label rather than by position, so it is protected
        # wherever it sits in the column order.
        p_values = model.pvalues.drop(labels="const", errors="ignore")
        max_p_value = p_values.max()
        if max_p_value > significance_level:
            features.remove(p_values.idxmax())
        else:
            break
    return X[features]

def forward_selection(X, y, significance_level=0.05):
    """Forward stepwise selection driven by OLS p-values.

    Starting from an intercept-only model, each round fits one OLS model per
    remaining candidate (intercept + already selected + candidate) and adds the
    candidate with the smallest p-value, provided that p-value is below
    ``significance_level``. Stops when no candidate qualifies.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors; may or may not already contain a 'const' column.
    y : array-like
        Response values for the rows of ``X``.
    significance_level : float, default 0.05
        A candidate is only added if its p-value is strictly below this value.

    Returns
    -------
    pandas.DataFrame
        The 'const' column followed by the selected predictors, in the order
        they were added.
    """
    # Add the intercept only when it is missing: has_constant="add" on a frame that
    # already has a 'const' column yields two columns with the same name.
    if "const" not in X.columns:
        X = sm.add_constant(X, has_constant="add")

    selected_features = ["const"]
    remaining_features = [column for column in X.columns if column != "const"]

    while remaining_features:
        best_feature = None
        best_p_value = significance_level

        for feature in remaining_features:
            model = sm.OLS(y, X[selected_features + [feature]]).fit()
            p_value = model.pvalues[feature]
            if p_value < best_p_value:
                best_feature = feature
                best_p_value = p_value

        # Compare against None explicitly: a falsy column label (e.g. 0) is a
        # legitimate winner and must not end the search.
        if best_feature is None:
            break
        selected_features.append(best_feature)
        remaining_features.remove(best_feature)

    return X[selected_features]

def bidirectional_selection(X, y, significance_level=0.05):
    """Forward selection followed by backward elimination on the survivors.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors; may or may not already contain a 'const' column.
    y : array-like
        Response values for the rows of ``X``.
    significance_level : float, default 0.05
        Threshold passed to both selection stages.

    Returns
    -------
    pandas.DataFrame
        Whatever ``backward_elimination`` returns for the forward-selected
        predictors (intercept plus retained predictors).
    """
    # Both helpers add their own intercept, so hand them the bare predictors.
    # Adding a constant here as well left several columns named 'const' in the
    # frames passed to OLS.
    predictors = X.drop(columns="const", errors="ignore")

    forward_frame = forward_selection(predictors, y, significance_level)
    forward_columns = [column for column in forward_frame.columns if column != "const"]

    return backward_elimination(predictors[forward_columns], y, significance_level)


{'Backward Elimination': {'R²': 0.9766853119589314,
  'Adjusted R²': 0.9562849599229963},
 'Forward Selection': {'R²': 0.9685965555566534,
  'Adjusted R²': 0.9623158666679841},
 'Bidirectional Selection': {'R²': 0.9685965555566534,
  'Adjusted R²': 0.9623158666679841},
 'All Variables': {'R²': 0.9732097359774904,
  'Adjusted R²': 0.9196292079324712}}

In [119]:
# Give the design matrices an intercept column. has_constant="skip" makes this a
# no-op when the cell is re-run, whereas "add" stacked one more 'const' column on
# every execution.
X_train = sm.add_constant(X_train, has_constant="skip")
X_test = sm.add_constant(X_test, has_constant="skip")

# The selection helpers add an intercept themselves, so pass them the predictors
# only; a frame that already has 'const' would end up with duplicate constant columns.
train_predictors = X_train.drop(columns=["const"], errors="ignore")

X_backward = backward_elimination(train_predictors, y_train)
X_forward = forward_selection(train_predictors, y_train)
X_bidirectional = bidirectional_selection(train_predictors, y_train)

results = {}

for method, X_selected in zip(["Backward Elimination", "Forward Selection", "Bidirectional Selection", "All Variables"],
                               [X_backward, X_forward, X_bidirectional, X_train]):

    # Remove intercept if present: LinearRegression fits its own intercept and
    # PolynomialFeatures should only expand the real predictors.
    if 'const' in X_selected.columns:
        X_selected = X_selected.drop(columns=['const'])

    # Degree-2 expansion fitted on the training columns, then applied to the same
    # columns of the test set.
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = poly.fit_transform(X_selected)
    X_test_poly = poly.transform(X_test[X_selected.columns])

    model = LinearRegression()
    model.fit(X_train_poly, y_train)
    y_pred = model.predict(X_test_poly)

    # Scores are computed on the held-out set; the adjusted R² uses the number of
    # test rows and the number of polynomial terms.
    r2 = r2_score(y_test, y_pred)
    adj_r2 = adjusted_r2(r2, X_test.shape[0], X_train_poly.shape[1])
    results[method] = {"R²": r2, "Adjusted R²": adj_r2}

                               R²  Adjusted R²
Backward Elimination     0.976685     0.956285
Forward Selection        0.968597     0.962316
Bidirectional Selection  0.968597     0.962316
All Variables            0.973210     0.919629


In [120]:
# One row per selection method, one column per metric.
results_df = pd.DataFrame.from_dict(results, orient="index")
print(results_df)

                               R²  Adjusted R²
Backward Elimination     0.976685     0.956285
Forward Selection        0.968597     0.962316
Bidirectional Selection  0.968597     0.962316
All Variables            0.973210     0.919629


#### Observations:
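- Backward Elimination achieved the highest R², but its Adjusted R² is slightly lower than that of Forward and Bidirectional Selection.
- Forward and Bidirectional Selection produced the same R² and Adjusted R², which is also the highest Adjusted R² of the four models.
- The All Variables model has the lowest Adjusted R², suggesting possible overfitting from the extra polynomial terms.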