<a href="https://colab.research.google.com/github/khaironnahar4/ml_models/blob/main/ml_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [82]:
# Import all required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#**Data Analysis**

In [4]:
# Load the housing dataset from the Colab sample-data folder into `df`.
housing_csv_path = "/content/sample_data/HousingData.csv"
df = pd.read_csv(housing_csv_path)

# Preview the first five records.
first_rows = df.head()
print("\n--- First 5 Rows ---")
print(first_rows)

# Summarise column names, non-null counts and dtypes.
print("\n--- Dataset Info ---")
df.info()

# Optional numeric summary, currently disabled:
# df.describe()


--- First 5 Rows ---
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622    3  222     18.7   

        B  LSTAT  MEDV  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90    NaN  36.2  

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     486 non-null    float64
 1   ZN       486 non-null    float64
 2   INDUS    486 non-null    float64
 3   CHAS     486 non-null    

In [5]:
print("\n--- Missing Values ---")
# To inspect the NaN counts before imputation, run:
# print(df.isnull().sum())

# Replacement value for each imputed column: the column median for most,
# the column mean for ZN, and a constant 0 for CHAS.
# NOTE(review): these statistics are computed on the full dataset, while the
# train/test split only happens in a later cell - confirm this is intended.
fill_values = {
    "CRIM": df["CRIM"].median(),
    "ZN": df["ZN"].mean(),
    "INDUS": df["INDUS"].median(),
    "CHAS": 0,
    "AGE": df["AGE"].median(),
    "LSTAT": df["LSTAT"].median(),
}

# Fill each listed column with its replacement value.
for column_name, replacement in fill_values.items():
    df[column_name] = df[column_name].fillna(replacement)

# Remaining NaN count per column after imputation.
print(df.isnull().sum())


--- Missing Values ---
CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64


#**Select Features and Target Variables**

In [6]:
# Separate the predictors from the response column.
target_column = "MEDV"

X = df.drop(columns=[target_column])  # every column except MEDV
y = df[target_column]                 # MEDV only

# Show the first rows of each so the selection can be checked by eye.
print("Features (X):")
display(X.head())

print("\nTarget variable (y):")
display(y.head())

Features (X):


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,11.43



Target variable (y):


Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each of the four resulting pieces.
split_shapes = {
    "Shape of X_train:": X_train.shape,
    "Shape of X_test:": X_test.shape,
    "Shape of y_train:": y_train.shape,
    "Shape of y_test:": y_test.shape,
}
for caption, shape in split_shapes.items():
    print(caption, shape)

Shape of X_train: (404, 13)
Shape of X_test: (102, 13)
Shape of y_train: (404,)
Shape of y_test: (102,)


#**Train the Model**

In [16]:
# Simple linear regression: predict the target from the single 'RM' column.
rm_train = X_train[['RM']]  # double brackets select a one-column DataFrame
linear_reg_model = LinearRegression().fit(rm_train, y_train)

print("Linear Regression model trained successfully.")

Linear Regression model trained successfully.


In [11]:
# Multiple linear regression: fit on every feature column of the training split.
multiple_linear_reg_model = LinearRegression().fit(X_train, y_train)

print("Multiple Linear Regression model trained successfully.")

Multiple Linear Regression model trained successfully.


In [13]:
# Predict on the test split with the single-feature ('RM') model.
rm_test = X_test[['RM']]
y_pred_linear = linear_reg_model.predict(rm_test)

# Compare predictions against the true test targets.
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# Report both metrics rounded to two decimals.
print(
    "--- Simple Linear Regression Evaluation ---",
    f"Mean Squared Error (MSE): {mse_linear:.2f}",
    f"R-squared (R2): {r2_linear:.2f}",
    sep="\n",
)

--- Simple Linear Regression Evaluation ---
Mean Squared Error (MSE): 46.14
R-squared (R2): 0.37


In [14]:
# Predict on the test split with the all-features model.
y_pred_multiple = multiple_linear_reg_model.predict(X_test)

# Compare predictions against the true test targets.
mse_multiple = mean_squared_error(y_test, y_pred_multiple)
r2_multiple = r2_score(y_test, y_pred_multiple)

# Report both metrics rounded to two decimals.
print(
    "\n--- Multiple Linear Regression Evaluation ---",
    f"Mean Squared Error (MSE): {mse_multiple:.2f}",
    f"R-squared (R2): {r2_multiple:.2f}",
    sep="\n",
)


--- Multiple Linear Regression Evaluation ---
Mean Squared Error (MSE): 25.01
R-squared (R2): 0.66


#**Outlier Removal and Log Transformation**

In [84]:
# Drop every row whose MEDV is 50.0 or more (treated as outliers here).
medv_at_or_above_cap = df["MEDV"] >= 50.0
df_cleaned = df[~medv_at_or_above_cap]

print("Shape of df before removing 'MEDV' outliers:", df.shape)
print("Shape of df after removing 'MEDV' outliers:", df_cleaned.shape)

# Optional preview, currently disabled:
# display(df_cleaned.head())

# Candidate columns for the log transform (CHAS and MEDV are not listed).
cols_to_transform = [
    'LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX',
    'DIS', 'AGE', "CRIM", "ZN", "RAD", "B",
]

# Keep only the candidates whose absolute skewness exceeds 0.3.
# NOTE(review): because of the absolute value, left-skewed (negative skew)
# columns are selected as well - confirm this is intended.
skewed_cols = [
    name for name in cols_to_transform
    if np.abs(df_cleaned[name].skew()) > 0.3
]

# Work on a copy so df_cleaned keeps the untransformed values, then replace
# each selected column with log1p of its original values.
df_transformed_log = df_cleaned.copy()
for col in skewed_cols:
    df_transformed_log[col] = np.log1p(df_cleaned[col])


Shape of df before removing 'MEDV' outliers: (506, 14)
Shape of df after removing 'MEDV' outliers: (490, 14)


In [105]:
# Rebuild predictors and response from the log-transformed frame.
# (Alternative kept for reference: drop "CHAS" together with "MEDV".)
target_column = "MEDV"
X = df_transformed_log.drop(columns=[target_column])
y = df_transformed_log[target_column]

print("Features (X):")
display(X.head())

print("\nTarget variable (y):")
display(y.head())

# Split again with the same test fraction and seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each of the four resulting pieces.
split_shapes = {
    "Shape of X_train:": X_train.shape,
    "Shape of X_test:": X_test.shape,
    "Shape of y_train:": y_train.shape,
    "Shape of y_test:": y_test.shape,
}
for caption, shape in split_shapes.items():
    print(caption, shape)

Features (X):


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.0063,2.944439,1.196948,0.0,0.430483,6.575,4.19268,1.627278,0.693147,5.693732,2.791165,5.986201,1.788421
1,0.026944,0.0,2.088153,0.0,0.384582,6.421,4.380776,1.786261,1.098612,5.493061,2.933857,5.986201,2.316488
2,0.026924,0.0,2.088153,0.0,0.384582,7.185,4.128746,1.786261,1.098612,5.493061,2.933857,5.975919,1.61542
3,0.031857,0.0,1.156881,0.0,0.377066,6.998,3.845883,1.954757,1.386294,5.407172,2.980619,5.980479,1.371181
4,0.06677,0.0,1.156881,0.0,0.377066,7.147,4.010963,1.954757,1.386294,5.407172,2.980619,5.986201,2.520113



Target variable (y):


Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


Shape of X_train: (392, 13)
Shape of X_test: (98, 13)
Shape of y_train: (392,)
Shape of y_test: (98,)


In [106]:
# Refit the single-feature regression on the 'RM' column of X_train.
rm_train = X_train[['RM']]  # double brackets select a one-column DataFrame
linear_reg_model = LinearRegression().fit(rm_train, y_train)
print("Linear Regression model trained successfully.")

# Refit the multiple regression on every column of X_train.
multiple_linear_reg_model = LinearRegression().fit(X_train, y_train)
print("Multiple Linear Regression model trained successfully.")

Linear Regression model trained successfully.
Multiple Linear Regression model trained successfully.


In [100]:
# Single-feature ('RM') model: predict on the test split and score it.
y_pred_linear = linear_reg_model.predict(X_test[['RM']])
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# All-features model: predict on the test split and score it.
y_pred_multiple = multiple_linear_reg_model.predict(X_test)
mse_multiple = mean_squared_error(y_test, y_pred_multiple)
r2_multiple = r2_score(y_test, y_pred_multiple)

# Report both models, metrics rounded to two decimals.
print(
    "--- Simple Linear Regression Evaluation ---",
    f"Mean Squared Error (MSE): {mse_linear:.2f}",
    f"R-squared (R2): {r2_linear:.2f}",
    sep="\n",
)
print(
    "\n--- Multiple Linear Regression Evaluation ---",
    f"Mean Squared Error (MSE): {mse_multiple:.2f}",
    f"R-squared (R2): {r2_multiple:.2f}",
    sep="\n",
)

--- Simple Linear Regression Evaluation ---
Mean Squared Error (MSE): 31.76
R-squared (R2): 0.38

--- Multiple Linear Regression Evaluation ---
Mean Squared Error (MSE): 12.44
R-squared (R2): 0.76


In [108]:
# Random forest regressor trained and scored on the same X_train / X_test
# split as the linear models in the preceding cells.
# RandomForestRegressor is imported in the notebook's import cell rather than
# here, so every dependency is visible in one place.
rf_model = RandomForestRegressor(
    n_estimators=100,  # number of trees in the forest
    random_state=42,   # fixed seed so repeated runs give the same forest
    n_jobs=-1,         # use all available CPU cores
)

rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Same two metrics as the linear models, for a like-for-like comparison.
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\n--- Random Forest Regressor Evaluation ---")
print(f"Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"R-squared (R2): {r2_rf:.2f}")


--- Random Forest Regressor Evaluation ---
Mean Squared Error (MSE): 7.90
R-squared (R2): 0.85


In [112]:
# feature_importances = pd.Series(rf_model.feature_importances_, index=X_train.columns)
# print("\nTop 5 Feature Importances:")
# print(feature_importances.nlargest(12))