In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the insurance dataset; the relative path is resolved against the
# current working directory.
data = pd.read_csv('Medical_insurance.csv')

In [4]:
# Preview the first 10 rows of the raw dataset.
data.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [5]:
# Number of rows per region, ordered from the least to the most frequent.
data['region'].value_counts().sort_values()

northeast    658
northwest    664
southwest    684
southeast    766
Name: region, dtype: int64

In [6]:
# Encode the categorical features as integer codes.
# NOTE(review): integer codes impose an arbitrary order on `region`; one-hot
# encoding would avoid that but changes the six-column feature layout that the
# prediction cell relies on.
clean_data = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'no': 0, 'yes': 1},
    'region': {'northwest': 0, 'northeast': 1, 'southeast': 2, 'southwest': 3},
}

# Work on a copy so `data` keeps the original string labels.
data_copy = data.copy()
for column, codes in clean_data.items():
    encoded = data[column].map(codes)
    # A label that is missing from `codes` maps to NaN. Fail loudly here rather
    # than letting an unencoded value reach the regression later on.
    unmapped = encoded.isna() & data[column].notna()
    if unmapped.any():
        unknown = data.loc[unmapped, column].unique().tolist()
        raise ValueError(f"Unmapped values in column {column!r}: {unknown}")
    data_copy[column] = encoded


In [7]:
# Show how often each value occurs in the raw (unencoded) columns,
# from the rarest value to the most common one.
for column in ('sex', 'region', 'children'):
    frequencies = data[column].value_counts()
    print(frequencies.sort_values())

female    1366
male      1406
Name: sex, dtype: int64
northeast    658
northwest    664
southwest    684
southeast    766
Name: region, dtype: int64
5      42
4      52
3     324
2     496
1     672
0    1186
Name: children, dtype: int64


In [26]:
# Separate the prediction target (`charges`) from the feature columns.
# The actual train/test split happens later, in the evaluation loop.
Y = data_copy['charges']
X = data_copy.drop(columns=['charges'])

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from scipy.stats import uniform


# Custom Linear Regression Class
class MyLR:
    """Ordinary least-squares linear regression with an intercept.

    After :meth:`fit` has run:

    * ``coef_`` holds one weight per feature column.
    * ``intercept_`` holds the bias term.
    """

    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X_train, Y_train):
        """Estimate the intercept and coefficients from training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Numeric feature matrix (NumPy array, DataFrame, nested list, ...).
        Y_train : array-like with n_samples entries along its first axis
            Target values.

        Returns
        -------
        MyLR
            The fitted instance, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-D, is empty, or its row count differs
            from ``Y_train``.
        """
        features = np.asarray(X_train, dtype=float)
        targets = np.asarray(Y_train, dtype=float)
        if features.ndim != 2:
            raise ValueError("X_train must be 2-D (n_samples, n_features)")
        if features.shape[0] == 0:
            raise ValueError("X_train must contain at least one sample")
        if targets.ndim == 0 or targets.shape[0] != features.shape[0]:
            raise ValueError("X_train and Y_train must have the same number of samples")

        # Prepend a column of ones so the first fitted weight is the intercept.
        design = np.hstack([np.ones((features.shape[0], 1)), features])
        # Solve the least-squares problem directly instead of inverting
        # X^T X: lstsq is better conditioned and still returns a (minimum-norm)
        # solution when feature columns are collinear.
        betas = np.linalg.lstsq(design, targets, rcond=None)[0]
        self.intercept_ = betas[0]
        self.coef_ = betas[1:]
        return self

    def predict(self, X_test):
        """Return predictions for ``X_test`` using the fitted weights.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError("MyLR.predict called before fit")
        features = np.asarray(X_test, dtype=float)
        y_pred = np.dot(features, self.coef_) + self.intercept_
        return y_pred

# Evaluate the custom model on 10 different train/test splits and report R².
# Each split uses a fixed, distinct seed (the loop index) so that re-running
# the cell reproduces the same scores; drawing the seed from an unseeded
# np.random made every execution produce different numbers.
r2_scores = []

for i in range(10):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=i
    )

    model = MyLR()
    model.fit(X_train, Y_train)
    Y_test_pred = model.predict(X_test)

    r2 = r2_score(Y_test, Y_test_pred)
    r2_scores.append(r2)
    print(f"Run {i+1}: R² Score (Custom MyLR) = {r2:.4f}")

# After the loop, `model` is the instance fitted on the last split.

# Print Average R² Score for MyLR
print(f"\nAverage R² Score for MyLR over 10 runs: {np.mean(r2_scores):.4f}")

# Settings shared by both hyper-parameter searches: 10-fold CV scored by R²,
# using every available CPU core.
cv_options = {"cv": 10, "scoring": "r2", "n_jobs": -1}

# -------------------- Apply Grid Search CV --------------------
print("\nApplying Grid Search CV on Ridge Regression...")

# Candidate regularisation strengths, one per decade.
param_grid = {"alpha": [0.01, 0.1, 1, 10, 100]}
ridge = Ridge()

grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, **cv_options)
grid_search.fit(X, Y)

print(f"Best alpha (Grid Search): {grid_search.best_params_}")
print(f"Best R² score (Grid Search): {grid_search.best_score_:.4f}")

# -------------------- Apply Randomized Search CV --------------------
print("\nApplying Randomized Search CV on Ridge Regression...")

# scipy's uniform(loc, scale) samples from [loc, loc + scale].
# NOTE(review): alpha spans several orders of magnitude, so a log-uniform
# distribution may explore small values better — confirm before changing.
param_dist = {"alpha": uniform(loc=0.01, scale=100)}
random_search = RandomizedSearchCV(
    estimator=Ridge(),
    param_distributions=param_dist,
    n_iter=10,
    random_state=42,
    **cv_options,
)
random_search.fit(X, Y)

print(f"Best alpha (Randomized Search): {random_search.best_params_}")
print(f"Best R² score (Randomized Search): {random_search.best_score_:.4f}")


Run 1: R² Score (Custom MyLR) = 0.7421
Run 2: R² Score (Custom MyLR) = 0.7509
Run 3: R² Score (Custom MyLR) = 0.7509
Run 4: R² Score (Custom MyLR) = 0.7347
Run 5: R² Score (Custom MyLR) = 0.7616
Run 6: R² Score (Custom MyLR) = 0.7497
Run 7: R² Score (Custom MyLR) = 0.7632
Run 8: R² Score (Custom MyLR) = 0.7071
Run 9: R² Score (Custom MyLR) = 0.7812
Run 10: R² Score (Custom MyLR) = 0.7215

Average R² Score for MyLR over 10 runs: 0.7463

Applying Grid Search CV on Ridge Regression...
Best alpha (Grid Search): {'alpha': 1}
Best R² score (Grid Search): 0.7480

Applying Randomized Search CV on Ridge Regression...
Best alpha (Randomized Search): {'alpha': 5.818361216819946}
Best R² score (Randomized Search): 0.7479


In [10]:
import numpy as np

class MyLR:
    """Least-squares linear regression with an intercept term.

    ``fit`` stores the learned weights in ``coef_`` (one per feature) and the
    bias in ``intercept_``; ``predict`` applies them to new samples.

    NOTE(review): this notebook defines ``MyLR`` in more than one cell; the
    definition executed last is the one in effect.
    """

    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X_train, Y_train):
        """Fit the model to ``X_train`` (n_samples, n_features) and ``Y_train``.

        Accepts NumPy arrays, pandas objects or nested lists of numbers.
        Returns the fitted instance. Raises ``ValueError`` if ``X_train`` is
        not 2-D, is empty, or has a different number of rows than ``Y_train``.
        """
        features = np.asarray(X_train, dtype=float)
        targets = np.asarray(Y_train, dtype=float)
        if features.ndim != 2:
            raise ValueError("X_train must be 2-D (n_samples, n_features)")
        if features.shape[0] == 0:
            raise ValueError("X_train must contain at least one sample")
        if targets.ndim == 0 or targets.shape[0] != features.shape[0]:
            raise ValueError("X_train and Y_train must have the same number of samples")

        # Leading column of ones: its weight becomes the intercept.
        design = np.hstack([np.ones((features.shape[0], 1)), features])
        # lstsq avoids explicitly inverting X^T X, which is numerically
        # fragile and fails outright when feature columns are collinear.
        betas = np.linalg.lstsq(design, targets, rcond=None)[0]
        self.intercept_ = betas[0]
        self.coef_ = betas[1:]
        return self

    def predict(self, X_test):
        """Predict targets for ``X_test`` (n_samples, n_features).

        Raises ``RuntimeError`` if the model has not been fitted yet.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise RuntimeError("MyLR.predict called before fit")
        features = np.asarray(X_test, dtype=float)
        y_pred = np.dot(features, self.coef_) + self.intercept_
        return y_pred

In [23]:
# Unfitted scikit-learn LinearRegression estimator.
# NOTE(review): `ml` is not used by any other cell visible here — confirm it is still needed.
ml = LinearRegression()

In [30]:
# Interactive prediction: read one value per model feature from the user and
# print the estimated charge.
# `model` is whichever MyLR instance an earlier cell fitted last.
# Categorical features must be typed as the integer codes defined in
# `clean_data`, in the same column order as the training features.
import numpy as np

data_field = ["age","sex","bmi","childeren","smoker","region"]


def _read_number(label):
    """Prompt until the user enters a finite number for `label`; return it as a float."""
    while True:
        try:
            value = float(input(f"Enter value {label} : "))
        except ValueError:
            print("Invalid input. Please enter a numerical value.")
            continue
        # float() also parses "nan" and "inf", which would only produce a
        # meaningless prediction, so they are rejected like any other bad input.
        if np.isfinite(value):
            return value
        print("Invalid input. Please enter a numerical value.")


print("Please enter six numerical values:")
input_data11 = [_read_number(field) for field in data_field]

numpy_array = np.asarray(input_data11)
print("Numpyarrayyy :",numpy_array)

# The model expects a 2-D array: one row per sample, one column per feature.
reshaped_np_array = numpy_array.reshape(1,-1)

# The user's row is deliberately NOT assigned to `X_test`, so the hold-out
# split created by the evaluation loop is not overwritten.
print("Your Reshaped NumPy array:", reshaped_np_array)
prediction=model.predict(reshaped_np_array)
print("Cost: $",prediction)



Please enter six numerical values:


Enter value age :  19
Enter value sex :  1
Enter value bmi :  27.9
Enter value childeren :  0
Enter value smoker :  1
Enter value region :  3


Numpyarrayyy : [19.   1.  27.9  0.   1.   3. ]
Your Reshaped NumPy array: [[19.   1.  27.9  0.   1.   3. ]]
Cost: $ [24665.13935455]
