In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Task 1 (a): Cleaning the Dataset, Handling Missing Values, and Removing Outliers

In [9]:
# Import pandas library for data manipulation
import pandas as pd

# Location of the raw expenses CSV.
# Written as a raw string: the previous literal mixed escaped (`\\`) and
# unescaped (`\O`) backslashes, and the unescaped one is an invalid escape
# sequence that newer Python versions warn about.  The resulting path is
# character-for-character the same as before.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
DATA_PATH = r"C:\Users\harip\OneDrive\Desktop\Data analytics\machine leaning\assessment1\dataset\expenses.csv"

# Load the dataset into a pandas DataFrame
df = pd.read_csv(DATA_PATH)

# Keep the first few rows around and show them as the cell output
initial_view = df.head()
initial_view


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [10]:
# Count the missing (NaN) entries in every column.
missing_values = df.isna().sum()

# Keep only the columns that have at least one missing entry.
# Jupyter renders just the final expression of a cell, so this Series is
# stored for later inspection rather than shown here.
missing_values_count = missing_values.loc[missing_values > 0]

# Drop incomplete rows, but only when some column actually has gaps.
if not missing_values_count.empty:
    df = df.dropna()

# Cell output: (rows, columns) of the dataset after the missing-value step.
post_missing_values_removal_shape = df.shape
post_missing_values_removal_shape


(1338, 7)

In [15]:
from scipy import stats
import numpy as np

# Z-score of every value in the numeric columns (column-wise standardisation).
# Every numeric column of `df` takes part in the filter.
# NOTE(review): that presumably includes the target `charges` -- confirm that
# filtering rows on the target is intended.
z_scores = stats.zscore(df.select_dtypes(include=[np.number]))
abs_z_scores = np.abs(z_scores)

# A row is treated as an outlier when any numeric value lies this many
# standard deviations (or more) from its column mean.
threshold = 3

# Keep only rows whose numeric values are all inside the threshold.
# `.copy()` makes df_clean an independent DataFrame; without it df_clean is a
# slice of `df`, and the later in-place writes to df_clean can trigger
# pandas' SettingWithCopyWarning.
df_clean = df[(abs_z_scores < threshold).all(axis=1)].copy()

# Cell output: (rows, columns) of the dataset after outlier removal.
post_outliers_removal_shape = df_clean.shape
post_outliers_removal_shape


(1309, 7)

# b) Feature Scaling and Normalization

In [16]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric predictors of df_clean (the frame left after the
# missing-value and outlier steps) to zero mean and unit variance.

# Work on an explicit copy.  df_clean comes from boolean-mask filtering of
# `df`, so writing into it directly can raise SettingWithCopyWarning; the
# copy makes the write below unambiguous.
df_clean = df_clean.copy()

# Initialize the StandardScaler
scaler = StandardScaler()

# Numerical features to scale ('charges' is excluded: it is the target variable)
numerical_features = ['age', 'bmi', 'children']

# Assign whole columns instead of using `.loc[:, cols] = ...`.
# Column assignment replaces the columns, so the integer-valued ones (age and
# children appear as integers in the data preview) simply become float, rather
# than pandas trying to write floats into the existing integer columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later in the notebook, so test rows influence the scaling
# statistics -- consider fitting on the training rows only.
df_clean[numerical_features] = scaler.fit_transform(df_clean[numerical_features])

# Print the first rows to verify the scaling
scaled_features_preview = df_clean.head()
print(scaled_features_preview)



        age     sex       bmi  children smoker     region      charges
0 -1.439063  female -0.449359 -0.929616    yes  southwest  16884.92400
1 -1.510086    male  0.533391 -0.040093     no  southeast   1725.55230
2 -0.799859    male  0.404478  1.738954     no  southeast   4449.46200
3 -0.444746    male -1.319101 -0.929616     no  northwest  21984.47061
4 -0.515769    male -0.285288 -0.929616     no  northwest   3866.85520


# c) Encoding Categorical Variables

In [17]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns and append them to the numeric data.

# Initialize OneHotEncoder so that it returns a dense array.
# scikit-learn renamed the `sparse` keyword to `sparse_output` (1.2) and later
# removed the old name, so `OneHotEncoder(sparse=False)` fails on current
# releases.  Try the current keyword first and fall back for older versions.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    encoder = OneHotEncoder(sparse=False)

# Categorical columns to encode
categorical_features = ['sex', 'smoker', 'region']

# Fit the encoder and transform the categorical data of df_clean
encoded_features = encoder.fit_transform(df_clean[categorical_features])

# Wrap the encoded array in a DataFrame with one named column per category
encoded_vars_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_features),
)

# Concatenate the encoded columns with the remaining (non-categorical) columns.
# Both sides get a fresh 0..n-1 index so rows are aligned by position:
# df_clean still carries the original row labels (with gaps from removed
# rows), whereas encoded_vars_df is numbered consecutively.
df_final = pd.concat(
    [
        df_clean.drop(categorical_features, axis=1).reset_index(drop=True),
        encoded_vars_df.reset_index(drop=True),
    ],
    axis=1,
)

# Print the first rows to verify the encoding
encoded_features_preview = df_final.head()
print(encoded_features_preview)


        age       bmi  children      charges  sex_female  sex_male  smoker_no  \
0 -1.439063 -0.449359 -0.929616  16884.92400         1.0       0.0        0.0   
1 -1.510086  0.533391 -0.040093   1725.55230         0.0       1.0        1.0   
2 -0.799859  0.404478  1.738954   4449.46200         0.0       1.0        1.0   
3 -0.444746 -1.319101 -0.929616  21984.47061         0.0       1.0        1.0   
4 -0.515769 -0.285288 -0.929616   3866.85520         0.0       1.0        1.0   

   smoker_yes  region_northeast  region_northwest  region_southeast  \
0         1.0               0.0               0.0               0.0   
1         0.0               0.0               0.0               1.0   
2         0.0               0.0               0.0               1.0   
3         0.0               0.0               1.0               0.0   
4         0.0               0.0               1.0               0.0   

   region_southwest  
0               1.0  
1               0.0  
2               0.0 



# Splitting the Dataset into Training and Testing Sets

In [18]:
from sklearn.model_selection import train_test_split

# Separate the preprocessed frame into the target ('charges') and the
# predictors (every other column of df_final).
y = df_final['charges']
X = df_final.drop(columns='charges')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each piece to verify the split
print("Training set shape (X_train):", X_train.shape)
print("Testing set shape (X_test):", X_test.shape)
print("Training target shape (y_train):", y_train.shape)
print("Testing target shape (y_test):", y_test.shape)


Training set shape (X_train): (1047, 11)
Testing set shape (X_test): (262, 11)
Training target shape (y_train): (1047,)
Testing target shape (y_test): (262,)


# Building a Linear Regression Model

In [20]:
from sklearn.linear_model import LinearRegression

# Create the Linear Regression model and fit it on the training data
# (fit() returns the fitted estimator itself).
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the held-out test rows
y_pred_lr = lr_model.predict(X_test)

# Score the predictions: MAE, MSE and R^2, computed in that order
mae_lr, mse_lr, r2_lr = (
    metric(y_test, y_pred_lr)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print("Linear Regression Performance:")
print("Mean Absolute Error:", mae_lr)
print("Mean Squared Error:", mse_lr)
print("R^2 Score:", r2_lr)


Linear Regression Performance:
Mean Absolute Error: 3969.0341267772073
Mean Squared Error: 30444091.526395813
R^2 Score: 0.7787373616773112


# Building a Random Forest Regressor Model

In [19]:
from sklearn.ensemble import RandomForestRegressor

# Create the Random Forest Regressor (seeded for reproducibility) and fit it
# on the training data (fit() returns the fitted estimator itself).
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out test rows
y_pred_rf = rf_model.predict(X_test)

# Score the predictions: MAE, MSE and R^2, computed in that order
mae_rf, mse_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print("Random Forest Regressor Performance:")
print("Mean Absolute Error:", mae_rf)
print("Mean Squared Error:", mse_rf)
print("R^2 Score:", r2_rf)


Random Forest Regressor Performance:
Mean Absolute Error: 2277.369279230782
Mean Squared Error: 18336632.019949064
R^2 Score: 0.8667323813828205


In [23]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: 2 x 2 x 2 = 8 combinations
param_grid = dict(
    n_estimators=[100, 150],
    max_depth=[None, 10],
    min_samples_leaf=[1, 2],
)

# Exhaustive search over the grid with 5-fold cross-validation on the
# training data, using all available cores and per-fit progress messages.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination
print("Best parameters found:", grid_search.best_params_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters found: {'max_depth': 10, 'min_samples_leaf': 2, 'n_estimators': 100}


In [24]:
# Take the estimator that the grid search selected as best
rf_optimized = grid_search.best_estimator_

# Predict the held-out test rows with the tuned model
y_pred_rf_optimized = rf_optimized.predict(X_test)

# Score the tuned model: MAE, MSE and R^2, computed in that order
mae_rf_optimized, mse_rf_optimized, r2_rf_optimized = (
    metric(y_test, y_pred_rf_optimized)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Performance output
print("Optimized Random Forest Regressor Performance:")
print("Mean Absolute Error:", mae_rf_optimized)
print("Mean Squared Error:", mse_rf_optimized)
print("R^2 Score:", r2_rf_optimized)


Optimized Random Forest Regressor Performance:
Mean Absolute Error: 2199.4804595509477
Mean Squared Error: 16487985.868555585
R^2 Score: 0.8801680368507382


In [27]:
# Recompute the test-set metrics of both models side by side.
# The same three metrics (MAE, MSE, R^2) are applied to each model's
# predictions; nothing is printed by this cell.
_metrics = (mean_absolute_error, mean_squared_error, r2_score)

# Linear Regression evaluation
mae_lr, mse_lr, r2_lr = (metric(y_test, y_pred_lr) for metric in _metrics)

# Optimized Random Forest evaluation
mae_rf_optimized, mse_rf_optimized, r2_rf_optimized = (
    metric(y_test, y_pred_rf_optimized) for metric in _metrics
)


# b) Implement k-Fold Cross-Validation

In [25]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated R² on the full feature matrix X and target y.
# NOTE(review): X was scaled before any splitting and includes the rows used
# as the test set above, so these scores are not independent of the earlier
# hold-out evaluation -- confirm this is acceptable for the report.

# Linear Regression
cv_scores_lr = cross_val_score(
    lr_model,
    X,
    y,
    scoring='r2',
    cv=10,
)
print("Linear Regression CV R² Scores:", cv_scores_lr)
print("Average R² Score:", cv_scores_lr.mean())

# Optimized Random Forest
cv_scores_rf = cross_val_score(
    rf_optimized,
    X,
    y,
    scoring='r2',
    cv=10,
)
print("Optimized Random Forest CV R² Scores:", cv_scores_rf)
print("Average R² Score:", cv_scores_rf.mean())


Linear Regression CV R² Scores: [0.79204911 0.73211526 0.73499309 0.67780041 0.7576718  0.80445118
 0.82523224 0.63082728 0.74551721 0.7814957 ]
Average R² Score: 0.7482153271170179
Optimized Random Forest CV R² Scores: [0.88523494 0.85205007 0.82474848 0.76192324 0.83451363 0.92637766
 0.89613065 0.76949972 0.85726054 0.88227188]
Average R² Score: 0.8490010794688153


# c) Select the Best-Performing Regression Model