# **First Model**

_John Andrew Dixon_

---

**Setup**

In [36]:
# For easily handling data
import pandas as pd
# For performing regression metrics
from sklearn.metrics import r2_score
# For creating a train-test split
from sklearn.model_selection import train_test_split
# For scaling and one-hot encoding
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# For selecting and transforming columns
from sklearn.compose import make_column_selector, make_column_transformer
# For creating pipelines
from sklearn.pipeline import make_pipeline
# For performing a linear regression
from sklearn.linear_model import LinearRegression

In [37]:
# Published CSV export of the dataset (remote link)
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vT3R_qvIyzGAYylk0aIdTFxAtcxLdjBtfEGfSyAI1PfOnr0YpN_QjbbH1j5OScoYNcoyOjY_c9tQQ0H/pub?output=csv"
# Read the CSV into a DataFrame, then summarise its columns, dtypes and
# non-null counts as a quick sanity check of the load
df = pd.read_csv(filepath_or_buffer=url)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


**Numeric**: `age`, `bmi`, `children`

**Ordinal**: `None`

**Nominal**: `sex`, `smoker`, `region`

---

## **Tasks**

> **Question**: _How well can the additional charges be predicted based on the age, sex, BMI, number of children, smoking habit, and region of the patient?_ 

### **Create a preprocessing object, such as a column transformer or pipeline, that will:**
* Ordinal encode any ordinal features
* One-hot encode any nominal features
* Scale any numeric features

In [38]:
# Target vector: the column the model will learn to predict
y = df["charges"]
# Feature matrix: every column except the target
X = df.drop(columns=["charges"])

In [39]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [40]:
# One-hot encoder for nominal features: produces a dense array and does not
# raise on categories that were absent when it was fitted
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
# Standardising scaler for numeric features
scaler = StandardScaler()

In [41]:
# Selects the object-dtype columns, which hold the nominal features
nominal_selector = make_column_selector(dtype_include="object")
# Selects the number-dtype columns, which hold the numeric features
numeric_selector = make_column_selector(dtype_include="number")

In [42]:
# Pair each transformer with the selector for the columns it should handle,
# in the (transformer, columns) form the column transformer expects
nominal_tuple = one_hot_encoder, nominal_selector
numeric_tuple = scaler, numeric_selector

In [43]:
# Preprocessor: scales the numeric columns and one-hot encodes the nominal
# ones. Any column matched by neither selector is passed through unchanged,
# and output feature names are not prefixed with the transformer name.
preprocessor = make_column_transformer(
    numeric_tuple,
    nominal_tuple,
    remainder="passthrough",
    verbose_feature_names_out=False,
)

### **Instantiate a linear regression model**

In [44]:
# Linear regression estimator created with scikit-learn's default settings;
# it is unfitted at this point and is fitted later as part of the pipeline
linear_regression = LinearRegression()

### **Create a model pipeline with your preprocessor first and linear regression model last.**

In [45]:
# Modeling pipeline: preprocessing runs first and its output feeds the
# linear regression, so both steps are fitted and applied together
lr_pipeline = make_pipeline(
    preprocessor,
    linear_regression,
)

### **Fit the modeling pipeline on the training data.**

In [46]:
# Learn the preprocessing statistics and the regression coefficients from
# the training split only, so the test split stays unseen
lr_pipeline.fit(X=X_train, y=y_train)

### **Evaluate the model performance on both the training set and the test set using the R-squared score.**

In [47]:
# Predict on the same rows the pipeline was fitted on
training_predictions = lr_pipeline.predict(X_train)
# R-squared of those predictions against the true training targets
training_r2 = r2_score(y_true=y_train, y_pred=training_predictions)
# Report the training score
print(f"Training R2: {training_r2}")

Training R2: 0.7417255854683333


In [48]:
# Get predictions on the testing data (rows held out from fitting)
testing_predictions = lr_pipeline.predict(X_test)
# Get the testing data R2
testing_r2 = r2_score(y_test, testing_predictions)
# Testing R2 -- labelled as such so the held-out score is not mistaken for
# the training score reported by the previous cell
print(f"Testing R2: {testing_r2}")

Training R2: 0.7835929767120723
