# Linear Regression with Scikit Learn


In [None]:
#restart the kernel after installation
!pip install pandas --quiet

In [None]:
medical_charges_url = 'https://raw.githubusercontent.com/JovianML/opendatasets/master/data/medical-charges.csv'

In [None]:
from urllib.request import urlretrieve

In [None]:
urlretrieve(medical_charges_url, 'medical.csv')

Understanding the Data

In [None]:
import pandas as pd

In [None]:
medical_df = pd.read_csv('medical.csv')

In [None]:
medical_df

In [None]:
medical_df.info()

In [None]:
medical_df.describe()

Exploratory Data Analysis and Visualization

In [None]:
!pip install plotly matplotlib seaborn --quiet

In [None]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [None]:
medical_df.age.describe()

Visualize Age

In [None]:
fig = px.histogram(medical_df,
                   x='age',
                   marginal='box',
                   nbins=47,
                   title='Distribution of Age')
fig.update_layout(bargap=0.1)
fig.show()

Visualize BMI (Body Mass Index)

In [None]:
fig = px.histogram(medical_df,
                   x='bmi',
                   marginal='box',
                   color_discrete_sequence=['red'],
                   title='Distribution of BMI (Body Mass Index)')
fig.update_layout(bargap=0.1)
fig.show()

Visualize Annual Medical Charges

In [None]:
fig = px.histogram(medical_df,
                   x='charges',
                   marginal='box',
                   color='smoker',
                   color_discrete_sequence=['green', 'grey'],
                   title='Annual Medical Charges')
fig.update_layout(bargap=0.1)
fig.show()

Visualize Smoker count

In [None]:
medical_df.smoker.value_counts()

In [None]:
px.histogram(medical_df, x='smoker', color='sex', title='Smoker')

Age vs Charges

In [None]:
fig = px.scatter(medical_df,
                 x='age',
                 y='charges',
                 color='smoker',
                 opacity=0.8,
                 hover_data=['sex'],
                 title='Age vs. Charges')
fig.update_traces(marker_size=5)
fig.show()

BMI vs Charges

In [None]:
fig = px.scatter(medical_df,
                 x='bmi',
                 y='charges',
                 color='smoker',
                 opacity=0.8,
                 hover_data=['sex'],
                 title='BMI vs. Charges')
fig.update_traces(marker_size=5)
fig.show()

### Correlation

In [None]:
medical_df.charges.corr(medical_df.age)

In [None]:
medical_df.charges.corr(medical_df.bmi)

In [None]:
smoker_values = {'no': 0, 'yes': 1}
smoker_numeric = medical_df.smoker.map(smoker_values)
medical_df.charges.corr(smoker_numeric)

In [None]:
# Calculate the correlation matrix for numeric columns only
numeric_df = medical_df.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()

# Display the correlation matrix
print(correlation_matrix)

In [None]:
sns.heatmap(numeric_df.corr(), cmap='Reds', annot=True)
plt.title('Correlation Matrix');

## Linear Regression using a Single Feature

In [None]:
non_smoker_df = medical_df[medical_df.smoker == 'no']

In [None]:
plt.title('Age vs. Charges')
sns.scatterplot(data=non_smoker_df, x='age', y='charges', alpha=0.7, s=15);

In [None]:
def estimate_charges(age, w, b):
    """Evaluate the straight line ``w * age + b``.

    Args:
        age: Age value(s) at which the line is evaluated.
        w: Slope of the line (the weight multiplied with ``age``).
        b: Intercept of the line (the bias added to every estimate).

    Returns:
        The value of ``w * age + b``.
    """
    weighted_age = w * age
    return weighted_age + b

In [None]:
w = 50
b = 100

In [None]:
ages = non_smoker_df.age
estimated_charges = estimate_charges(ages, w, b)

In [None]:
plt.plot(ages, estimated_charges, 'r-o');
plt.xlabel('Age');
plt.ylabel('Estimated Charges');

In [None]:
target = non_smoker_df.charges

plt.plot(ages, estimated_charges, 'r', alpha=0.9);
plt.scatter(ages, target, s=8,alpha=0.8);
plt.xlabel('Age');
plt.ylabel('Charges')
plt.legend(['Estimate', 'Actual']);

In [None]:
def try_parameters(w, b):
    """Plot the line ``w * age + b`` against the non-smokers' actual charges.

    Reads ages and charges from the notebook-level ``non_smoker_df`` and draws
    on the current matplotlib figure.

    Args:
        w: Slope passed to ``estimate_charges``.
        b: Intercept passed to ``estimate_charges``.
    """
    x_ages = non_smoker_df['age']
    actual_charges = non_smoker_df['charges']
    line_values = estimate_charges(x_ages, w, b)

    # Red line for the estimate, scattered points for the recorded charges.
    plt.plot(x_ages, line_values, 'r', alpha=0.9)
    plt.scatter(x_ages, actual_charges, s=8, alpha=0.8)

    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual'])

In [None]:
try_parameters(60, 200)

In [None]:
try_parameters(400, 5000)

### Loss/Cost Function

In [None]:
!pip install numpy --quiet

In [None]:
import numpy as np

In [None]:
def rmse(targets, predictions):
    """Return the root mean squared error between targets and predictions.

    Both inputs are converted to float NumPy arrays before subtracting, so
    plain lists and tuples are accepted, and two pandas Series are compared
    element by element in positional order instead of being aligned on their
    index labels (which would yield NaN for non-matching labels).

    Args:
        targets: Array-like of actual values.
        predictions: Array-like of predicted values; must broadcast against
            ``targets``.

    Returns:
        The RMSE as a NumPy float.
    """
    actual = np.asarray(targets, dtype=float)
    predicted = np.asarray(predictions, dtype=float)
    errors = actual - predicted
    return np.sqrt(np.mean(np.square(errors)))

Let's compute the RMSE for our model with a sample set of weights

In [None]:
w = 50
b = 100

In [None]:
try_parameters(w, b)

In [None]:
targets = non_smoker_df['charges']
predicted = estimate_charges(non_smoker_df.age, w, b)

In [None]:
rmse(targets, predicted)

In [None]:
def try_parameters(w, b):
    """Plot the line ``w * age + b`` over the actual charges and print its RMSE.

    Reads ages and charges from the notebook-level ``non_smoker_df``, draws on
    the current matplotlib figure, and prints the loss computed by ``rmse``.

    Args:
        w: Slope passed to ``estimate_charges``.
        b: Intercept passed to ``estimate_charges``.
    """
    x_ages = non_smoker_df['age']
    actual_charges = non_smoker_df['charges']
    predicted_charges = estimate_charges(x_ages, w, b)

    # Red line for the prediction, scattered points for the recorded charges.
    plt.plot(x_ages, predicted_charges, 'r', alpha=0.9)
    plt.scatter(x_ages, actual_charges, s=8, alpha=0.8)

    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Prediction', 'Actual'])

    # Report how far the line is from the data for this (w, b) pair.
    print("RMSE Loss: ", rmse(actual_charges, predicted_charges))

In [None]:
try_parameters(50, 100)

### Optimizer



### Linear Regression using Scikit-learn

In [None]:
!pip install scikit-learn --quiet

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model = LinearRegression()

In [None]:
# Single-feature input table (one 'age' column) and the matching charges column.
inputs = non_smoker_df[['age']]
targets = non_smoker_df.charges
print('inputs.shape :', inputs.shape)
print('targets.shape :', targets.shape)

In [None]:
model.fit(inputs, targets)

In [None]:
# Predict charges for a few sample ages. The model was fitted on a DataFrame
# with an 'age' column, so pass the same column name instead of a bare array
# to avoid scikit-learn's feature-name mismatch warning.
model.predict(pd.DataFrame({'age': [23, 37, 61]}))

In [None]:
predictions = model.predict(inputs)

In [None]:
predictions

In [None]:
rmse(targets, predictions)

In [None]:
# w
model.coef_

In [None]:
# b
model.intercept_

In [None]:
try_parameters(model.coef_, model.intercept_)

Linear Regression using Multiple Features

In [None]:
# Create inputs and targets
inputs, targets = non_smoker_df[['age']], non_smoker_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)

In [None]:
# Create inputs and targets
inputs, targets = non_smoker_df[['age', 'bmi']], non_smoker_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)

In [None]:
non_smoker_df.charges.corr(non_smoker_df.bmi)

BMI vs Charges

In [None]:
fig = px.scatter(non_smoker_df, x='bmi', y='charges', title='BMI vs. Charges')
fig.update_traces(marker_size=5)
fig.show()

We can also visualize the relationship between all 3 variables "age", "bmi" and "charges" using a 3D scatter plot.

In [None]:
fig = px.scatter_3d(non_smoker_df, x='age', y='bmi', z='charges')
fig.update_traces(marker_size=3, marker_opacity=0.5)
fig.show()

In [None]:
model.coef_, model.intercept_

In [None]:
non_smoker_df.charges.corr(non_smoker_df.children)

Children vs Charges

In [None]:
fig = px.strip(non_smoker_df, x='children', y='charges', title= "Children vs. Charges")
fig.update_traces(marker_size=4, marker_opacity=0.7)
fig.show()

In [None]:
# Create inputs and targets
inputs, targets = non_smoker_df[['age', 'bmi', 'children']], non_smoker_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)

In [None]:
# Create inputs and targets
inputs, targets = medical_df[['age', 'bmi', 'children']], medical_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)

Binary Smoker Count

In [None]:
sns.barplot(data=medical_df, x='smoker', y='charges');

In [None]:
smoker_codes = {'no': 0, 'yes': 1}
medical_df['smoker_code'] = medical_df.smoker.map(smoker_codes)

In [None]:
medical_df.charges.corr(medical_df.smoker_code)

In [None]:
medical_df

In [None]:
# Create inputs and targets
inputs, targets = medical_df[['age', 'bmi', 'children', 'smoker_code']], medical_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)

In [None]:
sns.barplot(data=medical_df, x='sex', y='charges')

In [None]:
sex_codes = {'female': 0, 'male': 1}

In [None]:
medical_df['sex_code'] = medical_df.sex.map(sex_codes)

In [None]:
medical_df.charges.corr(medical_df.sex_code)

In [None]:
# Create inputs and targets
inputs, targets = medical_df[['age', 'bmi', 'children', 'smoker_code', 'sex_code']], medical_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)


### One-hot Encoding

The "region" column contains 4 values, so we'll need to use one-hot encoding and create a new column for each region.

![](https://i.imgur.com/n8GuiOO.png)


In [None]:
sns.barplot(data=medical_df, x='region', y='charges');

In [None]:
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder()
enc.fit(medical_df[['region']])
enc.categories_

In [None]:
one_hot = enc.transform(medical_df[['region']]).toarray()
one_hot

In [None]:
# Take the new column names from the fitted encoder so each one-hot column is
# guaranteed to line up with its category, rather than relying on a hand-typed
# order that must happen to match the encoder's output.
region_cols = list(enc.categories_[0])
medical_df[region_cols] = one_hot

In [None]:
medical_df

In [None]:
# Create inputs and targets
input_cols = ['age', 'bmi', 'children', 'smoker_code', 'sex_code', 'northeast', 'northwest', 'southeast', 'southwest']
inputs, targets = medical_df[input_cols], medical_df['charges']

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)

### Feature Scaling

In [None]:
model.coef_

In [None]:
model.intercept_

In [None]:
weights_df = pd.DataFrame({
    'feature': np.append(input_cols, 1),
    'weight': np.append(model.coef_, model.intercept_)
})
weights_df

In [None]:
medical_df

Standardise

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
numeric_cols = ['age', 'bmi', 'children']
scaler = StandardScaler()
scaler.fit(medical_df[numeric_cols])

In [None]:
scaler.mean_

In [None]:
scaler.var_

In [None]:
scaled_inputs = scaler.transform(medical_df[numeric_cols])
scaled_inputs

In [None]:
cat_cols = ['smoker_code', 'sex_code', 'northeast', 'northwest', 'southeast', 'southwest']
categorical_data = medical_df[cat_cols].values

In [None]:
inputs = np.concatenate((scaled_inputs, categorical_data), axis=1)
targets = medical_df.charges

# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)

In [None]:
weights_df = pd.DataFrame({
    'feature': np.append(numeric_cols + cat_cols, 1),
    'weight': np.append(model.coef_, model.intercept_)
})
weights_df.sort_values('weight', ascending=False)

Test and Train set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for testing. The seed is fixed so that the split,
# and therefore the reported test/training losses, are reproducible across runs.
inputs_train, inputs_test, targets_train, targets_test = train_test_split(
    inputs, targets, test_size=0.1, random_state=42)

In [None]:
# Create and train the model
model = LinearRegression().fit(inputs_train, targets_train)

# Generate predictions
predictions_test = model.predict(inputs_test)

# Compute loss to evaluate the model
loss = rmse(targets_test, predictions_test)
print('Test Loss:', loss)

Let's compare this with the training loss.

In [None]:
# Generate predictions
predictions_train = model.predict(inputs_train)

# Compute loss to evaluate the model
loss = rmse(targets_train, predictions_train)
print('Training Loss:', loss)