In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import requests
import json
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Importing the dataset
df = pd.read_csv('./data/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Update sex to 1 or 0
df['sex_updated'] = df['sex'].map({'male': 0, 'female': 1})

In [4]:
# Update smoker to 1 or 0
df['smoker_updated'] = df['smoker'].map({'yes':1, 'no': 0})
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_updated,smoker_updated
0,19,female,27.9,0,yes,southwest,16884.924,1,1
1,18,male,33.77,1,no,southeast,1725.5523,0,0
2,28,male,33.0,3,no,southeast,4449.462,0,0
3,33,male,22.705,0,no,northwest,21984.47061,0,0
4,32,male,28.88,0,no,northwest,3866.8552,0,0


In [5]:
# Remove unwanted columns
clean_df = df.drop(['region','smoker','children','sex'], axis=1)
clean_df.head()

Unnamed: 0,age,bmi,charges,sex_updated,smoker_updated
0,19,27.9,16884.924,1,1
1,18,33.77,1725.5523,0,0
2,28,33.0,4449.462,0,0
3,33,22.705,21984.47061,0,0
4,32,28.88,3866.8552,0,0


In [6]:
# Check data types
clean_df.dtypes

age                 int64
bmi               float64
charges           float64
sex_updated         int64
smoker_updated      int64
dtype: object

In [7]:
# Convert all data types to float
clean_df = clean_df.astype(float)
clean_df.dtypes

age               float64
bmi               float64
charges           float64
sex_updated       float64
smoker_updated    float64
dtype: object

In [8]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = clean_df['smoker_updated']

# Separate the X variable, the features
X = clean_df.drop(columns=['smoker_updated'])

In [9]:
# Review the y variable Series
y.head()

0    1.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: smoker_updated, dtype: float64

In [10]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,age,bmi,charges,sex_updated
0,19.0,27.9,16884.924,1.0
1,18.0,33.77,1725.5523,0.0
2,28.0,33.0,4449.462,0.0
3,33.0,22.705,21984.47061,0.0
4,32.0,28.88,3866.8552,0.0


In [11]:
# Check the balance of our target values
y.value_counts()

0.0    1064
1.0     274
Name: smoker_updated, dtype: int64

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [13]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
X_train_scaled

array([[ 0.47222651, -1.75652513, -0.3456208 ,  1.0246016 ],
       [ 0.54331294, -1.03308239, -0.40048783,  1.0246016 ],
       [ 0.8987451 , -0.94368672,  1.14633152,  1.0246016 ],
       ...,
       [ 1.3252637 , -0.89153925, -0.11777735, -0.97598911],
       [-0.16755139,  2.82086429,  2.72746075,  1.0246016 ],
       [ 1.1120044 , -0.10932713, -0.26065271, -0.97598911]])

In [15]:
# Create a logistic regression model
logreg = LogisticRegression()

In [16]:
# Fit the model on the training data
logreg.fit(X_train_scaled, y_train)

In [17]:
# Make predictions on the test data
y_pred = logreg.predict(X_test_scaled)

In [18]:
# Evaluate the model
accuracy = logreg.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9589552238805971


In [19]:
age = float(input("Enter your age: "))
bmi = float(input("Enter your BMI: "))
charges = float(input("Enter your charges: "))
sex_updated = float(input("Enter your sex_updated (0 for male, 1 for female): "))

In [20]:
# Create a new DataFrame with the user input
user_info = pd.DataFrame({'age': [age], 'bmi': [bmi], 'charges': [charges], 'sex_updated': [sex_updated]})

In [21]:
full_data = pd.concat([X,user_info]).reset_index(drop = True)

In [22]:
# Fitting Standard Scaller
user_scaler = scaler.fit(full_data)
print(user_scaler)

StandardScaler()


In [23]:
# Scaling data
user_input_scaled = user_scaler.transform(full_data)
user_input_scaled[-1]

array([-1.01079318, 36.40995349, -0.99363934,  1.00975633])

In [24]:
# Make prediction on the user data
prediction = logreg.predict([user_input_scaled[-1]])

In [25]:
# Print the prediction
if prediction [0] == 1:
    print("You are predicted to be a smoker.")
else:
    print("You are predicted to be a non-smoker.")

The user is predicted to be a non-smoker.


In [26]:
# Saving model to disk
pickle.dump(logreg, open('smoker_model.pkl','wb'))