# Logistic Regression Analysis Using a Pipeline

## Import necessary libraries

In [1]:
# Importing necessary libraries
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Read Data

In [2]:
# Load the purchase dataset from the Excel workbook and preview the first three rows
df = pd.read_excel(io="purchase_data.xlsx")
df.head(n=3)

Unnamed: 0,age,salary,gender,purchased
0,56.0,136748,male,1
1,46.0,25287,female,1
2,32.0,146593,male,0


In [3]:
# Summarize the frame: row count, per-column non-null counts, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        950 non-null    float64
 1   salary     1000 non-null   int64  
 2   gender     1000 non-null   object 
 3   purchased  1000 non-null   int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 31.4+ KB


In [4]:
# Count the missing values in each column
df.isna().sum()

age          50
salary        0
gender        0
purchased     0
dtype: int64

In [5]:
# Drop every row that contains a missing value. `df` is rebound to the
# filtered copy instead of being mutated in place, so the effect of this cell
# is explicit and re-running it is harmless.
# NOTE(review): the numeric pipeline defined below also mean-imputes; after
# this drop the imputer only matters for missing values at prediction time.
# Confirm that discarding these rows (rather than imputing them) is intended.
df = df.dropna()

In [6]:
# Re-count missing values per column to confirm the rows with NaNs are gone
df.isna().sum()

age          0
salary       0
gender       0
purchased    0
dtype: int64

# Separating Features and Labels

In [7]:
# Target: the `purchased` column; features: every remaining column
y = df['purchased']
X = df.drop(columns=['purchased'])

## Defining numerical and categorical columns

In [8]:
# Column groups consumed by the preprocessing transformers
numeric_features = [
    'age',
    'salary',
]
categorical_features = [
    'gender',
]

## Set Up the Pipeline

In [9]:
# Numeric branch: mean-impute missing values, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [10]:
# Pipeline for processing categorical data
categorical_transformer = Pipeline(steps=[
    # Fill missing values with the most frequent category
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # One-hot encode the categories. handle_unknown='ignore' encodes a category
    # that was not seen during fit as all zeros instead of raising, so the
    # saved pipeline does not fail on unexpected input at prediction time.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [11]:
# Route each column group through its own transformer and combine the outputs
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [12]:
# End-to-end estimator: column preprocessing followed by logistic regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

## Split Data as Train & Test

In [13]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Train the Model

In [14]:
# Training the model: fits the preprocessing steps and the classifier
# on the training split only
pipeline.fit(X_train, y_train)

## Prediction on Test Data

In [15]:
# Predicting on test data: the fitted pipeline preprocesses X_test and
# returns one predicted class label per row
y_pred = pipeline.predict(X_test)

## Model Accuracy

In [16]:
# Share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model accuracy: {accuracy:.2f}")

Model accuracy: 0.78


# Get Prediction on New Data

In [17]:
# Single-row frame holding the observation to score; the column names must
# match the feature columns the pipeline was fitted on
new_data = pd.DataFrame([
    {'age': 40, 'salary': 65000, 'gender': 'male'},
])

In [18]:
# Score the new observation: hard class label plus per-class probabilities
new_prediction = pipeline.predict(new_data)
new_prediction_proba = pipeline.predict_proba(new_data)

# Report the predicted label and the probability of each class for row 0
print(f"Predicted class for the new data: {new_prediction[0]}")
print(f"Probability of class 0: {new_prediction_proba[0, 0]:.2f}")
print(f"Probability of class 1: {new_prediction_proba[0, 1]:.2f}")

Predicted class for the new data: 0
Probability of class 0: 0.89
Probability of class 1: 0.11


# Save the Model

In [19]:
# Save the fitted pipeline (preprocessing + classifier) to disk so the
# Streamlit app can load it. `joblib` is imported with the other dependencies
# in the notebook's import cell.
joblib.dump(pipeline, 'pipeline_model.pkl')

['pipeline_model.pkl']

# STREAMLIT

In [20]:
%%writefile Streamlit_app.py
import streamlit as st
import pandas as pd
import joblib

# Load the saved model
pipeline = joblib.load('pipeline_model.pkl')

# Streamlit title
st.title("Purchase Prediction Model")

# User input
age = st.number_input("Age", min_value=18, max_value=100, value=30)
salary = st.number_input("Salary", min_value=10000, max_value=200000, value=50000)
gender = st.selectbox("Gender", options=["male", "female"])

# Create a DataFrame from the user input
new_data = pd.DataFrame({
    'age': [age],
    'salary': [salary],
    'gender': [gender]
})

# Make a prediction
if st.button("Make Prediction"):
    prediction = pipeline.predict(new_data)
    prediction_proba = pipeline.predict_proba(new_data)

    # Display the results
    st.write(f"Predicted class: {prediction[0]}")
    st.write(f"Probability of class 0: {prediction_proba[0][0]:.2f}")
    st.write(f"Probability of class 1: {prediction_proba[0][1]:.2f}")

Overwriting Streamlit_app.py


In [21]:
# Launch the Streamlit app written by the previous cell via a shell command.
# NOTE(review): the server presumably runs in the foreground, so this cell
# blocks the kernel until it is interrupted (the recorded output is ^C).
!streamlit run Streamlit_app.py

^C


In [None]:
# Done