In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import joblib


In [3]:
# Load the raw dataset. A raw string (r"...") is used because the Windows path
# contains backslash sequences that are not valid escapes in a normal string
# literal; those trigger a warning on recent Python versions.
# NOTE(review): this is an absolute path on a local drive - consider a
# configurable data directory so the notebook runs on other machines.
df = pd.read_csv(r"F:\dataset_for_ML\supervised\chatGptGenerated\complex_pipeline_dataset.csv")

In [4]:
df.head()

Unnamed: 0,id,name,age,gender,income,city,purchased,signup_date,last_login,membership_level
0,1,Jennifer Martinez,18.0,Male,16829.25,South Josephshire,0.0,2021-05-16,2023-09-03,Gold
1,2,Barry Smith,51.0,Female,26270.08,North Elizabeth,0.0,2023-08-17,2024-11-03,Bronze
2,3,Adriana Chang,20.0,Female,54637.94,Desireefurt,0.0,2020-10-20,2025-05-04,Gold
3,4,Kathleen Orozco,38.0,Female,50545.8,Mitchellport,0.0,2022-08-12,,Silver
4,5,Chelsea Phillips,44.0,Male,54028.38,North Herbertmouth,0.0,2021-02-20,2024-03-21,Silver


In [5]:
df.isna().sum()/df.shape[0]

id                  0.00000
name                0.00000
age                 0.02535
gender              0.01925
income              0.05170
city                0.05180
purchased           0.05435
signup_date         0.02745
last_login          0.10210
membership_level    0.01905
dtype: float64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                20000 non-null  int64  
 1   name              20000 non-null  object 
 2   age               19493 non-null  float64
 3   gender            19615 non-null  object 
 4   income            18966 non-null  float64
 5   city              18964 non-null  object 
 6   purchased         18913 non-null  float64
 7   signup_date       19451 non-null  object 
 8   last_login        17958 non-null  object 
 9   membership_level  19619 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 1.5+ MB


In [7]:
df["purchased"].value_counts()

purchased
0.0    12852
1.0     6061
Name: count, dtype: int64

I do not want to work with dates right now, so let's drop the two date columns (`signup_date` and `last_login`).


In [8]:
# Drop the two date columns, which are not used in this notebook.
# Re-binding `df` (instead of inplace=True) together with errors="ignore"
# keeps the cell re-runnable: executing it a second time no longer raises
# KeyError because the columns are already gone.
df = df.drop(columns=["signup_date", "last_login"], errors="ignore")

In [9]:
df.sample(10)

Unnamed: 0,id,name,age,gender,income,city,purchased,membership_level
5364,5365,Sarah Khan,24.0,Female,53903.63,Debraport,0.0,Bronze
16798,16799,Lisa Richmond,58.0,Female,51064.84,New Donaldborough,0.0,Gold
1411,1412,Eric Parrish,62.0,Male,63907.09,Philipton,0.0,Silver
4226,4227,Abigail Jackson,68.0,Female,61250.6,East Nancy,1.0,Bronze
15327,15328,Kristine Carlson,,Other,40918.23,East Jean,1.0,Bronze
491,492,Ashley Jacobson,35.0,Female,55155.03,Nancyborough,1.0,Gold
12595,12596,Thomas Rivas,22.0,Female,42171.26,Cameronberg,0.0,Silver
2877,2878,Juan Fleming,22.0,Female,49568.72,,,Bronze
4436,4437,James Hunter,69.0,Other,62229.45,West Amberstad,1.0,Bronze
9161,9162,Fernando Richard,49.0,Male,40849.13,Port Brian,1.0,Gold


In [10]:
df["city"].value_counts()

city
Lake Michael      18
Port John         18
East Michael      17
West Jennifer     17
Port Michael      17
                  ..
Robertsonshire     1
Moranfurt          1
Muellerview        1
East Ninafort      1
Lake Loriport      1
Name: count, Length: 12763, dtype: int64

In [11]:
# The function below was generated by AI. A simpler version would also work, for example:
# def group_rare_categories(df,threshold=5):
#     df = df.copy()
#     value_counts = df["city"].value_counts()
#     rare_cat = value_counts[value_counts<threshold].index
#     df["city"] = df["city"].apply(lambda x : "other" if x in rare_cat else x)
#     return df
    

def group_rare_categories(df: pd.DataFrame, column: str, threshold: int = 5,
                          replace_with: str = "other", inplace: bool = False) -> "pd.DataFrame | None":
    """
    Groups rare categories in a DataFrame column into a single category (e.g., "other").

    A category is "rare" when it occurs fewer than ``threshold`` times in
    ``column``. Missing values are never counted as a category and are left
    as missing. Values that are not rare keep their original Python type
    (numbers are not turned into strings).

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        column (str): Column name to process.
        threshold (int): Minimum count to keep a category (default=5).
        replace_with (str): Value to replace rare categories with (default="other").
        inplace (bool): If True, modifies the DataFrame in-place (default=False).

    Returns:
        pd.DataFrame (or None if inplace=True)

    Raises:
        ValueError: If ``column`` is not a column of ``df``.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame")

    if not inplace:
        df = df.copy()

    # value_counts() ignores missing values, so NaN is never flagged as rare.
    value_counts = df[column].value_counts()
    rare_categories = value_counts[value_counts < threshold].index
    is_rare = df[column].isin(rare_categories)

    # Only touch the column when something actually needs replacing.
    # Going through an object-dtype view and Series.mask (rather than
    # np.where with a string) avoids coercing a non-string column - and its
    # NaNs - into strings.
    if is_rare.any():
        df[column] = df[column].astype(object).mask(is_rare, replace_with)

    return None if inplace else df

In [12]:
# Collapse every city that occurs fewer than 5 times into the single label "other"
# to shrink the very high cardinality of the `city` column.
# NOTE(review): the counts are computed on the full dataset (before the
# train/test split) and this step is not part of the fitted pipeline, so the
# same grouping would have to be applied to any new data before predicting.
df = group_rare_categories(df, column="city", threshold=5)

In [13]:
df["city"].value_counts()

city
other             16271
Port John            18
Lake Michael         18
West Jennifer        17
Port Michael         17
                  ...  
Rodriguezmouth        5
North Stephen         5
South Daniel          5
Brownfurt             5
Garciafurt            5
Name: count, Length: 402, dtype: int64

Let's plan our simple pre-processing steps!
What must we do?
1) Fill the null values --> we use `SimpleImputer`.
2) Scale the `income` and `age` columns --> we use standard scaling because we are not worried much about outliers here.
3) Encode the categorical columns as numbers so that ML models can interpret them.

In [14]:
df.select_dtypes("float").describe()

Unnamed: 0,age,income,purchased
count,19493.0,18966.0,18913.0
mean,43.456882,50233.728518,0.320467
std,14.933461,15096.135082,0.466669
min,18.0,148.23,0.0
25%,30.0,39690.985,0.0
50%,44.0,50463.28,0.0
75%,56.0,60465.64,1.0
max,69.0,103040.72,1.0


In [15]:
df["membership_level"].value_counts()

membership_level
Bronze      7986
Silver      6015
Gold        3993
Platinum    1625
Name: count, dtype: int64

In [16]:
# Column groups routed to the different preprocessing branches.
# Plain lists are enough here; slicing the DataFrame just to read the names
# back created throw-away copies of the data.
num_cols = ["age", "income"]
one_hot_cols = ["gender", "city"]
ordinal_cols = ["membership_level"]
# Category order handed to OrdinalEncoder: the position in the inner list
# becomes the integer code (Platinum -> 0, Gold -> 1, Silver -> 2, Bronze -> 3).
membership_level = [["Platinum", "Gold", "Silver", "Bronze"]]

In [32]:
one_hot_cols

['gender', 'city']

##### Using pipelines

In [18]:
# Numeric branch: fill missing values with the column mean, then standardize.
numerical_pipelining = Pipeline(
    steps=[
        ("simple_imputing", SimpleImputer(strategy="mean")),
        ("Standard Scaling", StandardScaler()),
    ]
)

In [19]:
# Nominal branch: fill missing values with the most frequent category, then one-hot encode.
# NOTE(review): with drop="first" and handle_unknown="ignore", a category unseen
# during fit is encoded as all zeros - the same encoding as the dropped first
# category - so the two cannot be told apart downstream.
one_hot_pipelining = Pipeline(
    steps=[
        ("simple imputing", SimpleImputer(strategy="most_frequent")),
        (
            "one_hot_encoding",
            OneHotEncoder(
                handle_unknown="ignore",
                sparse_output=False,
                drop="first",
            ),
        ),
    ]
)

In [20]:
# Ordered branch: fill missing values with the most frequent level, then map
# each level to an integer following the order given in `membership_level`.
ordinal_pipelining = Pipeline(
    steps=[
        ("simple imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal encoding", OrdinalEncoder(categories=membership_level)),
    ]
)

In [21]:
# Route each column group to its own sub-pipeline. No `remainder` is given, so
# the ColumnTransformer default applies and columns not listed here are dropped.
preprocessing = ColumnTransformer(transformers=[
    ("for numerical", numerical_pipelining, num_cols),
    ("for one-hot-encoding", one_hot_pipelining, one_hot_cols),
    ("for ordinal", ordinal_pipelining, ordinal_cols),
])

In [22]:
# Full model: preprocessing followed by a logistic-regression classifier.
# NOTE(review): the variable name matches sklearn.pipeline.make_pipeline; that
# helper is not imported in this notebook, but the name may confuse readers.
make_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessing),
        ("logisticRegression", LogisticRegression()),
    ]
)

In [23]:
make_pipeline

In [24]:
df.columns

Index(['id', 'name', 'age', 'gender', 'income', 'city', 'purchased',
       'membership_level'],
      dtype='object')

In [25]:
# Keep only rows with a known target: `purchased` is used as the label below,
# so rows where it is missing cannot be used for training or evaluation.
df = df.dropna(subset=["purchased"])


In [26]:
# Feature matrix and target vector.
# `name` is selected here but is not listed in any ColumnTransformer branch.
X = df[["name", "age", "gender", "income", "city", "membership_level"]]
y = df.loc[:, "purchased"]

In [27]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((15130, 6), (3783, 6), (15130,), (3783,))

In [29]:
# Fit the preprocessing steps and the classifier on the training split only.
make_pipeline.fit(X_train,y_train)

In [30]:
# (train accuracy, test accuracy) - `score` on a classifier pipeline reports mean accuracy.
# NOTE(review): roughly 68% of the labelled rows are class 0 (see the
# `purchased` value counts earlier), so compare these numbers against a
# majority-class baseline before trusting the model.
make_pipeline.score(X_train,y_train),make_pipeline.score(X_test,y_test)

(0.6853932584269663, 0.6711604546656093)

In [31]:
# Persist the fitted pipeline (preprocessing + model) to the current working directory.
# The file is pickle-based, so only load it back from a trusted source.
joblib.dump(make_pipeline, "my_pipeline.pkl")


['my_pipeline.pkl']