# Logistic Regression Example

## Installation

### Package

In [12]:
## Install Library
#%pip install backwards_regression==0.2.0  # uncomment to install; %pip installs into the running kernel's environment
#https://pypi.org/project/backwards-regression/

In [13]:
## Load up Library
from backwards_regression import fit_logistic_regression

### Format of Code

In [14]:
#result, dropped_vars = fit_logistic_regression(
    #X,  # X: DataFrame containing predictor variables. Each column should represent a distinct predictor.
    #y,  # y: The response variable. This is the variable you are trying to predict or explain.
    #initial_list=[],  # initial_list: Optional. A list of column names from X that you want to start with in the model. If empty, all columns in X are used.
    #threshold_in=0.01,  # threshold_in: The significance level for adding interaction terms. Interaction terms with p-values below this threshold will be added.
    #threshold_out=0.05,  # threshold_out: The significance level for removing variables. Variables with p-values above this threshold will be removed during the backward elimination.
    #include_interactions=True,  # include_interactions: If set to True, the function will consider interaction terms between predictors for inclusion in the model.
    #verbose=True,  # verbose: If True, provides detailed logging about the function's progress and the variables being added or removed.
    #method='pinv'  # method: The method used to solve the regression. 'pinv' uses the Moore-Penrose pseudoinverse, which is a common choice for OLS (linear) regression; note that the logistic regression examples in this notebook pass method='bfgs' instead.
#):
    # Function body...

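#method can be one of the following (these are linear least-squares solvers; the logistic regression calls later in this notebook use method='bfgs' -- TODO confirm which values fit_logistic_regression accepts)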
    # 'pinv': Moore-Penrose pseudoinverse - A general, robust method suitable for most cases. It handles singular matrices well.
    # 'qr': QR Decomposition - Good for numerical stability and efficiency, especially useful in cases of multicollinearity or when the matrix has more columns than rows.
    # 'lstsq': Least Squares Solution - Similar to 'pinv', it provides a least squares solution, useful for overdetermined systems. It's another robust option for general use.
    # 'cholesky': Cholesky Decomposition - Efficient and fast for positive definite matrices. Best used when the matrix is symmetric and positive definite.


## Logistic Regression

### Data Exploration

In [15]:
## Sample Data

import pandas as pd
import numpy as np

# Set a random seed for reproducibility.
# This seeds NumPy's global random state, which generate_financial_data_binary
# (defined below) draws its samples from.
np.random.seed(42)

# Generate synthetic financial data for logistic regression
def generate_financial_data_binary(n=1000, seed=None):
    """Generate a synthetic financial dataset with a binary target.

    Eight features are drawn independently from normal distributions. A linear
    score is built from five of them (Age, Income, Savings, Debt, CreditScore)
    plus Gaussian noise, and the target is 1 where that score is positive.
    Assets, Liabilities and MonthlyExpenses do not enter the score.

    Parameters
    ----------
    n : int, default 1000
        Number of rows to generate.
    seed : int or None, default None
        If None, samples come from NumPy's global random state, so a prior
        ``np.random.seed(...)`` call controls the result (the original
        behaviour). If given, a private ``np.random.RandomState(seed)`` is
        used, which makes the call reproducible on its own and leaves the
        global random state untouched.

    Returns
    -------
    pandas.DataFrame
        Columns 'Age', 'Income', 'Savings', 'Debt', 'CreditScore', 'Assets',
        'Liabilities', 'MonthlyExpenses' and an integer 'Target' (0 or 1).
    """
    # Both the np.random module and a RandomState instance expose .normal, so
    # the sampling code below is identical for the two cases.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Features. The order of these draws determines the values obtained for a
    # given seed, so it must not be rearranged.
    age = rng.normal(40, 10, n)
    income = rng.normal(50000, 10000, n)
    savings = rng.normal(20000, 5000, n)
    debt = rng.normal(10000, 3000, n)
    credit_score = rng.normal(700, 50, n)

    # Additional variables (not used when building the target)
    assets = rng.normal(30000, 8000, n)
    liabilities = rng.normal(12000, 4000, n)
    monthly_expenses = rng.normal(4000, 1000, n)

    # Binary target: good (1) or not good (0) financial risk.
    # NOTE: the Income and Savings terms vary by thousands while the noise has
    # a standard deviation of 5, so the target is almost a deterministic
    # function of the features and the positive class dominates.
    noise = rng.normal(0, 5, n)
    score = (0.5 * age + 0.2 * income - 0.3 * savings + 0.1 * debt
             + 0.15 * credit_score + noise)
    target = score > 0

    # Create DataFrame
    data = pd.DataFrame({
        'Age': age,
        'Income': income,
        'Savings': savings,
        'Debt': debt,
        'CreditScore': credit_score,
        'Assets': assets,
        'Liabilities': liabilities,
        'MonthlyExpenses': monthly_expenses,
        'Target': target.astype(int)  # Convert boolean to integer (0 or 1)
    })

    return data

# Generate financial dataset for logistic regression.
# Called with the default n (1000 rows); the values depend on the global
# NumPy random state seeded earlier in this cell.
financial_df = generate_financial_data_binary()

In [16]:
## Split financial_df into predictor columns (X) and the binary response (y)
y = financial_df['Target']  # 0/1 outcome column
X = financial_df.drop(columns='Target')  # every remaining column is a predictor

# Preview the first rows of the predictors
print("Features (X):", X.head(), sep="\n")

# Preview the first rows of the response
print("\nBinary target variable (y):", y.head(), sep="\n")

Features (X):
         Age        Income       Savings          Debt  CreditScore  \
0  44.967142  63993.554366  16624.108625   4276.577326   656.825320   
1  38.617357  59246.336829  19277.406646   7418.844968   698.439826   
2  46.476885  50596.303699  16037.900395   8759.183400   700.900844   
3  55.230299  43530.632223  18460.192352  15663.062972   723.631517   
4  37.658466  56982.233136  10531.926665  11669.659374   631.657082   

         Assets   Liabilities  MonthlyExpenses  
0  26609.922544   7543.674567      4785.185082  
1  26372.687133   9476.276665      2222.319038  
2  15634.854619   8231.759266      4714.745650  
3  27359.278466   9808.016725      3766.275942  
4  35862.632655  11143.398757      4707.457711  

Binary target variable (y):
0    1
1    1
2    1
3    1
4    1
Name: Target, dtype: int32


### With Interactions Included

In [17]:
## Backward elimination with interaction terms enabled (include_interactions=True)
result, dropped_vars = fit_logistic_regression(
    X,
    y,
    threshold_in=0.01,
    threshold_out=0.05,
    include_interactions=True,
    verbose=True,  # log each elimination step
    method='bfgs',
)

INFO:root:Dropped feature CreditScore with p-value 0.9831939481100389
INFO:root:Iteration 2: Current features: ['Age', 'Income', 'Savings', 'Debt', 'Assets', 'Liabilities', 'MonthlyExpenses']
INFO:root:Dropped feature Age with p-value 0.7546799739975825
INFO:root:Iteration 3: Current features: ['Income', 'Savings', 'Debt', 'Assets', 'Liabilities', 'MonthlyExpenses']
INFO:root:Dropped feature MonthlyExpenses with p-value 0.9867344468142598
INFO:root:Iteration 4: Current features: ['Income', 'Savings', 'Debt', 'Assets', 'Liabilities']
INFO:root:Dropped feature Liabilities with p-value 0.9322638162546516
INFO:root:Iteration 5: Current features: ['Income', 'Savings', 'Debt', 'Assets']
INFO:root:Dropped feature Assets with p-value 0.9963423366516404
INFO:root:Iteration 6: Current features: ['Income', 'Savings', 'Debt']
INFO:root:Dropped feature Debt with p-value 0.9987833843691373
INFO:root:Iteration 7: Current features: ['Income', 'Savings']
INFO:root:Dropped interaction term Income * Savi

         Current function value: 0.000243
         Iterations: 35
         Function evaluations: 55
         Gradient evaluations: 45
         Current function value: 0.000088
         Iterations: 35
         Function evaluations: 55
         Gradient evaluations: 45
         Current function value: 0.000015
         Iterations: 35
         Function evaluations: 55
         Gradient evaluations: 45
         Current function value: 0.000001
         Iterations: 35
         Function evaluations: 59
         Gradient evaluations: 49
         Current function value: 0.000032
         Iterations: 35
         Function evaluations: 58
         Gradient evaluations: 48
         Current function value: 0.000001
         Iterations: 35
         Function evaluations: 50
         Gradient evaluations: 40
         Current function value: 0.007959
         Iterations: 35
         Function evaluations: 50
         Gradient evaluations: 40
         Current function value: 0.693147
         Iterations:

In [18]:
## Print Selected features
# result is the first value returned by the fit_logistic_regression call above
# (the run with include_interactions=True).
print("Final included features:", result)

Final included features: ['Income', 'Savings']


In [19]:
## Print Eliminated features
# dropped_vars is the second value returned by the fit_logistic_regression call
# above (the run with include_interactions=True).
print("Dropped variables:", dropped_vars)

Dropped variables: ['CreditScore', 'Age', 'MonthlyExpenses', 'Liabilities', 'Assets', 'Debt', 'Income * Savings']


### Without Interactions Included

In [20]:
## Backward elimination with interaction terms disabled (include_interactions=False)
# NOTE: this rebinds result and dropped_vars, replacing the values from the
# run with interactions.
result, dropped_vars = fit_logistic_regression(
    X,
    y,
    threshold_in=0.01,
    threshold_out=0.05,
    include_interactions=False,
    verbose=True,  # log each elimination step
    method='bfgs',
)

INFO:root:Dropped feature CreditScore with p-value 0.9831939481100389
INFO:root:Iteration 2: Current features: ['Age', 'Income', 'Savings', 'Debt', 'Assets', 'Liabilities', 'MonthlyExpenses']
INFO:root:Dropped feature Age with p-value 0.7546799739975825
INFO:root:Iteration 3: Current features: ['Income', 'Savings', 'Debt', 'Assets', 'Liabilities', 'MonthlyExpenses']
INFO:root:Dropped feature MonthlyExpenses with p-value 0.9867344468142598
INFO:root:Iteration 4: Current features: ['Income', 'Savings', 'Debt', 'Assets', 'Liabilities']


         Current function value: 0.000243
         Iterations: 35
         Function evaluations: 55
         Gradient evaluations: 45
         Current function value: 0.000088
         Iterations: 35
         Function evaluations: 55
         Gradient evaluations: 45
         Current function value: 0.000015
         Iterations: 35
         Function evaluations: 55
         Gradient evaluations: 45
         Current function value: 0.000001
         Iterations: 35
         Function evaluations: 59
         Gradient evaluations: 49


INFO:root:Dropped feature Liabilities with p-value 0.9322638162546516
INFO:root:Iteration 5: Current features: ['Income', 'Savings', 'Debt', 'Assets']


         Current function value: 0.000032

INFO:root:Dropped feature Assets with p-value 0.9963423366516404
INFO:root:Iteration 6: Current features: ['Income', 'Savings', 'Debt']
INFO:root:Dropped feature Debt with p-value 0.9987833843691373
INFO:root:Iteration 7: Current features: ['Income', 'Savings']



         Iterations: 35
         Function evaluations: 58
         Gradient evaluations: 48
         Current function value: 0.000001
         Iterations: 35
         Function evaluations: 50
         Gradient evaluations: 40
         Current function value: 0.007959
         Iterations: 35
         Function evaluations: 50
         Gradient evaluations: 40


In [21]:
## Print Selected features
# result now holds the first value returned by the fit_logistic_regression call
# above (the run with include_interactions=False).
print("Final included features:", result)

Final included features: ['Income', 'Savings']


In [22]:
## Print Eliminated features
# dropped_vars now holds the second value returned by the most recent
# fit_logistic_regression call (the run with include_interactions=False).
print("Dropped variables:", dropped_vars)

Dropped variables: ['CreditScore', 'Age', 'MonthlyExpenses', 'Liabilities', 'Assets', 'Debt']
