# Linear Regression Example

## Installation

### Package

In [1]:
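## Install the library (uncomment the next line to run it; %pip installs into the active kernel's environment)
#%pip install backwards_regression==0.2.0

# Package page: https://pypi.org/project/backwards-regression/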

In [2]:
## Load up Library
from backwards_regression import fit_linear_regression

### Format of Code

In [3]:
#result, dropped_vars = fit_linear_regression(
    #X,  # The DataFrame containing the predictor variables.
    #y,  # The response variable, typically a Series or a one-dimensional array.
    #initial_list=[],  # Optional. A list of initial variables to consider. If empty, all variables in X are used.
    #threshold_in=0.01,  # The significance level for adding variables to the model. Variables with p-values lower than this threshold will be considered.
    #threshold_out=0.05,  # The significance level for removing variables from the model. Variables with p-values higher than this threshold will be removed.
    #include_interactions=True,  # Whether to consider interaction terms between variables. If True, interaction terms are evaluated and added if significant.
    #verbose=True,  # Controls the verbosity of the output. If True, logs detailed information about the regression process.
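    #method='bfgs'  # The optimization method used in fitting the model. Default is 'bfgs', but can be changed to other methods like 'newton', 'nm', etc. NOTE(review): this wording and the optimizer list below describe logistic-regression optimizers; the linear-regression examples in this notebook pass method='pinv' - confirm the accepted values against the package documentation.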
#):
    # Function body...
# method can be one of:
        # 'bfgs': Broyden–Fletcher–Goldfarb–Shanno algorithm - Good default choice, handles a wide range of problems well.
        # 'newton': Newton-Raphson method - Uses first and second derivatives, efficient for well-behaved problems.
        # 'nm': Nelder-Mead simplex algorithm - Derivative-free, robust, suitable for problems where derivatives are unreliable.
        # 'cg': Conjugate Gradient - Good for large-scale problems, an iterative method that requires careful tuning.
        # 'ncg': Newton-Conjugate Gradient - Combines Newton-Raphson and Conjugate Gradient, efficient for large-scale problems.
        # 'lbfgs': Limited-memory BFGS - Suitable for problems with a large number of parameters, uses limited memory.
        # 'powell': Powell’s method - A derivative-free optimizer, robust but potentially slower than gradient-based methods.


## Linear Regression

### Data Exploration

In [4]:
## Sample Data

import pandas as pd
import numpy as np

# Seed NumPy's global (legacy) random state so the np.random draws below are repeatable
np.random.seed(seed=42)

# Generate synthetic financial data for linear regression
def generate_financial_data(n=1000, seed=None):
    """Build a synthetic financial dataset for the linear-regression demo.

    Eight predictor columns are drawn independently from normal
    distributions. ``Target`` is a linear combination of five of them
    (Age, Income, Savings, Debt, CreditScore) plus Gaussian noise with
    standard deviation 5. Assets, Liabilities and MonthlyExpenses do not
    enter the target.

    Parameters
    ----------
    n : int, default 1000
        Number of rows to generate.
    seed : int or None, default None
        When ``None`` (the default) the draws come from NumPy's global
        random state, so the result is governed by any earlier
        ``np.random.seed`` call. When given, a private
        ``np.random.RandomState(seed)`` is used instead and the global
        state is left untouched.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with the columns Age, Income, Savings, Debt,
        CreditScore, Assets, Liabilities, MonthlyExpenses and Target.
    """
    # The np.random module and a RandomState instance both expose .normal,
    # so one code path serves the global-state and the private-seed case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # The order of the draws below fixes which random numbers land in which
    # column; keep it unchanged so a given seed keeps producing the same data.

    # Features that drive the target
    age = rng.normal(40, 10, n)
    income = rng.normal(50000, 10000, n)
    savings = rng.normal(20000, 5000, n)
    debt = rng.normal(10000, 3000, n)
    credit_score = rng.normal(700, 50, n)

    # Additional variables that are unrelated to the target
    assets = rng.normal(30000, 8000, n)
    liabilities = rng.normal(12000, 4000, n)
    monthly_expenses = rng.normal(4000, 1000, n)

    # Linear relationship plus noise
    noise = rng.normal(0, 5, n)
    target = (
        0.5 * age
        + 0.2 * income
        - 0.3 * savings
        + 0.1 * debt
        + 0.15 * credit_score
        + noise
    )

    return pd.DataFrame({
        'Age': age,
        'Income': income,
        'Savings': savings,
        'Debt': debt,
        'CreditScore': credit_score,
        'Assets': assets,
        'Liabilities': liabilities,
        'MonthlyExpenses': monthly_expenses,
        'Target': target
    })

# Build the demo dataset; the row count is spelled out and equals the function's default
financial_df = generate_financial_data(n=1000)


In [5]:
## Split the dataset into the response (y) and the predictors (X)
y = financial_df['Target']                # response column
X = financial_df.drop(columns='Target')   # every column except the response

# Preview the first rows of each; sep="\n" puts the heading on its own line
print("Features (X):", X.head(), sep="\n")
print("\nTarget variable (y):", y.head(), sep="\n")

Features (X):
         Age        Income       Savings          Debt  CreditScore  \
0  44.967142  63993.554366  16624.108625   4276.577326   656.825320   
1  38.617357  59246.336829  19277.406646   7418.844968   698.439826   
2  46.476885  50596.303699  16037.900395   8759.183400   700.900844   
3  55.230299  43530.632223  18460.192352  15663.062972   723.631517   
4  37.658466  56982.233136  10531.926665  11669.659374   631.657082   

         Assets   Liabilities  MonthlyExpenses  
0  26609.922544   7543.674567      4785.185082  
1  26372.687133   9476.276665      2222.319038  
2  15634.854619   8231.759266      4714.745650  
3  27359.278466   9808.016725      3766.275942  
4  35862.632655  11143.398757      4707.457711  

Target variable (y):
0    8359.978261
1    6929.486270
2    6311.320656
3    4874.108572
4    9523.801644
Name: Target, dtype: float64


### With Interactions Included

In [6]:
## Backward elimination with interaction terms switched on (include_interactions=True)
result, dropped_vars = fit_linear_regression(
    X,
    y,
    threshold_in=0.01,
    threshold_out=0.05,
    include_interactions=True,
    verbose=True,  # log each dropped feature / interaction term
    method='pinv',
)

INFO:root:Dropped feature Liabilities with p-value 0.9602872163010282
INFO:root:Iteration 2: Current features: ['Age', 'Income', 'Savings', 'Debt', 'CreditScore', 'Assets', 'MonthlyExpenses']
INFO:root:Dropped feature Assets with p-value 0.6417684824230647
INFO:root:Iteration 3: Current features: ['Age', 'Income', 'Savings', 'Debt', 'CreditScore', 'MonthlyExpenses']
INFO:root:Dropped feature MonthlyExpenses with p-value 0.38515644186354714
INFO:root:Iteration 4: Current features: ['Age', 'Income', 'Savings', 'Debt', 'CreditScore']
INFO:root:Dropped interaction term Age * Income with p-value 0.9917758827207849
INFO:root:Dropped interaction term Age * Savings with p-value 0.610145390391784
INFO:root:Dropped interaction term Age * Debt with p-value 0.5463331331649792
INFO:root:Dropped interaction term Age * CreditScore with p-value 0.3218387082529018
INFO:root:Dropped interaction term Income * Savings with p-value 0.01585052347050218
INFO:root:Dropped interaction term Income * Debt with p

In [7]:
## Print Selected features
# `result` is the first value returned by the fit_linear_regression call in the
# "With Interactions Included" cell above.
# NOTE: the same name is rebound by the later "Without Interactions Included" run,
# so running this cell after that one shows that run's value instead.
print("Final included features:", result)

Final included features: ['Age', 'Income', 'Savings', 'Debt', 'CreditScore']


In [8]:
## Print Eliminated features
# `dropped_vars` is the second value returned by the fit_linear_regression call in
# the "With Interactions Included" cell above.
# NOTE: the same name is rebound by the later "Without Interactions Included" run,
# so running this cell after that one shows that run's value instead.
print("Dropped variables:", dropped_vars)

Dropped variables: ['Liabilities', 'Assets', 'MonthlyExpenses', 'Age * Income', 'Age * Savings', 'Age * Debt', 'Age * CreditScore', 'Income * Savings', 'Income * Debt', 'Income * CreditScore', 'Savings * Debt', 'Savings * CreditScore', 'Debt * CreditScore']


### Without Interactions Included

In [9]:
## Backward elimination with interaction terms switched off (include_interactions=False)
result, dropped_vars = fit_linear_regression(
    X,
    y,
    threshold_in=0.01,
    threshold_out=0.05,
    include_interactions=False,
    verbose=True,  # log each dropped feature
    method='pinv',
)

INFO:root:Dropped feature Liabilities with p-value 0.9602872163010282
INFO:root:Iteration 2: Current features: ['Age', 'Income', 'Savings', 'Debt', 'CreditScore', 'Assets', 'MonthlyExpenses']
INFO:root:Dropped feature Assets with p-value 0.6417684824230647
INFO:root:Iteration 3: Current features: ['Age', 'Income', 'Savings', 'Debt', 'CreditScore', 'MonthlyExpenses']
INFO:root:Dropped feature MonthlyExpenses with p-value 0.38515644186354714
INFO:root:Iteration 4: Current features: ['Age', 'Income', 'Savings', 'Debt', 'CreditScore']


In [10]:
## Print Selected features
# `result` here is the first value returned by the include_interactions=False call
# directly above; it replaces the value bound by the earlier run.
print("Final included features:", result)

Final included features: ['Age', 'Income', 'Savings', 'Debt', 'CreditScore']


In [11]:
## Print Eliminated features
# `dropped_vars` here is the second value returned by the include_interactions=False
# call above; it replaces the value bound by the earlier run.
print("Dropped variables:", dropped_vars)

Dropped variables: ['Liabilities', 'Assets', 'MonthlyExpenses']
