# Linear Regression Data Prep

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import ml_utils as mt
import pandas as pd 
import numpy as np

In [3]:
# load the loan training data from a CSV file in the notebook's working directory
data_train=pd.read_csv(r'./loan_data_train.csv')

In [4]:
# preview the first rows to see which columns need parsing (percent strings, ranges, text durations)
data_train.head()

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,79542.0,25000,25000.0,18.49%,60 months,debt_consolidation,27.56%,VA,MORTGAGE,8606.56,720-724,11,15210,3.0,5 years
1,75473.0,19750,19750.0,17.27%,60 months,debt_consolidation,13.39%,NY,MORTGAGE,6737.5,710-714,14,19070,3.0,4 years
2,67265.0,2100,2100.0,14.33%,36 months,major_purchase,3.50%,LA,OWN,1000.0,690-694,13,893,1.0,< 1 year
3,80167.0,28000,28000.0,16.29%,36 months,credit_card,19.62%,NV,MORTGAGE,7083.33,710-714,12,38194,1.0,10+ years
4,17240.0,24250,17431.82,12.23%,60 months,credit_card,23.79%,OH,MORTGAGE,5833.33,730-734,6,31061,2.0,10+ years


In [5]:
def dtr(orig_col):
    """
    Convert a string column of percentages (e.g. '27.56%') to numbers.

    The '%' character is removed and the remainder is parsed with
    pd.to_numeric; values that cannot be parsed become NaN.
    """
    without_pct=orig_col.str.replace('%','')
    return pd.to_numeric(without_pct,errors='coerce')
    

def fico(orig_col):
    """
    Convert a 'low-high' range string column (e.g. '720-724') to the midpoint of the range.

    Each side of the '-' is parsed with pd.to_numeric (unparseable -> NaN),
    and the result is the average of the two sides.
    """
    bounds=orig_col.str.split('-',expand=True)

    lower=pd.to_numeric(bounds[0],errors='coerce')
    upper=pd.to_numeric(bounds[1],errors='coerce')

    return 0.5*(lower+upper)
    

def el(orig_col):
    """
    Convert an employment-length text column to a number of years.

    '10+ years' becomes 10, '< 1 year' becomes 0, and for every other value the
    words 'years' / 'year' are removed before parsing with pd.to_numeric
    (anything that still cannot be parsed becomes NaN).
    """
    # handle the two special labels first, as literal (non-regex) replacements
    cleaned=orig_col.str.replace('10+ years','10',regex=False)
    cleaned=cleaned.str.replace('< 1 year','0',regex=False)

    # drop the unit word; 'years' is removed before 'year' so no stray 's' is left behind
    cleaned=cleaned.str.replace('years','')
    cleaned=cleaned.str.replace('year','')

    return pd.to_numeric(cleaned,errors='coerce')


# Column groups handed to ml_utils.DataPipe. The exact treatment of each group is
# implemented in ml_utils and is not visible here; the descriptions below follow the
# argument names only -- TODO confirm against ml_utils.
# presumably categorical columns to be expanded into dummy variables
cat_to_dummies=['Loan.Length','Loan.Purpose','State','Home.Ownership']
# presumably columns that need coercion to a numeric type
cat_to_num=['Amount.Requested','Open.CREDIT.Lines','Revolving.CREDIT.Balance']
# presumably columns that are numeric already
simple_num=['Monthly.Income','Inquiries.in.the.Last.6.Months']
# column name -> parsing function defined above (dtr, fico, el)
custom_func_dict={'Debt.To.Income.Ratio':dtr,'FICO.Range':fico,'Employment.Length':el}

# build the (not yet fitted) preprocessing pipeline
dp=mt.DataPipe(cat_to_dummies=cat_to_dummies,
                 cat_to_num=cat_to_num,
                 simple_num=simple_num,
                 custom_func_dict=custom_func_dict)

In [6]:
# fit the preprocessing pipeline on the training data (the cell displays the fitted DataPipe object)
dp.fit(data_train)

<ml_utils.DataPipe at 0x1636f3dc0>

In [7]:
# apply the fitted pipeline to get the feature matrix used for modelling
x_train=dp.transform(data_train)

In [8]:
# target: interest rate with the '%' sign stripped, as float (e.g. '18.49%' -> 18.49)
y_train=data_train['Interest.Rate'].str.replace('%','').astype(float)

# Estimating Model Coefficients with Closed Form Solution

In [None]:
# work on a copy so that adding the constant column below does not modify x_train
x_la=x_train.copy()

In [None]:
# prepend a column of ones so the first weight acts as the intercept
# (in-place: re-running this cell raises because the 'constant' column already exists)
x_la.insert(0,'constant',1)

In [None]:
# X'X for the design matrix (constant column included)
x_t_x=np.dot(x_la.T,x_la)

# explicit inverse of X'X; np.linalg.inv raises LinAlgError when the matrix is singular
x_t_x_inv=np.linalg.inv(x_t_x)

In [None]:
# y'X: one value per column of x_la (y_train is one-dimensional, so .T has no effect)
y_t_x=np.dot(y_train.T,x_la)

In [None]:
# bare expression; only the last expression of a cell is displayed, so this line shows nothing
y_t_x

# closed-form least-squares weights: (X'X)^-1 X'y
w_la=np.dot(x_t_x_inv,y_t_x)

# display the estimated weights (intercept first)
w_la

# Estimating Model Coefficient with Gradient Descent

In [None]:
def mypred(x,w):
    """Linear prediction: matrix product of the features `x` with the weights `w`."""
    return x@w


def myerror(y,x,w):
    """Residuals of the linear model: actual values `y` minus the predictions `mypred(x,w)`."""
    return y-mypred(x,w)


def mycost(y,x,w):
    """
    Sum of squared residuals of the linear model with weights `w`.

    Computed as the inner product of the residual vector with itself
    (no averaging over the number of rows).
    """
    residuals=myerror(y,x,w)
    return residuals.T@residuals


def gradient(y,x,w):
    """
    Gradient of the squared-error loss with respect to `w`, averaged over the rows of `x`.

    Returns -X'(y - Xw) / n where n is the number of rows; this is the gradient of
    the sum of squared residuals divided by 2n.
    """
    residuals=myerror(y,x,w)
    n_rows=x.shape[0]
    return -x.T@residuals/n_rows

def my_lr_sgd(y,x,learning_rate,num_steps,batch_size=100,decay_rate=0.9,tol=1e-6):
    """
    Estimate linear-regression weights with mini-batch stochastic gradient descent.

    Each step draws `batch_size` row positions at random (with replacement), evaluates
    `gradient` on that mini-batch and moves the weights against it. After every step the
    cost on the full data is computed with `mycost`; the loop stops early when that cost
    changes by less than `tol` between two consecutive steps.

    Parameters:
    - y: target values; a pandas Series (rows are taken by position) or a numpy array
    - x: pandas DataFrame of features, one row per observation (rows are taken with .iloc)
    - learning_rate: initial step size
    - num_steps: maximum number of update steps
    - batch_size: rows sampled per step (default 100, the previously hard-coded value)
    - decay_rate: factor applied to the learning rate every 20000 steps, including
      step 0 (default 0.9, the previously hard-coded value)
    - tol: threshold on the absolute change of the full-data cost used as stopping rule
      (default 1e-6, the previously hard-coded value)

    Returns:
    - weights: numpy array with one weight per column of x
    """
    weights=np.zeros(x.shape[1])
    prev_cost = float('inf')
    n_obs=x.shape[0]

    for i in np.arange(num_steps):
        # passing the row count directly avoids rebuilding range(n_obs) on every step
        rand_ind=np.random.choice(n_obs,batch_size)

        # select y by position, exactly like x below; label-based y[rand_ind] picks
        # wrong rows (or raises) when y does not carry a default 0..n-1 index
        y_sub=y.iloc[rand_ind] if hasattr(y,'iloc') else y[rand_ind]
        x_sub=x.iloc[rand_ind,:]

        gd=gradient(y_sub,x_sub,weights)

        # np.asarray drops any pandas labels so the in-place update is a plain array operation
        weights -= learning_rate*np.asarray(gd)

        # full-data cost after this step (this is the expensive part of each iteration)
        curr_cost = mycost(y, x, weights)

        # Stop if converged
        if np.abs(prev_cost - curr_cost) < tol:
            print(f"Converged at iteration {i}")
            break

        # periodic progress report and learning-rate decay
        if i%20000==0:
            print(i,curr_cost)
            learning_rate*=decay_rate

        prev_cost = curr_cost

    return weights

In [None]:
# cost of the closed-form solution on the training data; the reference value for gradient descent
mycost(y_train,x_la,w_la)

$$
\begin{align*}
w_j&=\frac{{w_j}'}{\sigma_j} \quad \forall j \in \{1,2,\cdots,p\}\\
w_0&={w_0}'-\sum_{j=1}^p\frac{\mu_j{w_j}'}{\sigma_j}
\end{align*}
$$

In [None]:
def convert_to_non_standardized_weights(scaler, standardized_weights):
    """
    Map weights estimated on standardized features back to the original feature scale.

    With standardized feature z_j = (x_j - mean_j) / scale_j, the model
    w0' + sum_j w_j' z_j equals w0 + sum_j w_j x_j for
        w_j = w_j' / scale_j                      (j >= 1)
        w_0 = w_0' - sum_j mean_j * w_j' / scale_j

    Parameters:
    - scaler: fitted object exposing `mean_` and `scale_` (one value per feature),
      e.g. a fitted StandardScaler
    - standardized_weights: sequence of weights with the intercept first, followed by
      one weight per feature in the same order as `scaler.mean_`

    Returns:
    - numpy float array of the same length holding the weights on the original scale
    """
    # Extract mean and scale (standard deviation) from the scaler object
    means = scaler.mean_
    scales = scaler.scale_

    # Work in floating point: the result of np.zeros_like would inherit an integer
    # dtype from integer input and silently truncate the rescaled weights
    std_w = np.asarray(standardized_weights, dtype=float)
    non_standardized_weights = np.zeros_like(std_w)

    # w_j_non_standardized = w_j_standardized / scale_j for j > 0
    non_standardized_weights[1:] = std_w[1:] / scales

    # Adjust the intercept term (w_0)
    w0_adjustment = np.sum((means * std_w[1:]) / scales)
    non_standardized_weights[0] = std_w[0] - w0_adjustment

    return non_standardized_weights

In [None]:
from sklearn.preprocessing import StandardScaler
# standardize the features before running gradient descent
scaler=StandardScaler()
scaler.fit(x_train)
# rebuild a DataFrame so the column names survive the transform
x_sd=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns)
# the constant column is added after scaling, so it stays equal to 1
x_sd.insert(0,'constant',1)

In [None]:
# run SGD on the standardized data: initial learning rate 0.01, at most 500000 steps
w_sgd=my_lr_sgd(y_train,x_sd,.01,500000)
# See how the cost improves quickly at first and then, as we approach the optimum,
# progress slows down. This goes on for a while; be patient for 10-15 minutes.

In [None]:
# bring the SGD weights back to the original feature scale so they are comparable with w_la
# NOTE: this overwrites w_sgd, so re-running this cell alone applies the conversion a second time
w_sgd=convert_to_non_standardized_weights(scaler,w_sgd)

In [None]:
# cost of the de-standardized SGD weights on the original (unscaled) design matrix
mycost(y_train,x_la,w_sgd)

In [None]:
# you can see that we have been able to reach cost levels almost as good as closed form solution
# try other optimisers that we discussed and see how those fare 

In [None]:
# side-by-side (feature, closed-form weight, SGD weight); the leading 1 labels the intercept
list(zip([1]+list(x_train.columns),list(w_la),list(w_sgd)))

# sklearn estimates

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
# sklearn's linear regression with default settings
sk_lr=LinearRegression()

In [20]:
# fit on the unscaled features; no constant column is passed in x_train
sk_lr.fit(x_train,y_train)

In [21]:
# put the intercept first so the list lines up with w_la / w_sgd
w_sk=[sk_lr.intercept_]+list(sk_lr.coef_)

In [22]:
# side-by-side (feature, closed-form, SGD, sklearn) weights; the leading 1 labels the intercept.
# The SGD estimates are stored in `w_sgd`; `w_gd` is not defined anywhere in the visible notebook.
# Needs w_la and w_sgd from the closed-form and gradient-descent sections above.
list(zip([1]+list(x_train.columns),list(w_la),list(w_sgd),w_sk))

NameError: name 'w_la' is not defined

# Ridge and Lasso [Linear Regression with $l_2$ and $l_1$ Penalty] with gradient descent

The code below can be used with a fixed value for the penalty parameter $\alpha$, with either the $l_1$ or the $l_2$ penalty at a time. Do experiment with it.

In [None]:
def mypred(x, w):
    """Linear prediction: matrix product of the features `x` with the weights `w`."""
    return x @ w

def myerror(y, x, w):
    """Residuals of the linear model: actual values `y` minus the predictions `mypred(x, w)`."""
    return y - mypred(x, w)

def mycost(y, x, w, alpha=0.1, penalty='l2'):
    """
    Regularized squared-error cost for linear regression.

    The data term is the sum of squared residuals divided by twice the number of
    rows; the penalty term is `alpha` times either the sum of absolute weights
    ('l1') or the sum of squared weights ('l2'). All entries of `w` are penalized.

    Parameters:
    - y: actual values
    - x: input data
    - w: weights
    - alpha: regularization strength
    - penalty: 'l1' for Lasso (L1) or 'l2' for Ridge (L2)

    Returns:
    - cost: the regularized cost function value

    Raises:
    - ValueError: if `penalty` is neither 'l1' nor 'l2'
    """
    residuals = myerror(y, x, w)
    n_rows = x.shape[0]

    # half the mean squared error
    data_term = residuals.T @ residuals / (2 * n_rows)

    if penalty == 'l1':
        return data_term + alpha * np.sum(np.abs(w))
    if penalty == 'l2':
        return data_term + alpha * np.sum(w ** 2)

    raise ValueError("Invalid penalty type. Use 'l1' or 'l2'.")

def gradient(y, x, w, alpha=0.1, penalty='l2'):
    """
    Gradient of the regularized squared-error cost with respect to `w`.

    The data part is -X'(y - Xw) / n; the penalty part is `alpha * sign(w)` for
    'l1' (a subgradient) or `alpha * 2 * w` for 'l2'.

    Parameters:
    - y: actual values
    - x: input data
    - w: weights
    - alpha: regularization strength
    - penalty: 'l1' for Lasso (L1) or 'l2' for Ridge (L2)

    Returns:
    - grad: the gradient of the cost function with L1 or L2 regularization

    Raises:
    - ValueError: if `penalty` is neither 'l1' nor 'l2'
    """
    residuals = myerror(y, x, w)
    n_rows = x.shape[0]

    # gradient of the data term
    data_grad = -x.T @ residuals / n_rows

    if penalty == 'l1':
        # np.sign gives a subgradient of |w| (0 at w == 0)
        return data_grad + alpha * np.sign(w)
    if penalty == 'l2':
        return data_grad + alpha * (2 * w)

    raise ValueError("Invalid penalty type. Use 'l1' or 'l2'.")

## Ridge and Lasso [linear regression with $l_2$ and $l_1$ penalty] with sklearn 

In [9]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [10]:
# candidate penalty strengths: 1, 2, ..., 100
alphas_ridge=np.linspace(1,100,100)

These are the values of $\alpha$ we want to start our experiment with. `GridSearchCV` scores all of these values with cross validation, and the function `ml_utils.report` lets us extract which parameter combination scores best. The parameter values we want to experiment with need to be passed as a dictionary.

In [11]:
# parameter grid for the search, keyed by the estimator's argument name
params_ridge={'alpha':alphas_ridge} # key values here need to match exactly as the argument in the modeling functions

In [12]:
# base estimator; alpha is set by the grid search for every candidate
lr_ridge=Ridge() # this is the model for which we want to experiment with parameter values 

In [13]:
# cross-validated search over the ridge alphas; scores are negated mean absolute errors,
# so values closer to zero are better
gs_ridge=GridSearchCV(lr_ridge,
               param_grid=params_ridge,
               cv=10, # 10 fold cross validation 
               scoring='neg_mean_absolute_error', # all models are scored with this criterion
               verbose=20, # higher number should print more info while fitting [currently doesn't work well with jupyter]
               n_jobs=-1) # allows for parallel processing

In [14]:
# 100 candidates x 10 folds = 1000 fits (see the output below)
gs_ridge.fit(x_train,y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [15]:
# print the 5 best-ranked parameter settings with their mean validation score and std
mt.report(gs_ridge.cv_results_,5)

Model with rank: 1
Mean validation score: -1.604686 (std: 0.117972)
Parameters: {'alpha': 49.0}

Model with rank: 2
Mean validation score: -1.604688 (std: 0.118008)
Parameters: {'alpha': 50.0}

Model with rank: 3
Mean validation score: -1.604691 (std: 0.117942)
Parameters: {'alpha': 48.0}

Model with rank: 4
Mean validation score: -1.604693 (std: 0.118045)
Parameters: {'alpha': 51.0}

Model with rank: 5
Mean validation score: -1.604698 (std: 0.118082)
Parameters: {'alpha': 52.0}



We see that the best value for alpha is `49`. We can fine-tune this further by exploring a close range around this value.

In [None]:
# finer grid of 25 values between 48 and 50, around the best alpha of the coarse search
params_ridge={'alpha':np.linspace(48,50,25)} 

In [None]:
# display the refined grid
params_ridge

In [None]:
# same search setup as before, now over the refined alpha grid (replaces the earlier gs_ridge)
gs_ridge=GridSearchCV(lr_ridge,
               param_grid=params_ridge,
               cv=10, 
               scoring='neg_mean_absolute_error',
               verbose=20, 
               n_jobs=-1) 

In [None]:
# run the refined search
gs_ridge.fit(x_train,y_train)

In [None]:
# top 5 settings of the refined search
mt.report(gs_ridge.cv_results_,5)

we can build the model for best parameter values separately

In [24]:
# alpha=49 is the rank-1 setting reported by the coarse grid search above (hard-coded here)
ridge_final=Ridge(**{'alpha': 49})

In [25]:
# refit on the full training data with the chosen alpha
ridge_final.fit(x_train,y_train)

Look at the weight estimates: none of them has been suppressed to exactly zero, but you will see that many have been suppressed by a large factor.

In [26]:
# one row per weight (intercept first): unpenalized sklearn weights (w_sk) next to the ridge weights
ridge_wt_comparison=pd.DataFrame({'features':['bias']+list(x_train.columns),
                                  'simple_model':w_sk,'ridge_wts':[ridge_final.intercept_]+list(ridge_final.coef_)})

In [27]:
# unpenalized weight divided by ridge weight: magnitude above 1 means ridge shrank the weight,
# a negative value means the sign changed between the two models
ridge_wt_comparison['suppression_ratio_l2']=ridge_wt_comparison['simple_model']/ridge_wt_comparison['ridge_wts']

In [28]:
# display the comparison table
ridge_wt_comparison

Unnamed: 0,features,simple_model,ridge_wts,suppression_ratio_l2
0,bias,74.233741,73.661457,1.007769
1,Loan.Length_36 months,1.292842,-1.406586,-0.919135
2,Loan.Length_60 months,4.459656,1.519287,2.93536
3,Loan.Purpose_debt_consolidation,-1.496679,-0.328716,4.553106
4,Loan.Purpose_credit_card,-1.613132,-0.413548,3.900716
5,Loan.Purpose_other,-0.680605,0.376963,-1.805494
6,Loan.Purpose_home_improvement,-1.416846,-0.187597,7.552624
7,Loan.Purpose_major_purchase,-1.142537,0.004419,-258.564333
8,Loan.Purpose_small_business,-1.0028,0.102684,-9.765916
9,Loan.Purpose___other__,-0.732443,0.218014,-3.359616


you can make prediction with the fitted model in a similar manner 

In [None]:
# predictions of the final ridge model on the training features
ridge_final.predict(x_train)

If you want to make predictions on test data, transform it first with the datapipe we fitted earlier on the training data.

now lets see how $l_1$ penalty affects things

In [29]:
from sklearn.linear_model import Lasso


In [30]:
# start the lasso search with the same coarse grid as ridge: 1, 2, ..., 100
alphas_lasso=np.linspace(1,100,100)
params_lasso={'alpha':alphas_lasso}

In [31]:
# same cross-validated search setup as for ridge, with Lasso as the estimator
lr_lasso=Lasso()
gs_lasso=GridSearchCV(lr_lasso,
               param_grid=params_lasso,
               cv=10,
               scoring='neg_mean_absolute_error',
               verbose=20,
               n_jobs=-1)

In [32]:
# 100 candidates x 10 folds = 1000 fits; verbose=20 prints one START/END line per fit
gs_lasso.fit(x_train,y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
[CV 3/10; 1/100] START alpha=1.0................................................
[CV 3/10; 1/100] END ................alpha=1.0;, score=-1.759 total time=   0.0s
[CV 6/10; 2/100] START alpha=2.0................................................
[CV 6/10; 2/100] END ................alpha=2.0;, score=-1.628 total time=   0.0s
[CV 7/10; 3/100] START alpha=3.0................................................
[CV 7/10; 3/100] END ................alpha=3.0;, score=-1.423 total time=   0.0s
[CV 10/10; 3/100] START alpha=3.0...............................................
[CV 10/10; 3/100] END ...............alpha=3.0;, score=-1.655 total time=   0.0s
[CV 4/10; 4/100] START alpha=4.0................................................
[CV 4/10; 4/100] END ................alpha=4.0;, score=-1.649 total time=   0.0s
[CV 7/10; 4/100] START alpha=4.0................................................
[CV 7/10; 4/100] END ................alpha=4

[CV 4/10; 1/100] START alpha=1.0................................................
[CV 4/10; 1/100] END ................alpha=1.0;, score=-1.645 total time=   0.0s
[CV 1/10; 2/100] START alpha=2.0................................................
[CV 1/10; 2/100] END ................alpha=2.0;, score=-1.726 total time=   0.0s
[CV 4/10; 2/100] START alpha=2.0................................................
[CV 4/10; 2/100] END ................alpha=2.0;, score=-1.647 total time=   0.0s
[CV 4/10; 3/100] START alpha=3.0................................................
[CV 4/10; 3/100] END ................alpha=3.0;, score=-1.648 total time=   0.0s
[CV 9/10; 3/100] START alpha=3.0................................................
[CV 9/10; 3/100] END ................alpha=3.0;, score=-1.521 total time=   0.0s
[CV 5/10; 4/100] START alpha=4.0................................................
[CV 5/10; 4/100] END ................alpha=4.0;, score=-1.427 total time=   0.0s
[CV 9/10; 6/100] START alpha

[CV 6/10; 69/100] START alpha=69.0..............................................
[CV 6/10; 69/100] END ..............alpha=69.0;, score=-1.611 total time=   0.0s
[CV 7/10; 69/100] START alpha=69.0..............................................
[CV 7/10; 69/100] END ..............alpha=69.0;, score=-1.419 total time=   0.0s
[CV 8/10; 69/100] START alpha=69.0..............................................
[CV 8/10; 69/100] END ..............alpha=69.0;, score=-1.564 total time=   0.0s
[CV 7/10; 79/100] START alpha=79.0..............................................
[CV 7/10; 79/100] END ..............alpha=79.0;, score=-1.419 total time=   0.0s
[CV 8/10; 79/100] START alpha=79.0..............................................
[CV 8/10; 79/100] END ..............alpha=79.0;, score=-1.564 total time=   0.0s
[CV 5/10; 80/100] START alpha=80.0..............................................
[CV 5/10; 80/100] END ..............alpha=80.0;, score=-1.414 total time=   0.0s
[CV 6/10; 80/100] START alph

In [33]:
# top 5 settings of the coarse lasso search
mt.report(gs_lasso.cv_results_,5)

Model with rank: 1
Mean validation score: -1.938738 (std: 0.135372)
Parameters: {'alpha': 1.0}

Model with rank: 2
Mean validation score: -1.939281 (std: 0.135008)
Parameters: {'alpha': 2.0}

Model with rank: 3
Mean validation score: -1.940121 (std: 0.134499)
Parameters: {'alpha': 3.0}

Model with rank: 4
Mean validation score: -1.941127 (std: 0.133858)
Parameters: {'alpha': 4.0}

Model with rank: 5
Mean validation score: -1.942463 (std: 0.133218)
Parameters: {'alpha': 5.0}



you can see here that the best value comes at the left edge, we can probably improve our results by expanding our experiment values on that side

In [34]:
# second lasso search on a finer grid between 0 and 2, since the coarse search
# ranked its smallest alpha (1.0) best
lr_lasso=Lasso()
# NOTE(review): this grid includes alpha=0 (no penalty). sklearn's Lasso documentation
# advises against alpha=0; the coordinate-descent warnings in this cell's output
# presumably come from that candidate -- confirm.
alphas_lasso=np.linspace(0,2,100)
params_lasso={'alpha':alphas_lasso}
gs_lasso=GridSearchCV(lr_lasso,
               param_grid=params_lasso,
               cv=10,
               scoring='neg_mean_absolute_error',
               verbose=20,
               n_jobs=-1)
gs_lasso.fit(x_train,y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 4/10; 81/100] START alpha=81.0..............................................
[CV 4/10; 81/100] END ..............alpha=81.0;, score=-1.647 total time=   0.0s
[CV 1/10; 82/100] START alpha=82.0..............................................
[CV 1/10; 82/100] END ..............alpha=82.0;, score=-1.747 total time=   0.0s
[CV 2/10; 82/100] START alpha=82.0..............................................
[CV 2/10; 82/100] END ..............alpha=82.0;, score=-1.709 total time=   0.0s
[CV 1/10; 83/100] START alpha=83.0..............................................
[CV 1/10; 83/100] END ..............alpha=83.0;, score=-1.747 total time=   0.0s
[CV 2/10; 83/100] START alpha=83.0..............................................
[CV 2/10; 83/100] END ..............alpha=83.0;, score=-1.708 total time=   0.0s
[CV 1/10; 84/100] START alpha=84.0..............................................
[CV 1/10; 84/100] END ..............alpha=84.0;, score=-1.747 total time=   0.0s
[CV 2/10; 84/100] START alph

  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 4/10; 86/100] START alpha=86.0..............................................
[CV 4/10; 86/100] END ..............alpha=86.0;, score=-1.648 total time=   0.0s
[CV 3/10; 88/100] START alpha=88.0..............................................
[CV 3/10; 88/100] END ..............alpha=88.0;, score=-1.763 total time=   0.0s
[CV 4/10; 88/100] START alpha=88.0..............................................
[CV 4/10; 88/100] END ..............alpha=88.0;, score=-1.648 total time=   0.0s
[CV 7/10; 89/100] START alpha=89.0..............................................
[CV 7/10; 89/100] END ..............alpha=89.0;, score=-1.419 total time=   0.0s
[CV 8/10; 89/100] START alpha=89.0..............................................
[CV 8/10; 89/100] END ..............alpha=89.0;, score=-1.564 total time=   0.0s
[CV 1/10; 91/100] START alpha=91.0..............................................
[CV 1/10; 91/100] END ..............alpha=91.0;, score=-1.750 total time=   0.0s
[CV 2/10; 91/100] START alph

[CV 10/10; 50/100] START alpha=50.0.............................................
[CV 10/10; 50/100] END .............alpha=50.0;, score=-2.239 total time=   0.0s
[CV 1/10; 51/100] START alpha=51.0..............................................
[CV 1/10; 51/100] END ..............alpha=51.0;, score=-2.531 total time=   0.0s
[CV 2/10; 51/100] START alpha=51.0..............................................
[CV 2/10; 51/100] END ..............alpha=51.0;, score=-2.323 total time=   0.0s
[CV 3/10; 51/100] START alpha=51.0..............................................
[CV 3/10; 51/100] END ..............alpha=51.0;, score=-2.404 total time=   0.0s
[CV 4/10; 51/100] START alpha=51.0..............................................
[CV 4/10; 51/100] END ..............alpha=51.0;, score=-2.243 total time=   0.0s
[CV 5/10; 51/100] START alpha=51.0..............................................
[CV 5/10; 51/100] END ..............alpha=51.0;, score=-2.174 total time=   0.0s
[CV 6/10; 51/100] START alph

In [35]:
# top 5 settings of the refined lasso search
mt.report(gs_lasso.cv_results_,5)

Model with rank: 1
Mean validation score: -1.600141 (std: 0.121669)
Parameters: {'alpha': 0.020202020202020204}

Model with rank: 2
Mean validation score: -1.609023 (std: 0.127732)
Parameters: {'alpha': 0.04040404040404041}

Model with rank: 3
Mean validation score: -1.613882 (std: 0.120412)
Parameters: {'alpha': 0.0}

Model with rank: 4
Mean validation score: -1.615920 (std: 0.128869)
Parameters: {'alpha': 0.06060606060606061}

Model with rank: 5
Mean validation score: -1.620400 (std: 0.129631)
Parameters: {'alpha': 0.08080808080808081}



Now we have a best value that lies inside the range; we will consider this as our final model for lasso.

In [36]:
# alpha copied from the rank-1 setting of the refined lasso search reported above (hard-coded here)
lasso_final=Lasso(**{'alpha': 0.020202020202020204})

In [37]:
# refit on the full training data with the chosen alpha
lasso_final.fit(x_train,y_train)

In [38]:
# add the lasso weights (intercept first) as another column of the comparison table
ridge_wt_comparison['lasso_wts']=[lasso_final.intercept_]+list(lasso_final.coef_)

In [39]:
# display the table, now with the lasso weights next to the simple and ridge weights
ridge_wt_comparison

Unnamed: 0,features,simple_model,ridge_wts,suppression_ratio_l2,lasso_wts
0,bias,74.233741,73.661457,1.007769,72.022329
1,Loan.Length_36 months,1.292842,-1.406586,-0.919135,-0.0
2,Loan.Length_60 months,4.459656,1.519287,2.93536,3.038974
3,Loan.Purpose_debt_consolidation,-1.496679,-0.328716,4.553106,-0.179589
4,Loan.Purpose_credit_card,-1.613132,-0.413548,3.900716,-0.229256
5,Loan.Purpose_other,-0.680605,0.376963,-1.805494,0.326845
6,Loan.Purpose_home_improvement,-1.416846,-0.187597,7.552624,-0.0
7,Loan.Purpose_major_purchase,-1.142537,0.004419,-258.564333,0.0
8,Loan.Purpose_small_business,-1.0028,0.102684,-9.765916,0.0
9,Loan.Purpose___other__,-0.732443,0.218014,-3.359616,0.0


you can see that $l_1$ penalty has made many weights exactly zero, to count how many weights have been made exactly zero

In [40]:
# number of feature weights that lasso set exactly to zero (the intercept is not in coef_)
(lasso_final.coef_==0).sum()

34

We can actually remove those features and build the model without them.

# Data Prep for Logistic Regression

In [None]:
# load the training data for the classification problem from the working directory
bd_train=pd.read_csv(r'./bd_train.csv')

In [None]:
def children_to_num(col):
    """
    Convert the text column for number of children to numbers.

    'Zero' is mapped to 0 and '4+' to 4; everything is then parsed with
    pd.to_numeric, so any other non-numeric label becomes NaN.
    """
    as_digits=col.str.replace('Zero','0').str.replace('4+','4',regex=False)
    return pd.to_numeric(as_digits,errors='coerce')

def ab_to_num(col):
    """
    Convert an age-band column ('low-high' strings) to the midpoint of each band.

    The open-ended band '71+' is treated as '71-71', so its midpoint is 71.
    Sides that cannot be parsed as numbers give NaN for that row.
    """
    # give the open-ended band an explicit upper bound so it splits like the others
    banded=col.str.replace('71+','71-71',regex=False)
    parts=banded.str.split('-',expand=True)

    low=pd.to_numeric(parts[0],errors='coerce')
    high=pd.to_numeric(parts[1],errors='coerce')

    return 0.5*(low+high)

def fi_to_num(col):
    """
    Convert the family-income band labels to a representative number per band.

    Each known label is replaced by the number listed in the mapping below; the
    result is parsed with pd.to_numeric, so labels that are not in the mapping
    (and are not numeric) become NaN.
    """
    band_values={'<10,000, >= 8,000':9000, '>=35,000':35000, '<25,000, >=22,500':23750,
       '<20,000, >=17,500':18750, '<12,500, >=10,000':11250, '<30,000, >=27,500':28750,
       '<27,500, >=25,000':26250, '<17,500, >=15,000':16250, '<15,000, >=12,500':13750,
       '<22,500, >=20,000':21250,'< 4,000': 4000, '< 8,000, >= 4,000':6000}

    replaced=col.replace(band_values)
    return pd.to_numeric(replaced,errors='coerce')

# Column groups handed to ml_utils.DataPipe for the classification data. How DataPipe
# treats each group is implemented in ml_utils and not visible here; the descriptions
# follow the argument names only -- TODO confirm against ml_utils.
# presumably columns that are numeric already
simple_numeric_cols=['year_last_moved','Average.Credit.Card.Transaction', 'Balance.Transfer',
      'Term.Deposit', 'Life.Insurance', 'Medical.Insurance',
      'Average.A.C.Balance', 'Personal.Loan', 'Investment.in.Mutual.Fund',
      'Investment.Tax.Saving.Bond', 'Home.Loan', 'Online.Purchase.Amount','Investment.in.Commudity',
      'Investment.in.Equity', 'Investment.in.Derivative',
      'Portfolio.Balance']

# presumably categorical columns to be expanded into dummy variables
cat_to_dummies_cols=['status' , 'occupation' , 'occupation_partner' , 'home_status', 'self_employed',
'self_employed_partner','TVarea','gender','region']

# column name -> parsing function defined above
custom_function_cols={'children':children_to_num,'age_band':ab_to_num,'family_income':fi_to_num}

# NOTE: from here on dp, x_train and y_train are re-bound to the classification data,
# replacing the loan-data objects used in the regression sections
dp=mt.DataPipe(simple_num=simple_numeric_cols,
                     cat_to_dummies=cat_to_dummies_cols,
                     custom_func_dict=custom_function_cols)

dp.fit(bd_train)

x_train=dp.transform(bd_train)

# binary target: 1 where Revenue.Grid equals 1, otherwise 0
y_train=(bd_train['Revenue.Grid']==1).astype(int)

## Logistic regression with gradient descent

you can use following functions to implement gradient descent version of parameter estimation for logistic regression, few things to keep in mind 

* standardize your data before using gradient descent with any optimizer you have in mind 
* you can `de-standardize` your estimates of weights thus obtained using the function `convert_to_non_standardized_weights` that we wrote earlier

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def mypred(x, w):
    """
    Predicted probabilities of the logistic model: sigmoid of the linear score x @ w.
    """
    return sigmoid(x @ w)

def myerror(y, x, w):
    """
    Difference between the actual values `y` and the predicted probabilities `mypred(x, w)`.
    """
    return y - mypred(x, w)

def mycost(y, x, w, alpha=0.1, penalty='none'):
    """
    Cross-entropy cost for logistic regression, optionally with an L1 or L2 penalty.

    The data term is the mean negative log-likelihood; 1e-15 is added inside each
    logarithm so that a predicted probability of exactly 0 or 1 does not produce
    log(0). With m rows, the penalty term is alpha * sum(|w|) / m for 'l1' and
    alpha * sum(w**2) / (2 * m) for 'l2'.

    Parameters:
    - y: actual values
    - x: input data
    - w: weights
    - alpha: regularization strength
    - penalty: 'none' for no regularization, 'l1' for Lasso, or 'l2' for Ridge

    Returns:
    - cost: the regularized cost function value

    Raises:
    - ValueError: if `penalty` is not one of 'none', 'l1', 'l2'
    """
    n_rows = x.shape[0]
    probs = mypred(x, w)

    # the two halves of the cross-entropy, for the positive and the negative class
    pos_term = y * np.log(probs + 1e-15)
    neg_term = (1 - y) * np.log(1 - probs + 1e-15)
    cross_entropy = -np.mean(pos_term + neg_term)

    if penalty == 'none':
        return cross_entropy
    if penalty == 'l1':
        return cross_entropy + alpha * np.sum(np.abs(w)) / n_rows
    if penalty == 'l2':
        return cross_entropy + alpha * np.sum(w ** 2) / (2 * n_rows)

    raise ValueError("Invalid penalty type. Use 'none', 'l1', or 'l2'.")

def gradient(y, x, w, alpha=0.1, penalty='none'):
    """
    Computes the gradient for logistic regression with either no penalty, L1 or L2 regularization.

    The data part is -X'(y - y_hat) / m with y_hat = mypred(x, w) and m the number
    of rows. The penalty part is the derivative of the penalty term used by the
    logistic `mycost`:
    - 'l1': alpha * sum(|w|) / m      -> alpha * sign(w) / m (a subgradient)
    - 'l2': alpha * sum(w**2) / (2*m) -> alpha * w / m

    Parameters:
    - y: actual values
    - x: input data
    - w: weights
    - alpha: regularization strength
    - penalty: 'none' for no regularization, 'l1' for Lasso, or 'l2' for Ridge

    Returns:
    - grad: the gradient of the cost function with no regularization, L1, or L2 regularization

    Raises:
    - ValueError: if `penalty` is not one of 'none', 'l1', 'l2'
    """
    m = x.shape[0]
    y_hat = mypred(x, w)

    # Gradient of the logistic loss
    grad = -x.T @ (y - y_hat) / m

    # Apply regularization if specified
    if penalty == 'none':
        total_grad = grad
    elif penalty == 'l1':
        l1_grad = np.sign(w)  # Subgradient for L1
        total_grad = grad + alpha * l1_grad / m
    elif penalty == 'l2':
        # the l2 cost term carries a factor 1/2, so its derivative is alpha * w / m
        # (the previous 2 * w made the gradient twice as large as the cost implies)
        l2_grad = w
        total_grad = grad + alpha * l2_grad / m
    else:
        raise ValueError("Invalid penalty type. Use 'none', 'l1', or 'l2'.")

    return total_grad


## Logistic Regression with sklearn

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# grid for logistic regression: 2 class-weight options x 2 penalties x 10 values of C
params_logr={'class_weight':['balanced',None],
       'penalty':['l1','l2'] ,# this is not 11 eleven, its L1 [el-one] in lower case 
        'C':[.0001,.0005,.001,.005,.01,.05,0.1,2,5,10]}

In [None]:
# base estimator for the search; the liblinear solver is used for both penalties in the grid
logr=LogisticRegression(solver='liblinear')

In [None]:
# 10-fold cross-validated grid search, scored by area under the ROC curve
gs_logr=GridSearchCV(logr,
               param_grid=params_logr,
               scoring='roc_auc', # scoring here is roc_auc for its a binary classification problem
               cv=10,
               n_jobs=-1,
               verbose=20)

In [None]:
# run the search on the classification training data
gs_logr.fit(x_train,y_train)

In [None]:
# top 5 parameter combinations by mean cross-validated roc_auc
mt.report(gs_logr.cv_results_,5)

In [None]:
# final model with hard-coded parameters
# NOTE(review): presumably the best combination from the report above, whose output is not saved here -- confirm
logr_final=LogisticRegression(solver='liblinear',**{'C': 0.05, 'class_weight': 'balanced', 'penalty': 'l1'})

In [None]:
# fit the final model on the full training data
logr_final.fit(x_train,y_train)

## predict probabilities

In [None]:
# by default fitted model predicts probabilities for all the classes, and use the function predict_proba

In [None]:
# one row per observation, one probability column per class
logr_final.predict_proba(x_train)

In [None]:
# To see which probability column belongs to which class, look at this attribute:
# the columns of predict_proba follow the order of classes_
logr_final.classes_

In [None]:
# first probability belongs to class 0 and second to class 1 for each obs

In [None]:
# keep only the second column, i.e. the predicted probability of class 1
class_1_probs=logr_final.predict_proba(x_train)[:,1]

## Predicting Hard Classes

In [None]:
# to go to hard classes from these probabilities we need to find a proper threshold on the probabilities 
# lets find cutoff on the basis of KS , you can replicate the same with F_1 score also

In [None]:
# actual 0/1 labels and the predicted class-1 probabilities that the cutoff search compares
real=y_train
score=logr_final.predict_proba(x_train)[:,1]

In [None]:
# candidate thresholds 0.001, 0.002, ..., 0.999
cutoffs=np.linspace(0.001,0.999,999)

In [None]:
# For each cutoff we calculate TP, TN, FP and FN and the corresponding KS value,
# defined here as true positive rate minus false positive rate.
# The ideal cutoff is the one for which KS is maximum;
# if there are multiple winners, we simply go with the first one.
# all_ks collects one KS value per cutoff, in the same order as `cutoffs`
all_ks=[]

for cutoff in cutoffs:
    
    # note that for each cutoff hard class predictions can be different
    
    predicted=(score>cutoff).astype(int) # this converts the True/False to 1/0
    
    # confusion-matrix counts at this cutoff
    TP=((real==1)&(predicted==1)).sum()
    FP=((real==0)&(predicted==1)).sum()
    TN=((real==0)&(predicted==0)).sum()
    FN=((real==1)&(predicted==0)).sum()
    
    # total actual positives and negatives (the same for every cutoff)
    P=TP+FN
    N=TN+FP
    
    # true positive rate minus false positive rate
    ks=(TP/P)-(FP/N)
    
    all_ks.append(ks)

In [None]:
# best KS value over all cutoffs
max(all_ks)

In [None]:
# cutoff with the highest KS; np.argmax returns the position of the first maximum.
# (all_ks is a Python list, so `all_ks==max(all_ks)` is a single False rather than an
# element-wise mask, and indexing cutoffs with it yields an empty result.)
selected_cutoff=cutoffs[np.argmax(all_ks)]

In [None]:
# hard 0/1 predictions: 1 where the class-1 probability is strictly above the selected cutoff
hard_class_preds=(logr_final.predict_proba(x_train)[:,1]>selected_cutoff).astype(int)

In [None]:
# display the predicted classes
hard_class_preds