In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [9]:
# Load the breast-cancer dataset that ships with scikit-learn
cancer = load_breast_cancer()

# Show which fields the dataset object exposes
print(cancer.keys())

# Wrap the feature matrix in a DataFrame and preview its first rows
cancer_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
print(cancer_df.head(3))

# Feature matrix and target vector used for modelling
X, Y = cancer.data, cancer.target
print(Y[1:100])

# Stratified 70/30 train/test split with a fixed random seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=31,
    stratify=Y,
)

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38           122.8     1001.0          0.11840   
1        20.57         17.77           132.9     1326.0          0.08474   
2        19.69         21.25           130.0     1203.0          0.10960   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   

   mean fractal dimension           ...             worst radius  \
0                 0.07871           ...                    25.38   
1                 0.05667           ...                    24.99   
2                 0.05999           ...                    23.57   

   worst texture  worst perimeter  worst area  wo

In [10]:
def do_lasso(lambdaaa):
    """Fit a Lasso model for one lambda and print train/test scores.

    The model is fitted on the notebook-level ``X_train``/``y_train`` and
    scored on both ``(X_train, y_train)`` and ``(X_test, y_test)``.

    Parameters
    ----------
    lambdaaa : float
        Regularization strength; passed to ``Lasso`` as ``alpha``.

    Returns
    -------
    None
        The lambda value, the training score, the test score and the number
        of non-zero coefficients are printed rather than returned.
    """
    # max_iter has to be an integer: scikit-learn validates the parameter
    # type, and the float literal 10e5 (== 1000000.0) is rejected by
    # current releases. 10**6 is the same iteration budget as an int.
    lasso = Lasso(alpha=lambdaaa, max_iter=10**6)
    lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score = lasso.score(X_test, y_test)

    # Number of features whose coefficient was not shrunk to exactly zero
    coeff_used = np.sum(lasso.coef_ != 0)

    print("Lambda=", lambdaaa, ":")
    print("Training score:", train_score)
    print("Test     score:", test_score)
    print("#Features:     ", coeff_used)
    print("\n")

In [11]:
## LASSO Regularization

# Sweep lambda over 15 values: 10^0 down to 10^-7 in steps of 0.5 in the exponent
for i in range(15):
    do_lasso(10 ** (-0.5 * i))

# Unregularized baseline: ordinary linear regression (lambda=0)
lr = LinearRegression()
lr.fit(X_train, y_train)

lr_train_score = lr.score(X_train, y_train)
lr_test_score = lr.score(X_test, y_test)

print("LR training score:", lr_train_score)
print("LR test score: ", lr_test_score)

Lambda= 1.0 :
Training score: 0.5641045943314438
Test     score: 0.4875247154208508
#Features:      2


Lambda= 0.31622776601683794 :
Training score: 0.6466836447671205
Test     score: 0.6248766430637098
#Features:      4


Lambda= 0.1 :
Training score: 0.6638334047415441
Test     score: 0.6619774676548973
#Features:      5


Lambda= 0.03162277660168379 :
Training score: 0.6674404498331696
Test     score: 0.6727321797138059
#Features:      6


Lambda= 0.01 :
Training score: 0.6846160179073152
Test     score: 0.6941920773509355
#Features:      9


Lambda= 0.0031622776601683794 :
Training score: 0.7203693689452647
Test     score: 0.7269029381880414
#Features:      10


Lambda= 0.001 :
Training score: 0.7427775432506563
Test     score: 0.7351175238496964
#Features:      14


Lambda= 0.00031622776601683794 :
Training score: 0.7616450344847355
Test     score: 0.7347585856770997
#Features:      18


Lambda= 0.0001 :
Training score: 0.7708802919935065
Test     score: 0.7386227420124478
#Featu