In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_file_path = os.path.join(dirname, filename)
        print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Load the dataset container and show which fields it exposes
# (the feature matrix is under "data", the labels under "target").
cancer = load_breast_cancer()
cancer_fields = cancer.keys()
print(cancer_fields)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [3]:
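# Linear regression finds the parameters w (the slope/weights, i.e. how far the prediction goes up for each unit of a feature) and b (the intercept, i.e. the y-axis offset where the line crosses x = 0) that minimise the mean squared error between the predictions and the true targets on the training set.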
# Split the data into training and test portions; random_state=0 pins the
# shuffle so the split (and therefore the scores below) is repeatable.
X_train, x_test, Y_train, y_test = train_test_split(
    cancer["data"],
    cancer["target"],
    random_state=0,
)

# Fit on the training portion only, then score the model on both portions.
lr = LinearRegression()
lr.fit(X_train, Y_train)
lr_train_score = lr.score(X_train, Y_train)
lr_test_score = lr.score(x_test, y_test)
print("Training set score: {:.2f}".format(lr_train_score))
print("Test set score: {:.2f}".format(lr_test_score))

Training set score: 0.78
Test set score: 0.73


In [4]:
# Ridge regression is used to constrain the linear model and to try to avoid overfitting. It does this by penalising large coefficients, so that the magnitudes of all coefficients are kept close to 0.
from sklearn.linear_model import Ridge
# Fit Ridge with its default settings on the training portion and report
# the score on both the training and the test portion.
ridge = Ridge()
ridge.fit(X_train, Y_train)
ridge_train_score = ridge.score(X_train, Y_train)
ridge_test_score = ridge.score(x_test, y_test)
print("Training set score: {:.2f}".format(ridge_train_score))
print("Test set score: {:.2f}".format(ridge_test_score))

Training set score: 0.75
Test set score: 0.72


In [5]:
# How much importance the model puts on simplicity versus training-set performance is controlled by alpha. Increasing alpha forces the coefficients to move further towards 0.
# Same Ridge fit as before but with alpha=10, reported on both portions
# so the scores can be compared with the default-alpha model.
ridge2 = Ridge(alpha=10)
ridge2.fit(X_train, Y_train)
ridge2_train_score = ridge2.score(X_train, Y_train)
ridge2_test_score = ridge2.score(x_test, y_test)
print("Training set score: {:.2f}".format(ridge2_train_score))
print("Test set score: {:.2f}".format(ridge2_test_score))

Training set score: 0.73
Test set score: 0.71


In [6]:
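# Lasso is an alternative to ridge. It also restricts the coefficients to be close to 0, but with lasso some coefficients become exactly 0, so those features are ignored entirely. This is a form of automatic feature selection.
# alpha controls how strongly the coefficients are pushed towards 0; lowering it reduces underfitting and improves the scores here. When alpha is lowered, max_iter (the maximum number of solver iterations) should be increased so that the fit can converge.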
from sklearn.linear_model import Lasso
# Fit Lasso twice -- first with its default settings, then with alpha=0.01 and
# max_iter=100000 -- printing the same three-line report after each fit.
# After the loop, `lasso` is bound to the second (alpha=0.01) model.
for lasso_settings in ({}, {"alpha": 0.01, "max_iter": 100000}):
    lasso = Lasso(**lasso_settings).fit(X_train, Y_train)
    print("Training set score: {:.2f}".format(lasso.score(X_train, Y_train)))
    print("Test set score: {:.2f}".format(lasso.score(x_test, y_test)))
    # Count the coefficients that were not driven to exactly zero.
    print("Number of features used", np.sum(lasso.coef_ != 0))
#Having a lower alpha allows us to fit a more complex model and increases the number of features used.

Training set score: 0.55
Test set score: 0.61
Number of features used 2
Training set score: 0.69
Test set score: 0.68
Number of features used 8
