# Imports

In [26]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RandomizedLasso
from sklearn.datasets import load_boston
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Prep Data

In [27]:
data = pd.read_csv('datasets/data-titanic.csv')
data = data.drop(['name', 'ticket', 'cabin', 'body', 'boat', 'home.dest'], axis=1)
data = data.dropna()
from sklearn import preprocessing
encoded_data = data.copy()
le = preprocessing.LabelEncoder()
encoded_data.sex = le.fit_transform(encoded_data.sex)
encoded_data.embarked = le.fit_transform(encoded_data.embarked)
features = encoded_data.drop(['survived'], axis=1).values
labels = encoded_data['survived'].values

# Performance with All Features

In [28]:
lin_reg = LinearRegression()

In [29]:
-cross_val_score(lin_reg, features, labels, cv=10, scoring='neg_mean_squared_error')

array([0.16413987, 0.13907219, 0.12241864, 0.12645509, 0.11539907,
       0.16935641, 0.16038081, 0.19085267, 0.18679015, 0.16801512])

In [30]:
np.sqrt(-cross_val_score(lin_reg, features, labels, cv=10, scoring='neg_mean_squared_error'))

array([0.40514179, 0.37292384, 0.34988376, 0.35560525, 0.33970439,
       0.41152936, 0.40047574, 0.43686688, 0.43219226, 0.40989647])

In [31]:
# error
np.sqrt(-cross_val_score(lin_reg, features, labels, cv=10, scoring='neg_mean_squared_error')).mean()

0.3914219735723462

# Isolating Columns

In [32]:
features2 = encoded_data[['sex', 'pclass']].values

In [33]:
# error
np.sqrt(-cross_val_score(lin_reg, features2, labels, cv=10, scoring='neg_mean_squared_error')).mean()

0.39855172858512533

# Recursive Feature Elimination

In [34]:
model = LinearRegression()
rfe = RFE(model, n_features_to_select=1)
rfe.fit(features,labels)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False),
  n_features_to_select=1, step=1, verbose=0)

In [39]:
names = encoded_data.columns.values
names = np.delete(names,1)

In [40]:
sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names))

[(1, 'sex'),
 (2, 'pclass'),
 (3, 'embarked'),
 (4, 'sibsp'),
 (5, 'parch'),
 (6, 'age'),
 (7, 'fare')]

# Randomized Lasso Algo

In [41]:
boston = load_boston()
features = boston["data"]
labels = boston["target"]

In [42]:
model = RandomizedLasso(alpha=0.025)
model.fit(features, labels)



RandomizedLasso(alpha=0.025, eps=2.220446049250313e-16, fit_intercept=True,
        max_iter=500, memory=None, n_jobs=None, n_resampling=200,
        normalize=True, pre_dispatch='3*n_jobs', precompute='auto',
        random_state=None, sample_fraction=0.75, scaling=0.5,
        selection_threshold=0.25, verbose=False)

In [43]:
sorted(zip(map(lambda x: round(x, 4), model.scores_), 
                 boston["feature_names"]), reverse=True)

[(1.0, 'RM'),
 (1.0, 'PTRATIO'),
 (1.0, 'LSTAT'),
 (0.605, 'CHAS'),
 (0.56, 'B'),
 (0.36, 'CRIM'),
 (0.32, 'TAX'),
 (0.22, 'NOX'),
 (0.17, 'DIS'),
 (0.09, 'INDUS'),
 (0.035, 'ZN'),
 (0.015, 'AGE'),
 (0.01, 'RAD')]