In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, SelectPercentile , SelectFromModel
from sklearn.metrics import accuracy_score

In [6]:
from sklearn.linear_model import LinearRegression, LogisticRegression

In [7]:
titanic = sns.load_dataset('titanic')

In [8]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [10]:
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [11]:
titanic.drop(['age', 'deck'], axis=1, inplace=True)

In [12]:
titanic.dropna(inplace=True)

In [13]:
titanic.isnull().sum()

survived       0
pclass         0
sex            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [15]:
titanic.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,0,0,8.05,S,Third,man,True,Southampton,no,True


In [16]:
data = titanic[['pclass', 'sex', 'sibsp', 'parch', 'embarked', 'who', 'alone']].copy()

In [17]:
data.head()

Unnamed: 0,pclass,sex,sibsp,parch,embarked,who,alone
0,3,male,1,0,S,man,False
1,1,female,1,0,C,woman,False
2,3,female,0,0,S,woman,True
3,1,female,1,0,S,woman,False
4,3,male,0,0,S,man,True


In [18]:
data.isnull().sum()

pclass      0
sex         0
sibsp       0
parch       0
embarked    0
who         0
alone       0
dtype: int64

In [19]:
sex = {'male':0, 'female':1}
data['sex'] = data['sex'].map(sex)

In [20]:
data.head()

Unnamed: 0,pclass,sex,sibsp,parch,embarked,who,alone
0,3,0,1,0,S,man,False
1,1,1,1,0,C,woman,False
2,3,1,0,0,S,woman,True
3,1,1,1,0,S,woman,False
4,3,0,0,0,S,man,True


In [21]:
ports = {'S': 0, 'C': 1, 'Q': 2}
data['embarked'] = data['embarked'].map(ports)

In [22]:
who = {'man': 0, 'woman': 1, 'child': 2}
data['who'] = data['who'].map(who)

In [23]:
alone = {True: 1, False: 0}
data['alone'] = data['alone'].map(alone)

In [24]:
data.head()

Unnamed: 0,pclass,sex,sibsp,parch,embarked,who,alone
0,3,0,1,0,0,0,0
1,1,1,1,0,1,1,0
2,3,1,0,0,0,1,1
3,1,1,1,0,0,1,0
4,3,0,0,0,0,0,1


In [25]:
X= data.copy()
Y = titanic['survived']

In [26]:
X.shape, Y.shape

((889, 7), (889,))

In [27]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

**Estimation of coefficient of Linear Regression**

In [28]:
sel = SelectFromModel(LinearRegression())

In [29]:
sel.fit(x_train,y_train)

SelectFromModel(estimator=LinearRegression(copy_X=True, fit_intercept=True,
                                           n_jobs=None, normalize=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [30]:
sel.get_support()

array([ True,  True, False, False, False,  True, False])

In [34]:
np.abs(sel.estimator_.coef_)

array([0.1465448 , 0.32116345, 0.08154868, 0.05642944, 0.06482235,
       0.24199831, 0.09299509])

In [33]:
np.mean(np.abs(sel.estimator_.coef_)) 

0.14364315926685434

In [35]:
#selector selects variables with coefficients higher than the mean

In [36]:
features = x_train.columns[sel.get_support()]

In [37]:
features

Index(['pclass', 'sex', 'who'], dtype='object')

In [38]:
x_train_reg = sel.transform(x_train)
x_test_reg = sel.transform(x_test)

In [46]:
x_train_reg.shape, x_test_reg.shape

((711, 3), (178, 3))

In [44]:
def run_random_forest(x_train, x_test, y_train, y_test):
  clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
  clf.fit(x_train, y_train)
  y_pred = clf.predict(x_test)
  print('Accuracy score on test set: ', accuracy_score(y_test, y_pred))

In [45]:
%%time
run_random_forest(x_train_reg, x_test_reg, y_train, y_test)

Accuracy score on test set:  0.7415730337078652
CPU times: user 222 ms, sys: 36.5 ms, total: 259 ms
Wall time: 353 ms


In [47]:
%%time
run_random_forest(x_train, x_test, y_train, y_test)

Accuracy score on test set:  0.7359550561797753
CPU times: user 249 ms, sys: 23.4 ms, total: 272 ms
Wall time: 347 ms


**Logistic Regression coefficient with L1 regularizer**

In [57]:
sel = SelectFromModel(LogisticRegression(penalty='l1', C=0.05, solver= 'liblinear'))

In [58]:
sel.fit(x_train, y_train)

SelectFromModel(estimator=LogisticRegression(C=0.05, class_weight=None,
                                             dual=False, fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='auto',
                                             n_jobs=None, penalty='l1',
                                             random_state=None,
                                             solver='liblinear', tol=0.0001,
                                             verbose=0, warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [63]:
sel.get_support()

array([ True,  True,  True, False,  True,  True, False])

In [61]:
np.abs(sel.estimator_.coef_) # non important feature coefficients are made 0 in L1 regularization.

array([[0.63176909, 1.11131242, 0.20117935, 0.        , 0.07340571,
        1.0185661 , 0.        ]])

In [64]:
x_train_l1 = sel.transform(x_train)
x_test_l1 = sel.transform(x_test)

In [65]:
%%time
run_random_forest(x_train_l1, x_test_l1, y_train, y_test)

Accuracy score on test set:  0.7415730337078652
CPU times: user 224 ms, sys: 34.6 ms, total: 259 ms
Wall time: 351 ms


**L2 Regularization**

In [66]:
sel = SelectFromModel(LogisticRegression(penalty='l2', C=0.05, solver= 'liblinear'))

In [67]:
sel.fit(x_train, y_train)

SelectFromModel(estimator=LogisticRegression(C=0.05, class_weight=None,
                                             dual=False, fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='auto',
                                             n_jobs=None, penalty='l2',
                                             random_state=None,
                                             solver='liblinear', tol=0.0001,
                                             verbose=0, warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [68]:
sel.get_support()

array([ True,  True, False, False, False,  True, False])

In [70]:
np.abs(sel.estimator_.coef_)

array([[0.6556907 , 1.03700649, 0.3412496 , 0.1066889 , 0.33206109,
        1.10766575, 0.23595513]])

In [71]:
x_train_l2 = sel.transform(x_train)
x_test_l2 = sel.transform(x_test)

In [72]:
%%time
run_random_forest(x_train_l2, x_test_l2, y_train, y_test)

Accuracy score on test set:  0.7415730337078652
CPU times: user 219 ms, sys: 29.4 ms, total: 249 ms
Wall time: 345 ms
