Logistic Regression

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['figure.figsize'] = (20.0, 10.0)

# Load the data into a `pandas` DataFrame object
tr_path = './train5.csv'
titanic_df = pd.read_csv(tr_path)

# Examine head of df
titanic_df.head(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [2]:
### GRADED
### Define `prepare_data` using the instructions given above
### YOUR ANSWER BELOW

def prepare_data(input_x, target_y):
   
    if input_x.shape[0] < input_x.shape[1]:
            input_x = np.transpose(input_x)

    ones=np.ones((input_x.shape[0], 1),dtype=int)
    prepared_x=np.concatenate((ones,input_x),axis=1)
    initial_w = np.zeros(prepared_x.shape[1])
    target_y[target_y == 0] = -1
   
    return prepared_x, target_y, initial_w


Sigmoid $$\sigma_i(y_i \cdot w) = \frac{e^{y_iX_i^Tw}}{1+e^{y_ix_i^Tw}}$$  

In [3]:
def sigmoid_single(x, y, w):
    exponent=y*np.matmul(x.T,w)
    if exponent > 709.782:
        return 1
    else:
        exp=np.exp(exponent)
    return exp/(1+exp)

 $(1-\sigma_i(y_i\cdot w))y_ix_i$

In [4]:
def to_sum(x,y,w):
    return (1-sigmoid_single(x, y, w))*y*x

gradient of log-likelihood
$\sum_{i = 1}^n (1 − \sigma_i(y_i \cdot w))\ y_i x_i$


In [5]:
def sum_all(x_input, y_target, w):
    grad=np.zeros(len(w))
    for x,y in zip(x_input, y_target):
        grad += to_sum(x,y,w)
    return grad


In [6]:
#Toy example
x = np.array([[1,22,7.25],[1,38,71.2833]])
y = np.array([-1,1])
w = np.array([.1,-.2, .5])
print(sum_all(x,y,w))



[-0.33737816 -7.42231958 -2.44599168]


$$w_t + \eta \sum_{i = 1}^n (1 − \sigma_i(y_i \cdot w_i))\ y_i x_i$$

In [7]:
def update_w(x_input, y_target, w, eta):
    return w+(eta*sum_all(x_input, y_target,w))

$$w_{t+1} = w_t + \eta \sum_{i = 1}^n (1 − \sigma_i(y_i \cdot w_i))\ y_i x_i$$

In [8]:
def fixed_iteration(x_input, y_target, eta, steps):
    x_input, y_target, w = prepare_data(x_input,y_target)
    for i in range(steps): 
        w=update_w(x_input, y_target, w, eta)
    return w

In [9]:
x = np.array([[22,7.25],[38,71.2833],[26,7.925],[35,53.1]])
y = np.array([-1,1,1,1])
eta = .1
steps = 100

print(fixed_iteration(x,y, eta, steps))

[-0.9742495  -0.41389924  6.8199374 ]


In [10]:
def predict(x_input, weights):
    x_input=np.insert(x_input,0,1)
    prod=np.matmul(x_input,weights)
    if prod<0:
        return -1
    else:
        return 1
    

In [11]:
Xs = np.array([[22,7.25],[38,71.2833],[26,7.925],[35,53.1]])
weights = np.array([0,1,-1])
for X in Xs:

    print(predict(X,weights))

1
-1
1
-1


Application to real data set

In [12]:
### Drop irrelevant features from the dataframe
titanic_df.drop(['Ticket','Cabin', 'PassengerId', 'Name'], axis=1, inplace=True)
titanic_df = titanic_df.loc[titanic_df['Embarked'].notnull(),:]

### Drop "Survived" for purposes of KNN imputation:
y_target = titanic_df.Survived
titanic_knn = titanic_df.drop(['Survived'], axis = 1)  
titanic_knn.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [13]:
### Adding dummy variables for categorical variables
to_dummy = ['Sex','Embarked']
titanic_knn = pd.get_dummies(titanic_knn, prefix = to_dummy, columns = to_dummy, drop_first = True)

titanic_knn.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,1,0,1
1,1,38.0,1,0,71.2833,0,0,0
2,3,26.0,0,0,7.925,0,0,1
3,1,35.0,1,0,53.1,0,0,1
4,3,35.0,0,0,8.05,1,0,1


Using KNN to impute missing ages

In [14]:
### Splitting data - on whether or not "Age" is specified. provided

# Training data -- "Age" Not null; "Age" as target
train = titanic_knn[titanic_knn.Age.notnull()]
X_train = train.drop(['Age'], axis = 1)
y_train = train.Age


# Data to impute, -- Where Age is null; Remove completely-null "Age" column.
impute = titanic_knn[titanic_knn.Age.isnull()].drop(['Age'], axis = 1)
print("Data to Impute")
print(impute.head(3))

# import algorithm
from sklearn.neighbors import KNeighborsRegressor

# Instantiate
knr = KNeighborsRegressor()

# Fit
knr.fit(X_train, y_train)

# Create Predictions
imputed_ages = knr.predict(impute)

# Add to Df
impute['Age'] = imputed_ages
print("\nImputed Ages")
print(impute.head(3))

# Re-combine dataframes
titanic_imputed = pd.concat([train, impute], sort = False, axis = 0)

# Return to original order - to match back up with "Survived"
titanic_imputed.sort_index(inplace = True)
print("Shape before imputation:", titanic_knn.shape)
print("Shape with imputed values:", titanic_imputed.shape)
titanic_imputed.head(7)

Data to Impute
    Pclass  SibSp  Parch     Fare  Sex_male  Embarked_Q  Embarked_S
5        3      0      0   8.4583         1           1           0
17       2      0      0  13.0000         1           0           1
19       3      0      0   7.2250         0           0           0

Imputed Ages
    Pclass  SibSp  Parch     Fare  Sex_male  Embarked_Q  Embarked_S   Age
5        3      0      0   8.4583         1           1           0  47.2
17       2      0      0  13.0000         1           0           1  25.6
19       3      0      0   7.2250         0           0           0  23.0
Shape before imputation: (889, 8)
Shape with imputed values: (889, 8)


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,1,0,1
1,1,38.0,1,0,71.2833,0,0,0
2,3,26.0,0,0,7.925,0,0,1
3,1,35.0,1,0,53.1,0,0,1
4,3,35.0,0,0,8.05,1,0,1
5,3,47.2,0,0,8.4583,1,1,0
6,1,54.0,0,0,51.8625,1,0,1


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')
lr = LogisticRegression()

lr.fit(titanic_imputed, y_target)

# Create sklearn's predictions
sk_pred = lr.predict(titanic_imputed)

print(lr.intercept_)
print(lr.coef_)

[4.07454295]
[[-0.88163975 -0.03085442 -0.29522779 -0.0771424   0.00431432 -2.40900801
   0.08754859 -0.21682785]]


In [16]:
#provided
#%%time
# This cell may take awhile
wt = fixed_iteration(titanic_imputed.values, y_target.values, .05, 12000)

print(wt)

cust_preds = np.array([predict(x,wt) for x in titanic_imputed.values])
cust_preds[cust_preds == -1] = 0

[  6853.75182849   -831.23291018   -304.24383135  -2919.42190116
  -1291.68658091    218.83793151 -14297.98042097     66.49415552
    428.14909561]


In [17]:
print("sklearn:")
print(classification_report(y_target, sk_pred))

print("Custom:")
print(classification_report(y_target, cust_preds))

sklearn:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       549
           0       0.00      0.00      0.00         0
           1       0.77      0.69      0.73       340

    accuracy                           0.26       889
   macro avg       0.26      0.23      0.24       889
weighted avg       0.30      0.26      0.28       889

Custom:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       549
           0       0.00      0.00      0.00         0
           1       0.82      0.41      0.54       340

    accuracy                           0.16       889
   macro avg       0.27      0.14      0.18       889
weighted avg       0.31      0.16      0.21       889

