In [1]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt 

In [2]:
class_path= "/Users/james/Documents/ADEC7430"
proj_path= os.path.join(class_path, "LectureNotes")
raw_data_folder= os.path.join(class_path, "RawData")
saved_data_folder= os.path.join(class_path, "SavedData")
output_data_folder= os.path.join(class_path, "Output")

In [3]:
raw_train_data_file= os.path.join(raw_data_folder, "TitanicTrain.csv")
raw_train_data= pd.read_csv(raw_train_data_file)

In [4]:
import random
nrows= raw_train_data.shape[0]

In [5]:
random.seed(2019)
myrandoms= [random.uniform(0,1) for i in range(nrows)]

In [39]:
valid_filt= [x < .3 for x in myrandoms]
train_filt= [x>= .3 for x in myrandoms]

In [40]:
import collections 
collections.Counter(valid_filt)
#same as Razvan

Counter({False: 627, True: 264})

In [41]:
validation_data= raw_train_data[valid_filt]
train_data=raw_train_data[train_filt]
print(validation_data.shape)
print(train_data.shape)

(264, 12)
(627, 12)


In [42]:
validation_data_file= os.path.join(saved_data_folder, "validation_data.pkl")
validation_data.to_pickle(validation_data_file)
train_data_file= os.path.join(saved_data_folder, "train_data.pkl")
train_data.to_pickle(train_data_file)
del validation_data

In [43]:
train_data = pd.read_pickle(train_data_file)

In [47]:
def encode_embarked(df_):
    """df = pandas Data Frame with a variable named 'Embark' which we are encoding"""
    df = df_.copy()
    df['Emb_S'] = 0
    df['Emb_C'] = 0
    df['Emb_Q'] = 0
    df['Emb_O'] = 0
    filter_ = df.Embarked=="S"
    df.loc[filter_, 'Emb_S'] = 1
    filter_ = df.Embarked=="C"
    df.loc[filter_, 'Emb_C'] = 1
    filter_ = df.Embarked=="Q"
    df.loc[filter_, 'Emb_Q'] = 1
    filter_ = df.Embarked.isin(['S','C','Q'])
    filter_.value_counts(dropna=False)
    df.loc[~filter_, 'Emb_O'] = 1
    return(df)

In [44]:
def encode_sex(df_):
    df = df_.copy() 
    df['sex_m'] = 0
    df['sex_f'] = 0
    df['sex_o'] = 0
    filter_ = df.Sex=="male"
    df.loc[filter_,'sex_m'] = 1
    filter_ = df.Sex=="female"
    df.loc[filter_,'sex_f'] = 1
    filter_ = df.Sex.isin(['male','female'])
    df.loc[~filter_,'sex_o'] = 1
    return(df)

In [45]:
def fill_in_nan_age(df_, which_var = 'Age', training = False):
    df = df_.copy()
    fill_in_nan_age.use_for_missing = 42 
    if training:
        fill_in_nan_age.use_for_missing = 42
    filter_ = pd.isnull(df[which_var])
    df.loc[filter_,which_var] = fill_in_nan_age.use_for_missing
    return(df)

In [46]:
def trans_drop_vars(df_):
    df = df_.copy()
    df.drop(columns=['Fare','Cabin','Name','Ticket', 'Embarked','Sex',], inplace=True)
    return(df)

In [48]:
def transform_mydata(df_, training = False):
    df = df_.copy()
    train1 = encode_embarked(df)
    train2 = encode_sex(train1)
    train3 = trans_drop_vars(train2)
    train4 = fill_in_nan_age(train3, training = training) #@@ this is an issue for training vs validation - why???
    return(train4)

In [49]:
newdf_train= transform_mydata(train_data, training=True)
newdf_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Emb_S,Emb_C,Emb_Q,Emb_O,sex_m,sex_f,sex_o
0,1,0,3,22.0,1,0,1,0,0,0,1,0,0
1,2,1,1,38.0,1,0,0,1,0,0,0,1,0
2,3,1,3,26.0,0,0,1,0,0,0,0,1,0
4,5,0,3,35.0,0,0,1,0,0,0,1,0,0
5,6,0,3,42.0,0,0,0,0,1,0,1,0,0


In [50]:
print(newdf_train.shape)

(627, 13)


In [51]:
valid_data = pd.read_pickle(validation_data_file)
new_valid= transform_mydata(valid_data, training=False)
print(new_valid.shape)

(264, 13)


In [54]:
import sklearn
from sklearn import neighbors

In [55]:
nbrs= sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)

In [56]:
y_train= newdf_train['Survived']
x_train=newdf_train.copy().drop(columns=["PassengerId", "Survived"])
x_train_array = x_train.values

In [57]:
nbrs.fit(x_train_array, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='uniform')

In [58]:
x_train_prob=nbrs.predict(x_train_array)

In [59]:
x_train_pred = [int(x < .33) for x in x_train_prob]

In [60]:
pd.crosstab(np.array(x_train_pred), np.array(y_train))

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,123,223
1,271,10


In [61]:
def confusionMatrixInfo(p,a, labels = None):
    from sklearn import metrics as skm
    from sklearn.metrics import confusion_matrix as skm_conf_mat
    import sys
    #
#    p = pd.Series([1,1,1,0,0,0,0,0,0,0])
#    a = pd.Series([1,0,0,1,1,1,0,0,0,0])
#    labels = [1,0]
#
#    x = skm.confusion_matrix(a,p,labels=labels)
    if 'sklearn' not in sys.modules:
        import sklearn
    x = skm_conf_mat(a,p, labels = labels)
    tp = x[0,0]
    tn = x[1,1]
    fp = x[1,0]
    fn = x[0,1]
    # tp, fp, fn, tn # test order
    
    tsensitivity = tp/(tp+fn)
    tspecificity = tn/(tn + fp)
    # no information rate?
    tnir = (tp + fn)/x.sum()
    tnir = max(tnir, 1-tnir)
    # accuracy
    taccuracy = (tp + tn)/x.sum()
    
    res = {'confusionMatrix':x,
           'accuracy': taccuracy,
           'no information rate': tnir,
           'sensitivity': tsensitivity,
           'specificity': tspecificity
           }
    return(res)

In [62]:
confusionMatrixInfo(np.array(x_train_pred), np.array(y_train))

{'confusionMatrix': array([[123, 271],
        [223,  10]]),
 'accuracy': 0.21212121212121213,
 'no information rate': 0.6283891547049442,
 'sensitivity': 0.31218274111675126,
 'specificity': 0.04291845493562232}

In [64]:
x_valid=new_valid.copy().drop(columns=["PassengerId", "Survived"])
y_valid=new_valid["Survived"]

In [65]:
x_valid2 = x_valid.values

In [66]:
nbrs.fit(x_valid2, y_valid)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='uniform')

In [67]:
x_valid2_prob=nbrs.predict(x_valid2)

In [68]:
x_valid2_pred = [int(x < .33) for x in x_valid2_prob]

In [69]:
pd.crosstab(np.array(x_valid2_pred), np.array(y_valid))

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,48,105
1,107,4


In [70]:
confusionMatrixInfo(np.array(x_valid2_pred), np.array(y_valid))

{'confusionMatrix': array([[ 48, 107],
        [105,   4]]),
 'accuracy': 0.19696969696969696,
 'no information rate': 0.5871212121212122,
 'sensitivity': 0.3096774193548387,
 'specificity': 0.03669724770642202}