In [22]:
from sklearn import preprocessing
import pandas as pd
import numpy as np

def prepare_feature(dataframe):
    selected_columns_df = dataframe[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]

    features_engineering_df = selected_columns_df.replace({'Sex': {'male': 1, 'female': 0}})
    
    avg_age = features_engineering_df['Age'].mean(skipna=True)
    features_engineering_df['Age'].fillna(avg_age, inplace=True)
    
    avg_fare = features_engineering_df['Fare'].mean(skipna=True)
    features_engineering_df['Fare'].fillna(avg_fare, inplace=True)

    x = features_engineering_df.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    
    return x_scaled

def prepare_label(dataframe):
    label_df = dataframe[['Survived']]
    y = label_df.values
    
    return y.ravel()
    
def prepare_train_data(file_name):
    df = pd.read_csv(file_name)
    
    return prepare_feature(df), prepare_label(df)

def prepare_test_data(file_name):
    df = pd.read_csv(file_name)
    
    return df['PassengerId'], prepare_feature(df)

In [2]:
from sklearn.linear_model import SGDClassifier

x_train, y_train = prepare_train_data('train.csv')

clf = SGDClassifier(loss='hinge', penalty='l2', max_iter=5)
clf.fit(x_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [23]:
passengerID, x_test = prepare_test_data('test.csv')

In [26]:
pred = clf.predict(x_test)

In [27]:
submission_df = pd.DataFrame(passengerID)
submission_df['Survived'] = pred

In [30]:
submission_df.to_csv('submission.csv', index=False)