### load stuff

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [2]:
# read data 

train_df = pd.get_dummies(pd.read_csv('train.csv').drop(['Name', 'Ticket', 'Cabin'], axis=1))
test_df = pd.get_dummies(pd.read_csv('test.csv').drop(['Name', 'Ticket', 'Cabin'], axis=1))

### Preprocessing / Cleaning

In [7]:
# impute missing data

val = train_df['Age'].mean()
train_df['Age'].fillna(val, inplace=True)
train_df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(val, inplace=True)


PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [8]:
# create training and target vectors

live = train_df['Survived']
train_df.drop('Survived', axis=1, inplace=True)

In [15]:
import scipy.stats as stats
train_df[(np.abs(stats.zscore(train_df)) < 3).all(axis=1)]

TypeError: loop of ufunc does not support argument 0 of type float which has no callable sqrt method

### LinearSVC

In [9]:
# train LinearSVC

clf = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
clf.fit(train_df, live)



In [10]:
# create predictions

val = test_df['Age'].mean()
test_df['Age'].fillna(val, inplace=True)

val = test_df['Fare'].mean()
test_df['Fare'].fillna(val, inplace=True)

predictions = clf.predict(test_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Age'].fillna(val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Fare'].fillna(val, inplace=True)


### Format for submission

In [11]:
predictions_df = pd.DataFrame({'PassengerID': test_df[['PassengerId']].values[:, 0],
                              'Survived': predictions})
predictions_df.shape
# should be (418, 2)

(418, 2)

In [12]:
# write to csv
predictions_df.to_csv("submission.csv", index=False)