In [44]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.svm import SVC

In [45]:
import pandas as pd
columns_to_be_added_as_features = ['Sex','Age','SibSp','Parch','Pclass','Fare','Embarked']
train_df = pd.read_csv('train.csv', usecols=columns_to_be_added_as_features + ['Survived'])
test_df = pd.read_csv('test.csv', usecols=columns_to_be_added_as_features + ['PassengerId'])
test_df = test_df[columns_to_be_added_as_features]

In [46]:
print(train_df.head())

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S


In [47]:
print("Number of rows in training set: {}".format(len(train_df)))
print("Number of rows in test set: {}".format(len(test_df)))

Number of rows in training set: 891
Number of rows in test set: 418


In [48]:
for column_title in columns_to_be_added_as_features:
    if column_title in ['Embarked', "Sex"]:
        continue
    train_df[column_title] = pd.to_numeric(train_df[column_title], downcast="float")
    test_df[column_title] = pd.to_numeric(test_df[column_title], downcast="float")

train_df["Survived"] = pd.to_numeric(train_df["Survived"], downcast="float")

In [52]:
train_df['Embarked'] = train_df['Embarked'].replace({'Q': 0, 'S': 1, 'C': 2})
train_df['Sex'] = train_df['Sex'].replace({'male': 0, 'female': 1})

test_df['Embarked'] = test_df['Embarked'].replace({'Q': 0, 'S': 1, 'C': 2})
test_df['Sex'] = test_df['Sex'].replace({'male': 0, 'female': 1})

print(train_df.head())

   Survived  Pclass  Sex   Age  SibSp  Parch       Fare  Embarked
0       0.0     3.0    0  22.0    1.0    0.0   7.250000       1.0
1       1.0     1.0    1  38.0    1.0    0.0  71.283302       2.0
2       1.0     3.0    1  26.0    0.0    0.0   7.925000       1.0
3       1.0     1.0    1  35.0    1.0    0.0  53.099998       1.0
4       0.0     3.0    0  35.0    0.0    0.0   8.050000       1.0


In [53]:
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

train_df = normalize(train_df)
test_df = normalize(test_df)

print(train_df.head())

   Survived  Pclass  Sex       Age  SibSp  Parch      Fare  Embarked
0       0.0     1.0  0.0  0.271174  0.125    0.0  0.014151       0.5
1       1.0     0.0  1.0  0.472229  0.125    0.0  0.139136       1.0
2       1.0     1.0  1.0  0.321438  0.000    0.0  0.015469       0.5
3       1.0     0.0  1.0  0.434531  0.125    0.0  0.103644       0.5
4       0.0     1.0  0.0  0.434531  0.000    0.0  0.015713       0.5


In [54]:
train_df = train_df.sample(frac=1).reset_index(drop=True)

In [55]:
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [56]:
validation_set_ratio = 0.2
validation_set_size = int(len(train_df)*validation_set_ratio)
training_set_size = len(train_df) - validation_set_size

print("Total set size: {}".format(len(train_df)))
print("Training set size: {}".format(training_set_size))
print("Validation set size: {}".format(validation_set_size))

from sklearn.model_selection import train_test_split

train, val = train_test_split(train_df, test_size=validation_set_ratio)

train_X = train[columns_to_be_added_as_features]
train_Label = train[['Survived']]

val_X = val[columns_to_be_added_as_features]
val_Label = val[['Survived']]

Total set size: 891
Training set size: 713
Validation set size: 178


In [57]:
SVM_KERNEL = "linear"
SVM_C = 10
SVM_GAMMA = 0.00001

svm_model = SVC(kernel = SVM_KERNEL, C = SVM_C, gamma = SVM_GAMMA)
svm_model.fit(train_X, train_Label.values.ravel())

In [58]:
y_pred = svm_model.predict(val_X)
print("Accuracy:",metrics.accuracy_score(val_Label, y_pred))

Accuracy: 0.8324022346368715


In [59]:
test_pred = svm_model.predict(test_df)

In [60]:
result = pd.DataFrame(test_df_matcher['PassengerId'])
print(result.head(10))

   PassengerId
0          892
1          893
2          894
3          895
4          896
5          897
6          898
7          899
8          900
9          901


In [61]:
result.insert(1, "Survived", test_pred, True)
result["Survived"] = pd.to_numeric(result["Survived"], downcast="integer")
print(result.head(10))

   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
5          897         0
6          898         1
7          899         0
8          900         1
9          901         0


In [62]:
result.to_csv("out.csv", index=False)