## Import necessary modules

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

## Import the dataset

In [9]:
# Load the training data (features plus the Survived label) into a DataFrame.
dataset=pd.read_csv('train.csv')

In [5]:
# Summarize dtypes and non-null counts per column to spot missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [13]:
# Separate the target column from the remaining feature columns.
y = dataset['Survived']
X = dataset.drop(columns=['Survived'])

In [15]:
# Preview the first target labels.
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [16]:
# Preview the first feature rows.
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Check for Categorical Attributes

In [25]:
# Frequency of each distinct Pclass value.
print(X['Pclass'].value_counts())

3    491
1    216
2    184
Name: Pclass, dtype: int64


In [26]:
# Frequency of each distinct Age value.
print(X['Age'].value_counts())

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: Age, Length: 88, dtype: int64


In [27]:
# Frequency of each distinct Cabin value.
print(X['Cabin'].value_counts())

B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: Cabin, Length: 147, dtype: int64


In [28]:
# Frequency of each distinct Embarked value.
print(X['Embarked'].value_counts())

S    644
C    168
Q     77
Name: Embarked, dtype: int64


## Custom Transformer for Numerical Attributes

In [22]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the requested columns of its input.

    Parameters
    ----------
    attribute_names
        Column labels used to index the input in ``transform``; in this
        notebook a list of column names is passed.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return the selector itself so it can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the stored column labels."""
        columns = self.attribute_names
        return X[columns]
    

In [23]:
# Numerical branch: pick the numeric columns, then replace missing values
# (Age has gaps) with the median of the column.
num_pipeline = Pipeline(steps=[
    ('select_num', DataFrameSelector(['Age', 'Parch', 'Fare', 'SibSp'])),
    ('imputer', SimpleImputer(strategy='median')),
])

In [24]:
# Sanity check: fit the numerical branch on the features and display the result.
num_pipeline.fit_transform(X)

array([[22.    ,  0.    ,  7.25  ,  1.    ],
       [38.    ,  0.    , 71.2833,  1.    ],
       [26.    ,  0.    ,  7.925 ,  0.    ],
       ...,
       [28.    ,  2.    , 23.45  ,  1.    ],
       [26.    ,  0.    , 30.    ,  0.    ],
       [32.    ,  0.    ,  7.75  ,  0.    ]])

## Custom Transformer for String Categorical Attributes

In [31]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each DataFrame column with that column's most common value."""

    def fit(self, X, y=None):
        """Record the first ``value_counts()`` entry of every column of ``X``.

        ``value_counts()`` leaves out missing values by default, so a column
        without any non-missing entry has nothing at position 0 and the
        lookup fails.
        """
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            top_values.append(counts.index[0])
        self.most_frequent = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the values recorded in ``fit``."""
        fill_values = self.most_frequent
        return X.fillna(fill_values)

In [33]:
# Categorical branch: pick the categorical columns, fill gaps (Embarked has
# two missing values) with the most frequent value, then one-hot encode.
#
# The encoder must return a dense array. scikit-learn 1.2 renamed the
# `sparse` argument to `sparse_output` and later releases removed `sparse`,
# so try the new keyword first and fall back for older installations.
try:
    dense_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only knows `sparse`
    dense_encoder = OneHotEncoder(sparse=False)

cat_pipeline=Pipeline([
    ("select_category",DataFrameSelector(["Pclass","Sex","Embarked"])),
    ("imputer",MostFrequentImputer()),
    ("category_encoder",dense_encoder)
])

In [34]:
# Sanity check: fit the categorical branch on the features and display the result.
cat_pipeline.fit_transform(X)

array([[0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

## Feature Union

In [36]:
# Apply both branches to the same input and join their outputs side by side:
# the numerical columns come first, followed by the one-hot columns.
preprocess_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])


In [37]:
# Fit the full preprocessing on the training features and keep the resulting array.
X_train=preprocess_pipeline.fit_transform(X)

In [38]:
# Display the preprocessed training features.
X_train

array([[22.    ,  0.    ,  7.25  , ...,  0.    ,  0.    ,  1.    ],
       [38.    ,  0.    , 71.2833, ...,  1.    ,  0.    ,  0.    ],
       [26.    ,  0.    ,  7.925 , ...,  0.    ,  0.    ,  1.    ],
       ...,
       [28.    ,  2.    , 23.45  , ...,  0.    ,  0.    ,  1.    ],
       [26.    ,  0.    , 30.    , ...,  1.    ,  0.    ,  0.    ],
       [32.    ,  0.    ,  7.75  , ...,  0.    ,  1.    ,  0.    ]])

## Train SVM

In [40]:
# Support-vector classifier; only gamma is set explicitly.
# NOTE(review): the numerical features reach the classifier unscaled (the
# preprocessing has no scaling step) - worth checking whether scaling them
# changes the cross-validation score.
svm_clf=SVC(gamma='auto')

In [41]:
# Train the classifier on all preprocessed training rows.
svm_clf.fit(X_train,y)

SVC(gamma='auto')

## Cross-validation score on the training data

In [44]:
# Mean of the 3-fold cross-validation scores on the training data.
# NOTE(review): X_train comes from preprocessing that was fitted on every
# training row, so each validation fold contributed to the imputation values;
# consider putting preprocessing and classifier into one Pipeline - confirm.
cross_val_score(svm_clf,X_train,y,cv=3).mean()

0.7059483726150392

## Testing time!

In [45]:
# Load the test features into a DataFrame.
X_test=pd.read_csv('test.csv')

In [46]:
# Preview the first test rows.
X_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [53]:
# Reuse the preprocessing that was fitted on the training data. Calling
# fit_transform here would re-learn the imputation values and one-hot
# categories from the test set, so the features handed to the classifier
# would not be guaranteed to match the encoding it was trained on.
# Note: X_test is rebound from the raw DataFrame to the feature array, so
# reload test.csv before running this cell a second time.
X_test=preprocess_pipeline.transform(X_test)

In [56]:
# Start an empty frame that will collect the output columns.
y_test=pd.DataFrame()

In [59]:
# Take the passenger ids from a fresh read of test.csv; X_test no longer
# holds the raw DataFrame at this point.
y_test["PassengerId"]=pd.read_csv('test.csv')["PassengerId"]

In [60]:
# Preview the output frame so far.
y_test.head()

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


In [61]:
# Predict with the fitted classifier and store the labels next to the ids.
y_test['Survived']=svm_clf.predict(X_test)

In [67]:
# Write the PassengerId/Survived columns to results.csv without the index column.
y_test.to_csv('results.csv',index=False)