In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
# Load 'train.csv' (path is relative to the working directory) into `df`.
# Later cells work on a copy, so `df` stays as the raw reference frame.
df = pd.read_csv(filepath_or_buffer='train.csv')

# Print column names, non-null counts and dtypes to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [27]:
# Work on an independent (deep) copy so the raw frame `df` is never modified.
data = df.copy(deep=True)

In [28]:
# Encode 'Sex' as an integer feature: male -> 0, female -> 1.
# The mapping reads from the untouched raw frame `df` rather than from `data`
# itself, so re-running this cell is idempotent. Mapping the already-encoded
# 0/1 column a second time would turn every value into NaN.
data['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

In [10]:
import numpy as np

In [11]:
# Number of passengers whose 'Age' is missing, plus the mean and standard
# deviation of the observed ages (pandas skips NaN in both statistics).
size = data['Age'].isnull().sum()
mean_age = data['Age'].mean()
std_age = data['Age'].std()

# Draw one synthetic age per missing entry from Normal(mean_age, std_age).
# A seeded generator makes the imputed values, and every score computed from
# them further down, reproducible on Restart & Run All.
rng = np.random.default_rng(0)
rand_data = rng.normal(loc=mean_age, scale=std_age, size=size)

正規分布から生成した年齢が負の値になった場合は、符号を反転させて正の値にする。

In [25]:
# A normal draw can be negative, which is not a valid age: reflect such values
# onto the positive side by taking the magnitude of every sample.
rand_data = np.abs(rand_data)

In [14]:
# Show the row labels of the passengers whose 'Age' is missing.
missing_age_mask = data['Age'].isna()
data.loc[missing_age_mask].index

Int64Index([  5,  17,  19,  26,  28,  29,  31,  32,  36,  42,
            ...
            832, 837, 839, 846, 849, 859, 863, 868, 878, 888],
           dtype='int64', length=177)

In [29]:
# Attach the generated ages to the row labels of the missing-'Age' rows so
# that a later fill can align them by index.
missing_age_idx = data.loc[data['Age'].isna()].index
rand_data_s = pd.Series(data=rand_data, index=missing_age_idx)

In [30]:
# Fill only the missing ages; fillna with a Series aligns on the index, so
# each gap receives the generated value carrying the same row label.
age_filled = data['Age'].fillna(value=rand_data_s)
data['Age'] = age_filled

In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) of 'Age', shown
# after the fill to check the count and the range of the values.
data['Age'].describe()

count    891.000000
mean      29.768111
std       14.355925
min        0.064919
25%       21.000000
50%       28.431079
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [32]:
# Print the per-column non-null counts and dtypes of the working frame `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 83.7+ KB


In [33]:
# Pairwise Pearson correlation between the numeric columns.
# `data` still contains object columns (Name, Ticket, Cabin, Embarked). Since
# pandas 2.0, DataFrame.corr() no longer drops them silently and raises
# instead, so the numeric columns are selected explicitly. This form gives the
# same table on older pandas versions as well.
data.select_dtypes(include='number').corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.049319,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,0.543351,-0.042712,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.301505,0.083081,0.018443,-0.5495
Sex,-0.042939,0.543351,-0.1319,1.0,-0.082543,0.114631,0.245489,0.182333
Age,0.049319,-0.042712,-0.301505,-0.082543,1.0,-0.225433,-0.168596,0.087093
SibSp,-0.057527,-0.035322,0.083081,0.114631,-0.225433,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,0.245489,-0.168596,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.182333,0.087093,0.159651,0.216225,1.0


In [37]:
# Feature matrix: the four columns used as model inputs.
feature_cols = ['Pclass', 'Sex', 'Age', 'Fare']
x = data[feature_cols]

# Target vector: the 'Survived' label.
y = data['Survived']

In [38]:
# Hold out 30% of the rows for the final evaluation.
# random_state pins the shuffle so that the split, and therefore the scores
# reported below, are the same on every run.
train_d, test_d, train_l, test_l = train_test_split(
    x, y, test_size=0.3, random_state=0
)

In [39]:
# Candidate values for the SVC regularisation strength `C` and kernel
# coefficient `gamma`; every combination of the two lists is evaluated.
C = [0.01, 0.1, 1, 5, 10, 50, 100]
gamma = [0.001, 0.01, 0.1, 0.5, 1, 5, 10]
param_grid = [dict(C=C, gamma=gamma)]

# Exhaustive search with 4-fold cross-validation on the training split only.
# NOTE(review): the features are passed to SVC without scaling; confirm that
# this is intended.
clf = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=4)
clf.fit(train_d, train_l)

GridSearchCV(cv=4, estimator=SVC(),
             param_grid=[{'C': [0.01, 0.1, 1, 5, 10, 50, 100],
                          'gamma': [0.001, 0.01, 0.1, 0.5, 1, 5, 10]}])

In [40]:
# Report the best mean cross-validation score and the parameter combination
# that achieved it, one per line.
print(f"Best score: {clf.best_score_}", clf.best_params_, sep="\n")

Best score: 0.7769230769230768
{'C': 50, 'gamma': 0.001}


In [41]:
# Train a fresh SVC with the parameters selected by the grid search and
# predict the held-out rows.
# The parameters are read from `clf.best_params_` instead of being copied by
# hand from a previous run's printed output, so this cell stays correct when
# the search selects a different combination.
clf2 = SVC(**clf.best_params_)
clf2.fit(train_d, train_l)
res = clf2.predict(test_d)

In [43]:
# Accuracy on the held-out split. The documented signature is
# accuracy_score(y_true, y_pred), so the true labels go first and the
# predictions second.
score = accuracy_score(test_l, res)
print(f"Score by tuning params: {score*100:.3f}%")

Score by tuning params: 79.104%
