In [1]:
import pandas as pd
import numpy as np

In [2]:
filepath='D:/downloads/'
filename = 'train.csv'

In [3]:
df = pd.read_csv(filepath+filename)
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


## 1. survived 컬럼을 그룹화 -> age 열 평균을 구하여 비교

In [5]:
sd_age_mean = df.groupby(['Survived'])['Age'].mean()
sd_age_mean.index=['Dead','Survived']
sd_age_mean

Dead        30.626179
Survived    28.343690
Name: Age, dtype: float64

## 2. fare의 평균(임계값) -> fare 열 값을 이진화

In [9]:
from sklearn.preprocessing import Binarizer as Bz
fare_mean = df['Fare'].mean()
bz = Bz(threshold=fare_mean).fit_transform(np.array(df['Fare']).reshape(-1,1))
df['Fare_bi']=bz
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_bi
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.0


## 3. embarked 컬럼값에 대해 각 승선항 별 승선인원수

In [11]:
df.groupby('Embarked').apply(lambda x:len(x))

Embarked
C    168
Q     77
S    644
dtype: int64

## 4~6. sibsp, parch, fare, 컬럼값에 대해 표준화

In [15]:
from sklearn.preprocessing import StandardScaler as SS
sib_mean,sib_std=df['SibSp'].mean(),df['SibSp'].std()
par_mean,par_std=df['Parch'].mean(),df['Parch'].std()
fare_mean,fare_std=df['Fare'].mean(),df['Fare'].std()

df['SibSp'] = SS().fit_transform(np.array(df['SibSp']).reshape(-1,1))
df['Parch'] = SS().fit_transform(np.array(df['Parch']).reshape(-1,1))
df['Fare']= SS().fit_transform(np.array(df['Fare']).reshape(-1,1))

df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_bi
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,0.432793,-0.473674,A/5 21171,-0.502445,,S,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,0.432793,-0.473674,PC 17599,0.786845,C85,C,1.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,-0.474545,-0.473674,STON/O2. 3101282,-0.488854,,S,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,0.432793,-0.473674,113803,0.42073,C123,S,1.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,-0.474545,-0.473674,373450,-0.486337,,S,0.0


## 7. pclass와 survived 간에 상관관계 및 해석

In [27]:
S_mean,S_std=df['Survived'].mean(),df['Survived'].std()
P_mean,P_std=df['Pclass'].mean(),df['Pclass'].std()
pearson_series=((df['Survived']-S_mean)*(df['Pclass']-P_mean)/((df.shape[0]-1)*S_std*P_std))
print(f'Pearson Constant : {pearson_series.sum()}')

Pearson Constant : -0.33848103596101364


## 8. sibsp : 1, parch:2, fare:50인 승객이 있다. 이 승객은 살았을까 죽었을까?  
```
해결방법 예시) 3차원 공간(sibsp, parch, fare)에서 (1,2,50)의 위치와 가장  
  
가까운 위치에 있는 데이터 11건을 검색한다.  
  
-> 검색된 11건의 데이터에 대한 생존여부(survived)를 추출한다  
  
-> 다수결로 생존여부를 판단한다  
```

In [28]:
# standarize
vector_in = np.array([1,2,50])
means = np.array([sib_mean,par_mean,fare_mean])
stds = np.array([sib_std,par_std,fare_std])
vector_in = (vector_in-means)/stds
vector_in

array([0.43255043, 2.0078057 , 0.35811158])

In [34]:
def cal_dist(element,piv=vector_in):
    ret = piv-element[['SibSp','Parch','Fare']]
    ret = (ret*ret).sum()
    return ret**0.5

df['Dist'] = df.apply(cal_dist,axis=1)
top_11 = df.sort_values(by='Dist').iloc[:12]
top_11

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_bi,Dist
608,609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise La...",female,22.0,0.432793,2.008933,SC/Paris 2123,0.188763,,C,1.0,0.169353
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.0,0.432793,2.008933,SC/Paris 2123,0.188763,,C,1.0,0.169353
685,686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,0.432793,2.008933,SC/Paris 2123,0.188763,,C,1.0,0.169353
754,755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48.0,0.432793,2.008933,220845,0.660333,,S,1.0,0.302224
615,616,1,2,"Herman, Miss. Alice",female,24.0,0.432793,2.008933,220845,0.660333,,S,1.0,0.302224
58,59,1,2,"West, Miss. Constance Mirium",female,5.0,0.432793,2.008933,C.A. 34651,-0.089684,,S,0.0,0.447797
472,473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33.0,0.432793,2.008933,C.A. 34651,-0.089684,,S,0.0,0.447797
450,451,0,2,"West, Mr. Edwy Arthur",male,36.0,0.432793,2.008933,C.A. 34651,-0.089684,,S,0.0,0.447797
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,0.432793,2.008933,W./C. 6607,-0.176263,,S,0.0,0.534376
783,784,0,3,"Johnston, Mr. Andrew G",male,,0.432793,2.008933,W./C. 6607,-0.176263,,S,0.0,0.534376


In [36]:
top_11_group = top_11.groupby(['Survived']).apply(lambda x:len(x))
top_11_group.index=['Dead','Survived']
top_11_group

Dead        5
Survived    7
dtype: int64

In [38]:
print(f'Expected status from input:{vector_in} =>' , end=' ')
print('Survived' if top_11_group.loc['Survived']>top_11_group.loc['Dead'] else 'Dead')

Expected status from input:[0.43255043 2.0078057  0.35811158] => Survived
