설명

In [1]:
import pandas as pd
import numpy as np
filepath='D:/downloads/titanic/'

train = pd.read_csv(filepath+'train.csv')
test = pd.read_csv(filepath+'test.csv')
train_test_data = [train,test]

In [2]:
title_map = {
            'Mr':1,
            'Miss':2,
            'Mrs':3,
            'Master':4, 
}

def match_title(tag):
    try:
        return title_map[tag]
    except KeyError:
        return 5

for dataset in train_test_data:
        dataset['Title'] = dataset.Name.str.extract('([A-Za-z]+)\.')
        dataset.Title.replace(['Mlle','Ms'],'Miss',inplace=True)
        dataset.Title.replace(['Mme','Lady'],'Mrs',inplace=True)
        dataset.Title.replace(['Countess','Capt','Col','Don','Dr','Jonkheer','Major','Rev','Sir'],'Other',inplace=True)
        
        dataset.Title = dataset.Title.map(match_title)

train.Title.value_counts()

1    517
2    185
3    127
4     40
5     22
Name: Title, dtype: int64

In [3]:
for dataset in train_test_data:
    dataset.Sex = dataset.Sex.map({'female':1,'male':0})
train.Sex.value_counts()

0    577
1    314
Name: Sex, dtype: int64

In [4]:
for dataset in train_test_data:
    null_cnt = dataset.Embarked.isnull().sum()
    rand = np.random.choice(dataset.Embarked.value_counts().index,
                            size=null_cnt,
                            replace=True,
                            p=dataset.Embarked.value_counts(normalize=True))
    
    dataset.Embarked[dataset.Embarked.isnull()]=rand
    dataset.Embarked = dataset.Embarked.map({'S':0,'C':1,'Q':2})

train.Embarked.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.Embarked[dataset.Embarked.isnull()]=rand


0    644
1    169
2     78
Name: Embarked, dtype: int64

In [5]:
for dataset in train_test_data:
    dataset.Fare = dataset.Fare.fillna(dataset.Fare.median())
    dataset.Fare = pd.qcut(dataset.Fare,4).values.codes

train.Fare.value_counts()

1    224
0    223
3    222
2    222
Name: Fare, dtype: int64

In [6]:
for dataset in train_test_data:
    familysize = dataset.SibSp+dataset.Parch+1
    dataset['IsAlone']=familysize.apply(lambda x : 0 if x>1 else 1)
    
train.IsAlone.value_counts()

1    537
0    354
Name: IsAlone, dtype: int64

In [7]:
def match_randint(idx):
    global dataset,mas
    title = dataset.iloc[idx]['Title']
    return np.random.randint(mas.loc[title,'start'],mas.loc[title,'end'])

for dataset in train_test_data:
    mas = dataset.groupby(['Title'])['Age'].describe()[['mean','std']] # means and stds
    mas['start'] = mas['mean']-mas['std']
    mas['end'] = mas['mean']+mas['std']
    fill_array = pd.Series([match_randint(x) for x in dataset[dataset.Age.isnull()].index])
    dataset.Age = dataset.Age.fillna(fill_array)
    

In [8]:
for dataset in train_test_data:
    dataset.Age = pd.cut(dataset.Age,5).values.codes
    
train.Age.head(3)

0    1
1    2
2    1
Name: Age, dtype: int8

In [9]:
train['Cabin_category'] = train.Cabin.dropna().str.extract('([A-Z])')
test['Cabin_category'] = test.Cabin.dropna().str.extract('([A-Z])')
train['Cabin_category'].head(4)

0    NaN
1      C
2    NaN
3      C
Name: Cabin_category, dtype: object

In [10]:
def match_cabin(row):
    global cabin_df
    
    try:
        return np.random.choice(cabin_df.columns.values,1,p=cabin_df.loc[row.Pclass,row.Fare].values).item()
    except TypeError:
        return np.random.choice(cabin_df.columns.values,1).item()

for dataset in train_test_data:
    cabin_df = dataset.groupby(['Pclass','Fare'])['Cabin_category'].value_counts(normalize=True).unstack().fillna(0)
    null_set = dataset[dataset.Cabin.isnull()]
    dataset.Cabin_category = dataset.Cabin_category.fillna(null_set.apply(match_cabin,axis=1))

train.head(5)  # Cabin_category => NaN 값 채움

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,IsAlone,Cabin_category
0,1,0,3,"Braund, Mr. Owen Harris",0,1,1,0,A/5 21171,0,,0,1,0,F
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2,1,0,PC 17599,3,C85,1,3,0,C
2,3,1,3,"Heikkinen, Miss. Laina",1,1,0,0,STON/O2. 3101282,1,,0,2,1,E
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2,1,0,113803,3,C123,0,3,0,C
4,5,0,3,"Allen, Mr. William Henry",0,2,0,0,373450,1,,0,1,1,G


In [12]:
cabin_map = {'A':1,'B':2,'C':3,'D':4,
            'E':5,'F':6,'G':7,'T':8}

In [13]:
for dataset in train_test_data:
    dataset.Cabin = dataset.Cabin_category.map(cabin_map)
    
train.Cabin.head(5)

0    6
1    3
2    5
3    3
4    7
Name: Cabin, dtype: int64

In [15]:
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'IsAlone',
       'Cabin_category'],
      dtype='object')

In [16]:
features_drop=['Name','SibSp','Parch','Ticket','PassengerId','Cabin_category']
xTrain = train.drop(features_drop+['Survived'],axis=1)
yLabel = train['Survived']
xTest = test.drop(features_drop,axis=1)

In [17]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100)
model = random_forest.fit(xTrain,yLabel)
prediction = model.predict(xTest)
score = model.score(xTrain,yLabel)

In [18]:
test['Survived'] = prediction

In [20]:
result = test[['PassengerId','Survived']]
result.to_csv('RF1.csv',index=False)
print('File Saved')
print(f'Model Accuracy : {model.score(xTrain,yLabel)}')

File Saved
Model Accuracy : 0.8956228956228957


In [23]:
random_forest10000 = RandomForestClassifier(n_estimators=10000)
model10000 = random_forest10000.fit(xTrain,yLabel)
prediction = model10000.predict(xTest)
test['Survived'] = prediction
result = test[['PassengerId','Survived']]
result.to_csv('RF3.csv',index=False)
print('File Saved')
print(f'Model Accuracy : {model10000.score(xTrain,yLabel)}')

File Saved
Model Accuracy : 0.8956228956228957


In [25]:
n_estimator_list = [11,31,51]
max_depth_list = [3,5,7]
min_split_list = [4,6,8]

for ne in n_estimator_list:
    for md in max_depth_list:
        for ms in min_split_list:
            rf = RandomForestClassifier(n_estimators=ne,max_depth=md,n_jobs=-1,min_samples_split=ms)
            model = rf.fit(xTrain,yLabel)
            prediction = model.predict(xTest)
            test['Survived'] = prediction
#             result = test[['PassengerId','Survived']]
#             result.to_csv('RF3.csv',index=False)
#             print('File Saved')
            print(f'Model Accuracy with {ne} , {md} , {ms} : {model.score(xTrain,yLabel)}')

Model Accuracy with 11 , 3 , 4 : 0.8125701459034792
Model Accuracy with 11 , 3 , 6 : 0.792368125701459
Model Accuracy with 11 , 3 , 8 : 0.7934904601571269
Model Accuracy with 11 , 5 , 4 : 0.835016835016835
Model Accuracy with 11 , 5 , 6 : 0.8237934904601572
Model Accuracy with 11 , 5 , 8 : 0.8305274971941639
Model Accuracy with 11 , 7 , 4 : 0.8608305274971941
Model Accuracy with 11 , 7 , 6 : 0.8507295173961841
Model Accuracy with 11 , 7 , 8 : 0.8597081930415263
Model Accuracy with 31 , 3 , 4 : 0.8092031425364759
Model Accuracy with 31 , 3 , 6 : 0.8170594837261503
Model Accuracy with 31 , 3 , 8 : 0.792368125701459
Model Accuracy with 31 , 5 , 4 : 0.8338945005611672
Model Accuracy with 31 , 5 , 6 : 0.8372615039281706
Model Accuracy with 31 , 5 , 8 : 0.8372615039281706
Model Accuracy with 31 , 7 , 4 : 0.8630751964085297
Model Accuracy with 31 , 7 , 6 : 0.8585858585858586
Model Accuracy with 31 , 7 , 8 : 0.8574635241301908
Model Accuracy with 51 , 3 , 4 : 0.819304152637486
Model Accuracy w

In [30]:
from sklearn.model_selection import GridSearchCV

params = {'n_estimators':[11,31,51],
         'max_depth':[3,5,7],
         'min_samples_split':[4,6,8]}

rfModel = RandomForestClassifier(random_state=924 , n_jobs=-1)
gridCV = GridSearchCV(rfModel , param_grid=params , cv=5 , n_jobs=-1)
gridCV.fit(xTrain,yLabel)

UnicodeEncodeError: 'ascii' codec can't encode characters in position 18-20: ordinal not in range(128)