In [70]:
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier

# Read the training and test tables from the parent directory (train first).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('../train.csv', '../test.csv'))

# Hand-patch individual cells before feature engineering:
#   - training passengers 62 and 830 get Embarked = 'C'
#   - test passenger 1044 gets Fare = 13.6750
# NOTE(review): presumably these cells are missing in the raw CSVs — confirm.
df_train.loc[df_train.PassengerId.isin([62, 830]), 'Embarked'] = 'C'
df_test.loc[df_test.PassengerId == 1044, 'Fare'] = 13.6750

def title_to_num(title):
    """Encode a passenger title as a small integer category.

    'Master' -> 1, 'Miss' -> 2, 'Mr' -> 3, 'Mrs' -> 4.
    Any other value (including a missing title) -> 5.
    """
    known_titles = (('Master', 1), ('Miss', 2), ('Mr', 3), ('Mrs', 4))
    for known_title, code in known_titles:
        if title == known_title:
            return code
    # Catch-all bucket for every title not listed above.
    return 5
    

def impute_age(cols):
    """Return the age for one passenger, filling a missing age by class.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass), e.g. one row produced by
        ``df[['Age', 'Pclass']].apply(impute_age, axis=1)``.

    Returns
    -------
    The original Age when it is not null; otherwise a fixed default of
    39 for Pclass 1, 30 for Pclass 2 and 25 for any other class.
    """
    # A row Series is indexed by the labels 'Age'/'Pclass', so cols[0] would
    # rely on the deprecated positional fallback of Series.__getitem__.
    # Use .iloc when it exists; plain lists/tuples are indexed directly.
    positional = getattr(cols, 'iloc', cols)
    Age = positional[0]
    Pclass = positional[1]

    if not pd.isnull(Age):
        return Age

    if Pclass == 1:
        return 39
    if Pclass == 2:
        return 30
    return 25
    
# Apply the same in-place feature engineering to the training and test frames.
# NOTE: every step overwrites its input column, so this cell is not idempotent;
# reload the CSVs before running it a second time.
data = [df_train,df_test]

for df in data:
    # Fill missing ages with the per-class defaults defined in impute_age.
    df['Age'] = df[['Age','Pclass']].apply(impute_age,axis = 1)

    # Integer-encode the categorical columns. The int cast on Embarked raises
    # if any value is missing or is not one of S/C/Q.
    df['Sex'] = df['Sex'].map({'male':0,'female':1})
    df['Embarked'] = df['Embarked'].map({'S':0,'C':1,'Q':2}).astype(int)

    # Bin Fare into ordinal codes 0..3 with right-closed edges:
    #   <= 7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, > 31 -> 3.
    # pd.cut computes every code from the raw value in one pass, instead of
    # overwriting the column band by band (which only works because of the
    # order in which the bands are applied).
    df['Fare'] = pd.cut(
        df['Fare'],
        bins=[-float('inf'), 7.91, 14.454, 31, float('inf')],
        labels=False,
    ).astype(int)

    # Bin Age the same way: <= 16 -> 0, (16, 32] -> 1, (32, 48] -> 2, > 48 -> 3.
    df['Age'] = pd.cut(
        df['Age'],
        bins=[-float('inf'), 16, 32, 48, float('inf')],
        labels=False,
    ).astype(int)

    # Family size counts the passenger plus siblings/spouses and parents/children.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    # IsAlone is 1 exactly when the passenger travels with no relatives.
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Replace the full name with the word that precedes the first '.',
    # then encode it as an integer. The pattern is a raw string so that
    # '\.' is not parsed as an (invalid) string escape sequence.
    df['Name'] = df.Name.str.extract(r'([A-Za-z]+)\.',expand =False)
    df['Name'] = [title_to_num(i) for i in df['Name']]
    

    

In [71]:
# Remove the columns that are not used as model inputs from both frames (in place).
df_train.drop(columns=['Cabin', 'Ticket', 'SibSp', 'Parch'], inplace=True)
df_test.drop(columns=['Cabin', 'Ticket', 'SibSp', 'Parch'], inplace=True)

# info() prints its summary itself and returns None, so the surrounding
# print() adds the trailing "None" line seen in the cell output.
print(df_train.info())

# Target vector and feature matrices; PassengerId is an identifier, not a feature.
Y_train = df_train['Survived']
X_train = df_train.drop(columns=['PassengerId', 'Survived'])
X_test = df_test.drop(columns='PassengerId').copy()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null int64
Sex            891 non-null int64
Age            891 non-null int64
Fare           891 non-null int64
Embarked       891 non-null int64
FamilySize     891 non-null int64
IsAlone        891 non-null int64
dtypes: int64(10)
memory usage: 69.7 KB
None


In [72]:

# Fit a random forest with a fixed seed, then predict the test set.
forest = RandomForestClassifier(random_state=1).fit(X_train, Y_train)
Y_pred = forest.predict(X_test)

# Two-column submission table: passenger id and predicted label.
submission = pd.DataFrame(dict(PassengerId=df_test['PassengerId'], Survived=Y_pred))
submission.to_csv("Submit.csv", index=False)

In [73]:
# Preview the first five rows of the engineered training features.
X_train.head(n=5)

Unnamed: 0,Pclass,Name,Sex,Age,Fare,Embarked,FamilySize,IsAlone
0,3,3,0,1,0,0,2,0
1,1,4,1,2,3,1,2,0
2,3,2,1,1,1,0,1,1
3,1,4,1,2,3,0,2,0
4,3,3,0,2,1,0,1,1


In [74]:
# Print each feature name next to its importance in the fitted forest,
# rounded to four decimals, in column order.
for feature_name, importance in zip(X_train.columns, forest.feature_importances_):
    print(feature_name, round(importance, 4))

Pclass 0.1323
Name 0.2485
Sex 0.2155
Age 0.0939
Fare 0.1066
Embarked 0.0583
FamilySize 0.1343
IsAlone 0.0106


In [75]:
def calc(df_train, df_test, output_path="Submit.csv"):
    """Fit a random forest on ``df_train``, predict ``df_test`` and write a submission CSV.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; must contain 'PassengerId' and 'Survived'. All other
        columns are used as features.
    df_test : pandas.DataFrame
        Test frame; must contain 'PassengerId'. All other columns are used as
        features and are passed to ``predict`` as-is.
    output_path : str, optional
        Where the submission CSV is written. Defaults to "Submit.csv", the
        location this function always used before the parameter existed.

    Returns
    -------
    RandomForestClassifier
        The fitted model, so callers can inspect ``feature_importances_`` or
        call ``score`` without repeating the fitting code.
    """
    X_train = df_train.drop(['PassengerId','Survived'],axis=1)
    Y_train = df_train['Survived']
    X_test = df_test.drop('PassengerId',axis= 1).copy()

    # Fixed random_state keeps the fitted forest reproducible between runs.
    forest = RandomForestClassifier(random_state = 1)
    forest.fit(X_train,Y_train)
    Y_pred= forest.predict(X_test)

    submission = pd.DataFrame({
        'PassengerId':df_test['PassengerId'],
        'Survived':Y_pred
    })
    submission.to_csv(output_path,index = False)
    return forest

In [76]:
# Drop IsAlone from both frames in place before refitting; it has the lowest
# value (0.0106) in the importance listing printed above.
# A broader variant that also removes 'FamilySize' and 'Embarked' is left disabled:
#   drop(['FamilySize','IsAlone','Embarked'], axis=1, inplace=True)
df_train.drop(columns='IsAlone', inplace=True)
df_test.drop(columns='IsAlone', inplace=True)

In [80]:
    # Rebuild the feature matrices from the current frames and refit the forest.
    # This repeats the body of calc() at cell level, so forest, X_train and
    # Y_train remain available to the cells that follow.
    Y_train = df_train['Survived']
    X_train = df_train.drop(columns=['PassengerId', 'Survived'])
    X_test = df_test.drop(columns='PassengerId').copy()

    forest = RandomForestClassifier(random_state=1).fit(X_train, Y_train)
    Y_pred = forest.predict(X_test)

    # Overwrites the submission file produced by the earlier fit.
    submission = pd.DataFrame(dict(PassengerId=df_test['PassengerId'], Survived=Y_pred))
    submission.to_csv("Submit.csv", index=False)

In [78]:
# Print the importance of each remaining feature in the refitted forest,
# rounded to four decimals, in column order.
for feature_name, importance in zip(X_train.columns, forest.feature_importances_):
    print(feature_name, round(importance, 4))
    

Pclass 0.1453
Name 0.1912
Sex 0.2717
Age 0.0887
Fare 0.0975
Embarked 0.0542
FamilySize 0.1514


In [81]:
# Accuracy as a percentage with two decimals. It is measured on X_train/Y_train,
# the same data the forest was fitted on, so it is not a held-out estimate.
train_accuracy_pct = forest.score(X_train, Y_train) * 100
print(round(train_accuracy_pct, 2), '%')

87.54 %
