In [1]:
import pandas as pd

In [2]:
train_data = pd.read_csv('train.csv')[['PassengerId', 'Name', 'Survived']]
test_data = pd.read_csv('test.csv')[['PassengerId', 'Name']]

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Name,Survived
0,1,"Braund, Mr. Owen Harris",0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1
2,3,"Heikkinen, Miss. Laina",1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1
4,5,"Allen, Mr. William Henry",0


In [4]:
test_data.head()

Unnamed: 0,PassengerId,Name
0,892,"Kelly, Mr. James"
1,893,"Wilkes, Mrs. James (Ellen Needs)"
2,894,"Myles, Mr. Thomas Francis"
3,895,"Wirz, Mr. Albert"
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)"


In [5]:
#Extracting Surname and Title from Name col.
train_data['Title'], test_data['Title'] = (df['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split()[0]) for df in [train_data, test_data])
train_data['Surname'], test_data['Surname'] = (df['Name'].apply(lambda x: x.split(',')[0]) for df in [train_data, test_data])

In [6]:
data = train_data.append(test_data)
data.groupby('Title')['PassengerId'].count()

Title
Capt.          1
Col.           4
Don.           1
Dona.          1
Dr.            8
Jonkheer.      1
Lady.          1
Major.         2
Master.       61
Miss.        260
Mlle.          2
Mme.           1
Mr.          757
Mrs.         197
Ms.            2
Rev.           8
Sir.           1
the            1
Name: PassengerId, dtype: int64

In [7]:
#We incorrectly extracted 'the' as a Title
data[data.Title=='the']

Unnamed: 0,PassengerId,Name,Survived,Title,Surname
759,760,"Rothes, the Countess. of (Lucy Noel Martha Dye...",1.0,the,Rothes


In [8]:
#Correcting the Title of PassengerId==760
train_data.loc[(train_data.PassengerId==760), 'Title'] = 'Countess.'
data.loc[(train_data.PassengerId==760), 'Title'] = 'Countess.'

In [9]:
#Creating a dictionary to map Title to relevant Category
TitleSexDict = {'Capt.': 'man', 'Don.': 'man', 'Major.': 'man', 'Col.': 'man', 'Rev.': 'man', 'Dr.': 'man', 'Sir.': 'man',
                'Mr.': 'man', 'Jonkheer': 'man', 'Dona.': 'woman', 'Countess.': 'woman', 'Mme.': 'woman', 'Mlle.': 'woman',
                'Ms.': 'woman', 'Miss.': 'woman', 'Lady.': 'woman', 'Mrs.': 'woman', 'Master.': 'boy'}

In [10]:
train_data['Category'], test_data['Category'], data['Category'] = (df.Title.map(TitleSexDict) for df in [train_data, test_data, data])

In [11]:
data.head()

Unnamed: 0,PassengerId,Name,Survived,Title,Surname,Category
0,1,"Braund, Mr. Owen Harris",0.0,Mr.,Braund,man
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1.0,Mrs.,Cumings,woman
2,3,"Heikkinen, Miss. Laina",1.0,Miss.,Heikkinen,woman
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1.0,Mrs.,Futrelle,woman
4,5,"Allen, Mr. William Henry",0.0,Mr.,Allen,man


Now we have to identify woman-child-groups e.g., women and children with a common Surname. For that, we'll create a temporary dataset from combined data excluding men and then take the frequency of Surname in a dictionary.

In [12]:
temp = train_data[(train_data.Category!='man')]
temp.head()

Unnamed: 0,PassengerId,Name,Survived,Title,Surname,Category
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,Mrs.,Cumings,woman
2,3,"Heikkinen, Miss. Laina",1,Miss.,Heikkinen,woman
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,Mrs.,Futrelle,woman
7,8,"Palsson, Master. Gosta Leonard",0,Master.,Palsson,boy
8,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,Mrs.,Johnson,woman


In [13]:
SurnameDict = temp.groupby('Surname')['PassengerId'].count().to_dict()
SurnameDict

{'Abbott': 1,
 'Abelson': 1,
 'Ahlin': 1,
 'Aks': 1,
 'Allen': 1,
 'Allison': 3,
 'Andersen-Jensen': 1,
 'Andersson': 7,
 'Andrews': 1,
 'Angle': 1,
 'Appleton': 1,
 'Arnold-Franchi': 1,
 'Asplund': 4,
 'Astor': 1,
 'Attalah': 1,
 'Aubart': 1,
 'Ayoub': 1,
 'Backstrom': 1,
 'Baclini': 4,
 'Ball': 1,
 'Barbara': 2,
 'Barber': 1,
 'Baxter': 1,
 'Bazzani': 1,
 'Beane': 1,
 'Becker': 2,
 'Beckwith': 1,
 'Bidois': 1,
 'Bishop': 1,
 'Bissette': 1,
 'Bonnell': 1,
 'Boulos': 2,
 'Bourke': 2,
 'Bowerman': 1,
 'Brown': 3,
 'Burns': 1,
 'Buss': 1,
 'Bystrom': 1,
 'Cacic': 1,
 'Caldwell': 2,
 'Cameron': 1,
 'Canavan': 1,
 'Caram': 1,
 'Carr': 1,
 'Carter': 4,
 'Chambers': 1,
 'Cherry': 1,
 'Chibnall': 1,
 'Christy': 1,
 'Clarke': 1,
 'Cleaver': 1,
 'Collyer': 2,
 'Compton': 1,
 'Connolly': 1,
 'Coutts': 2,
 'Crosby': 1,
 'Cumings': 1,
 'Dahlberg': 1,
 'Danbom': 1,
 'Davies': 1,
 'Davis': 1,
 'Davison': 1,
 'Dean': 1,
 'Devaney': 1,
 'Dick': 1,
 'Dodge': 1,
 'Doling': 2,
 'Dowdell': 1,
 'Drew': 1,


In [14]:
temp[(temp.Surname=='Allison')] #seems to work fine

Unnamed: 0,PassengerId,Name,Survived,Title,Surname,Category
297,298,"Allison, Miss. Helen Loraine",0,Miss.,Allison,woman
305,306,"Allison, Master. Hudson Trevor",1,Master.,Allison,boy
498,499,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,Mrs.,Allison,woman


In [15]:
temp['SurnameFreq'] = temp.Surname.map(SurnameDict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['SurnameFreq'] = temp.Surname.map(SurnameDict)


In [16]:
temp.head()

Unnamed: 0,PassengerId,Name,Survived,Title,Surname,Category,SurnameFreq
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,Mrs.,Cumings,woman,1
2,3,"Heikkinen, Miss. Laina",1,Miss.,Heikkinen,woman,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,Mrs.,Futrelle,woman,1
7,8,"Palsson, Master. Gosta Leonard",0,Master.,Palsson,boy,4
8,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,Mrs.,Johnson,woman,3


In [17]:
temp[(temp.Surname=='Palsson')]

Unnamed: 0,PassengerId,Name,Survived,Title,Surname,Category,SurnameFreq
7,8,"Palsson, Master. Gosta Leonard",0,Master.,Palsson,boy,4
24,25,"Palsson, Miss. Torborg Danira",0,Miss.,Palsson,woman,4
374,375,"Palsson, Miss. Stina Viola",0,Miss.,Palsson,woman,4
567,568,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",0,Mrs.,Palsson,woman,4


In [18]:
train_data.loc[(train_data.Category!='man'), 'SurnameFreq'] = train_data.Surname.map(SurnameDict)

In [19]:
train_data.head()

Unnamed: 0,PassengerId,Name,Survived,Title,Surname,Category,SurnameFreq
0,1,"Braund, Mr. Owen Harris",0,Mr.,Braund,man,
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,Mrs.,Cumings,woman,1.0
2,3,"Heikkinen, Miss. Laina",1,Miss.,Heikkinen,woman,1.0
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,Mrs.,Futrelle,woman,1.0
4,5,"Allen, Mr. William Henry",0,Mr.,Allen,man,


In [20]:
train_data[train_data.Surname=='Allen'] #our neat little trick worked

Unnamed: 0,PassengerId,Name,Survived,Title,Surname,Category,SurnameFreq
4,5,"Allen, Mr. William Henry",0,Mr.,Allen,man,
730,731,"Allen, Miss. Elisabeth Walton",1,Miss.,Allen,woman,1.0


We now have a count of Surname occuring in woman and male child passengers. We can deduce whether a passenger belonged to a woman-child-group or not based on whether or not SurnameFreq>1 or =1.

In [21]:
train_data.loc[(train_data.SurnameFreq>1), 'Group'] = 1

In [22]:
#The following 142 passengers belong to woman-child groups
train_data[(train_data.Group==1)]

Unnamed: 0,PassengerId,Name,Survived,Title,Surname,Category,SurnameFreq,Group
7,8,"Palsson, Master. Gosta Leonard",0,Master.,Palsson,boy,4.0,1.0
8,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,Mrs.,Johnson,woman,3.0,1.0
10,11,"Sandstrom, Miss. Marguerite Rut",1,Miss.,Sandstrom,woman,2.0,1.0
16,17,"Rice, Master. Eugene",0,Master.,Rice,boy,5.0,1.0
18,19,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",0,Mrs.,Vander Planke,woman,2.0,1.0
...,...,...,...,...,...,...,...,...
858,859,"Baclini, Mrs. Solomon (Latifa Qurban)",1,Mrs.,Baclini,woman,4.0,1.0
863,864,"Sage, Miss. Dorothy Edith ""Dolly""",0,Miss.,Sage,woman,4.0,1.0
869,870,"Johnson, Master. Harold Theodor",1,Master.,Johnson,boy,3.0,1.0
885,886,"Rice, Mrs. William (Margaret Norton)",0,Mrs.,Rice,woman,5.0,1.0


In [52]:
temp = train_data[(train_data.Group==1)]

In [56]:
dic = temp.groupby('Surname')['Survived'].mean().to_dict()

In [58]:
print(dic)

{'Allison': 0.3333333333333333, 'Andersson': 0.14285714285714285, 'Asplund': 0.75, 'Baclini': 1.0, 'Barbara': 0.0, 'Becker': 1.0, 'Boulos': 0.0, 'Bourke': 0.0, 'Brown': 1.0, 'Caldwell': 1.0, 'Carter': 0.75, 'Collyer': 1.0, 'Coutts': 1.0, 'Doling': 1.0, 'Ford': 0.0, 'Fortune': 1.0, 'Goldsmith': 1.0, 'Goodwin': 0.0, 'Graham': 1.0, 'Hamalainen': 1.0, 'Harper': 1.0, 'Hart': 1.0, 'Hays': 1.0, 'Herman': 1.0, 'Hippach': 1.0, 'Johnson': 1.0, 'Jussila': 0.0, 'Kelly': 1.0, 'Laroche': 1.0, 'Lefebre': 0.0, 'Mellinger': 1.0, 'Moor': 1.0, 'Moubarek': 1.0, 'Murphy': 1.0, 'Navratil': 1.0, 'Newell': 1.0, 'Nicola-Yarred': 1.0, 'Palsson': 0.0, 'Panula': 0.0, 'Peter': 1.0, 'Quick': 1.0, 'Rice': 0.0, 'Richards': 1.0, 'Ryerson': 1.0, 'Sage': 0.0, 'Sandstrom': 1.0, 'Skoog': 0.0, 'Strom': 0.0, 'Taussig': 1.0, 'Van Impe': 0.0, 'Vander Planke': 0.0, 'West': 1.0, 'Wick': 1.0, 'Zabour': 0.0}


In [59]:
temp['SurnameSurvival'] = temp.Surname.map(dic)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['SurnameSurvival'] = temp.Surname.map(dic)


In [60]:
temp

Unnamed: 0,PassengerId,Name,Survived,Title,Surname,Category,SurnameFreq,Group,SurnameSurvival,AdjustedSurvival,predict
7,8,"Palsson, Master. Gosta Leonard",0,Master.,Palsson,boy,4.0,1.0,0.0,0.0,0.0
8,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,Mrs.,Johnson,woman,3.0,1.0,1.0,1.0,1.0
10,11,"Sandstrom, Miss. Marguerite Rut",1,Miss.,Sandstrom,woman,2.0,1.0,1.0,1.0,1.0
16,17,"Rice, Master. Eugene",0,Master.,Rice,boy,5.0,1.0,0.0,0.0,0.0
18,19,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",0,Mrs.,Vander Planke,woman,2.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
858,859,"Baclini, Mrs. Solomon (Latifa Qurban)",1,Mrs.,Baclini,woman,4.0,1.0,1.0,1.0,1.0
863,864,"Sage, Miss. Dorothy Edith ""Dolly""",0,Miss.,Sage,woman,4.0,1.0,0.0,0.0,0.0
869,870,"Johnson, Master. Harold Theodor",1,Master.,Johnson,boy,3.0,1.0,1.0,1.0,1.0
885,886,"Rice, Mrs. William (Margaret Norton)",0,Mrs.,Rice,woman,5.0,1.0,0.0,0.0,0.0


In [23]:
survived = []
perished = []
mixed = []

for item in train_data['Surname']:
    if train_data[(train_data.Surname==item) & (train_data.Group==1)]['Survived'].mean() == 1 and item not in survived:
        survived.append(item)
    if train_data[(train_data.Surname==item) & (train_data.Group==1)]['Survived'].mean() == 0 and item not in perished:
        perished.append(item)
    if train_data[(train_data.Surname==item) & (train_data.Group==1)]['Survived'].mean() > 0 and train_data[(train_data.Surname==item) & (train_data.Group==1)]['Survived'].mean() < 1 and item not in mixed:
        mixed.append(item)
        
survived.sort()
perished.sort()
mixed.sort()

In [24]:
#These woman-child-groups all survived
print(survived)

['Baclini', 'Becker', 'Brown', 'Caldwell', 'Collyer', 'Coutts', 'Doling', 'Fortune', 'Goldsmith', 'Graham', 'Hamalainen', 'Harper', 'Hart', 'Hays', 'Herman', 'Hippach', 'Johnson', 'Kelly', 'Laroche', 'Mellinger', 'Moor', 'Moubarek', 'Murphy', 'Navratil', 'Newell', 'Nicola-Yarred', 'Peter', 'Quick', 'Richards', 'Ryerson', 'Sandstrom', 'Taussig', 'West', 'Wick']


In [25]:
#These woman-child-groups all perished
print(perished)

['Barbara', 'Boulos', 'Bourke', 'Ford', 'Goodwin', 'Jussila', 'Lefebre', 'Palsson', 'Panula', 'Rice', 'Sage', 'Skoog', 'Strom', 'Van Impe', 'Vander Planke', 'Zabour']


In [26]:
#These woman-child-groups had mixed survival
print(mixed)

['Allison', 'Andersson', 'Asplund', 'Carter']


The above code shows that 124 out of 142 "woman-child-group" passengers were part of "woman-child-groups" that either entirely lived or died. And 18 out of the 142 were part of "woman-child-groups" with mixed survival. Among these 18 passengers, 14 lived or died according to their "woman-child-groups'" average fate and 4 did not. Therefore among all 142 "woman-child-group" passengers, 97.2% = 138/142 lived or died according to their "woman-child-groups'" average fate. Our new engineered feature is a near perfect predictor! Wow!

In [27]:
for item in survived:
    train_data.loc[(train_data.Surname==item) & (train_data.Group==1), 'SurnameSurvival'] = 1

In [28]:
for item in perished:
    train_data.loc[(train_data.Surname==item) & (train_data.Group==1), 'SurnameSurvival'] = 0

In [29]:
for item in mixed:
    train_data.loc[(train_data.Surname==item) & (train_data.Group==1), 'SurnameSurvival'] = train_data.loc[(train_data.Surname==item) & (train_data.Group==1)]['Survived'].mean()

Improvise AdjustedSurvival by rounding the figures

In [30]:
#Adjust survival rates for use on training dataset
train_data['AdjustedSurvival'] = (train_data.SurnameSurvival * train_data.SurnameFreq - train_data.Survived) / (train_data.SurnameFreq-1)
# apply gender model plus new predictor to training set
train_data.predict = 0
train_data.loc[(train_data.Category=='woman'), 'predict'] = 1
train_data.loc[(train_data.Category!='woman'), 'predict'] = 0
train_data.loc[(train_data.Category=='boy') & (train_data.AdjustedSurvival==1), 'predict'] = 1
train_data.loc[(train_data.Category=='woman') & (train_data.AdjustedSurvival==0), 'predict'] = 0

In [31]:
#predictions changed (from simple gender predictions) for 16 males
train_data[(train_data.Category=='boy')]['predict'].sum()

16.0

In [32]:
#predictions changed for 36 females
sum(train_data[(train_data.Category=='woman')]['predict']==0)

36

### Predictions on test dataset

In [33]:
#I'll skip the cross-validation part
#By the way how does our test_data look so far
test_data.head()

Unnamed: 0,PassengerId,Name,Title,Surname,Category
0,892,"Kelly, Mr. James",Mr.,Kelly,man
1,893,"Wilkes, Mrs. James (Ellen Needs)",Mrs.,Wilkes,woman
2,894,"Myles, Mr. Thomas Francis",Mr.,Myles,man
3,895,"Wirz, Mr. Albert",Mr.,Wirz,man
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",Mrs.,Hirvonen,woman


In [34]:
#We've already extracted Title, Surname and Category in the test_data
combined_data = pd.concat([train_data, test_data], axis=0)

In [35]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       1309 non-null   int64  
 1   Name              1309 non-null   object 
 2   Survived          891 non-null    float64
 3   Title             1309 non-null   object 
 4   Surname           1309 non-null   object 
 5   Category          1308 non-null   object 
 6   SurnameFreq       354 non-null    float64
 7   Group             142 non-null    float64
 8   SurnameSurvival   142 non-null    float64
 9   AdjustedSurvival  142 non-null    float64
 10  predict           891 non-null    float64
dtypes: float64(6), int64(1), object(4)
memory usage: 122.7+ KB


In [36]:
#Ironically, there was a null value in Category in the training dataset
combined_data.loc[(combined_data.Category.isnull()), 'Category']='man'

In [37]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       1309 non-null   int64  
 1   Name              1309 non-null   object 
 2   Survived          891 non-null    float64
 3   Title             1309 non-null   object 
 4   Surname           1309 non-null   object 
 5   Category          1309 non-null   object 
 6   SurnameFreq       354 non-null    float64
 7   Group             142 non-null    float64
 8   SurnameSurvival   142 non-null    float64
 9   AdjustedSurvival  142 non-null    float64
 10  predict           891 non-null    float64
dtypes: float64(6), int64(1), object(4)
memory usage: 122.7+ KB


In [38]:
# engineer 'woman-child-groups' for entire dataset
combined_data.SurnameFreq = None
combined_data.Group = None
combined_data.SurnameSurvival = None
combined_data.AdjustedSurvival = None

In [39]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       1309 non-null   int64  
 1   Name              1309 non-null   object 
 2   Survived          891 non-null    float64
 3   Title             1309 non-null   object 
 4   Surname           1309 non-null   object 
 5   Category          1309 non-null   object 
 6   SurnameFreq       0 non-null      object 
 7   Group             0 non-null      object 
 8   SurnameSurvival   0 non-null      object 
 9   AdjustedSurvival  0 non-null      object 
 10  predict           891 non-null    float64
dtypes: float64(2), int64(1), object(8)
memory usage: 122.7+ KB


In [40]:
temp = combined_data[(combined_data.Category!='man')]
temp.head()

Unnamed: 0,PassengerId,Name,Survived,Title,Surname,Category,SurnameFreq,Group,SurnameSurvival,AdjustedSurvival,predict
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1.0,Mrs.,Cumings,woman,,,,,1.0
2,3,"Heikkinen, Miss. Laina",1.0,Miss.,Heikkinen,woman,,,,,1.0
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1.0,Mrs.,Futrelle,woman,,,,,1.0
7,8,"Palsson, Master. Gosta Leonard",0.0,Master.,Palsson,boy,,,,,0.0
8,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1.0,Mrs.,Johnson,woman,,,,,1.0


In [41]:
SurnameDict = temp.groupby('Surname')['PassengerId'].count().to_dict()
SurnameDict

{'Abbott': 2,
 'Abelseth': 1,
 'Abelson': 1,
 'Abrahim': 1,
 'Ahlin': 1,
 'Aks': 2,
 'Allen': 1,
 'Allison': 3,
 'Andersen-Jensen': 1,
 'Andersson': 8,
 'Andrews': 1,
 'Angle': 1,
 'Appleton': 1,
 'Arnold-Franchi': 1,
 'Asplund': 6,
 'Assaf Khalil': 1,
 'Astor': 1,
 'Attalah': 1,
 'Aubart': 1,
 'Ayoub': 1,
 'Backstrom': 1,
 'Baclini': 4,
 'Badman': 1,
 'Ball': 1,
 'Barbara': 2,
 'Barber': 1,
 'Barry': 1,
 'Baxter': 1,
 'Bazzani': 1,
 'Beane': 1,
 'Becker': 4,
 'Beckwith': 1,
 'Bentham': 1,
 'Betros': 1,
 'Bidois': 1,
 'Bird': 1,
 'Bishop': 1,
 'Bissette': 1,
 'Bonnell': 2,
 'Boulos': 3,
 'Bourke': 2,
 'Bowen': 1,
 'Bowerman': 1,
 'Bradley': 1,
 'Braf': 1,
 'Brown': 5,
 'Bryhl': 1,
 'Buckley': 1,
 'Bucknell': 1,
 'Burns': 2,
 'Buss': 1,
 'Bystrom': 1,
 'Cacic': 2,
 'Caldwell': 2,
 'Cameron': 1,
 'Canavan': 1,
 'Candee': 1,
 'Caram': 1,
 'Cardeza': 1,
 'Carr': 2,
 'Carter': 4,
 'Cassebeer': 1,
 'Cavendish': 1,
 'Chaffee': 1,
 'Chambers': 1,
 'Chapman': 1,
 'Chaudanson': 1,
 'Cherry': 1,


In [42]:
combined_data.reset_index(inplace=True)

In [43]:
combined_data.loc[(combined_data.Category!='man'), 'SurnameFreq'] = combined_data.Surname.map(SurnameDict)

In [44]:
combined_data.loc[(combined_data.SurnameFreq>1), 'Group'] = 1

In [45]:
#There are 263 woman-child-groups in the total dataset
combined_data[(combined_data.Group==1)]

Unnamed: 0,index,PassengerId,Name,Survived,Title,Surname,Category,SurnameFreq,Group,SurnameSurvival,AdjustedSurvival,predict
7,7,8,"Palsson, Master. Gosta Leonard",0.0,Master.,Palsson,boy,5.0,1,,,0.0
8,8,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1.0,Mrs.,Johnson,woman,3.0,1,,,1.0
10,10,11,"Sandstrom, Miss. Marguerite Rut",1.0,Miss.,Sandstrom,woman,3.0,1,,,1.0
11,11,12,"Bonnell, Miss. Elizabeth",1.0,Miss.,Bonnell,woman,2.0,1,,,1.0
16,16,17,"Rice, Master. Eugene",0.0,Master.,Rice,boy,6.0,1,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1291,400,1292,"Bonnell, Miss. Caroline",,Miss.,Bonnell,woman,2.0,1,,,
1293,402,1294,"Gibson, Miss. Dorothy Winifred",,Miss.,Gibson,woman,2.0,1,,,
1300,409,1301,"Peacock, Miss. Treasteall",,Miss.,Peacock,woman,3.0,1,,,
1302,411,1303,"Minahan, Mrs. William Edward (Lillian E Thorpe)",,Mrs.,Minahan,woman,2.0,1,,,


In [46]:
combined_data[(combined_data.Surname=='Rice')]

Unnamed: 0,index,PassengerId,Name,Survived,Title,Surname,Category,SurnameFreq,Group,SurnameSurvival,AdjustedSurvival,predict
16,16,17,"Rice, Master. Eugene",0.0,Master.,Rice,boy,6.0,1,,,0.0
171,171,172,"Rice, Master. Arthur",0.0,Master.,Rice,boy,6.0,1,,,0.0
278,278,279,"Rice, Master. Eric",0.0,Master.,Rice,boy,6.0,1,,,0.0
787,787,788,"Rice, Master. George Hugh",0.0,Master.,Rice,boy,6.0,1,,,0.0
885,885,886,"Rice, Mrs. William (Margaret Norton)",0.0,Mrs.,Rice,woman,6.0,1,,,0.0
946,55,947,"Rice, Master. Albert",,Master.,Rice,boy,6.0,1,,,


In [47]:
#Improvise by rounding the surname survival figures

In [48]:
for item in combined_data['Surname']:
    combined_data.loc[(combined_data.Surname==item) & (combined_data.Group==1), 'SurnameSurvival'] = train_data.loc[(train_data.Surname==item) & (train_data.Group==1)]['Survived'].mean()

In [49]:
combined_data[(combined_data.SurnameSurvival.isnull()==False)]

Unnamed: 0,index,PassengerId,Name,Survived,Title,Surname,Category,SurnameFreq,Group,SurnameSurvival,AdjustedSurvival,predict
7,7,8,"Palsson, Master. Gosta Leonard",0.0,Master.,Palsson,boy,5.0,1,0.0,,0.0
8,8,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1.0,Mrs.,Johnson,woman,3.0,1,1.0,,1.0
10,10,11,"Sandstrom, Miss. Marguerite Rut",1.0,Miss.,Sandstrom,woman,3.0,1,1.0,,1.0
16,16,17,"Rice, Master. Eugene",0.0,Master.,Rice,boy,6.0,1,0.0,,0.0
18,18,19,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",0.0,Mrs.,Vander Planke,woman,2.0,1,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1256,365,1257,"Sage, Mrs. John (Annie Bullen)",,Mrs.,Sage,woman,7.0,1,0.0,,
1270,379,1271,"Asplund, Master. Carl Edgar",,Master.,Asplund,boy,6.0,1,0.75,,
1276,385,1277,"Herman, Miss. Kate",,Miss.,Herman,woman,3.0,1,1.0,,
1280,389,1281,"Palsson, Master. Paul Folke",,Master.,Palsson,boy,5.0,1,0.0,,


In [50]:
#apply gender model plus new predictor to test dataset
combined_data.predict = 0
combined_data.loc[(combined_data.Category=='woman'), 'predict'] = 1
combined_data.loc[(combined_data.Category=='boy') & (combined_data.SurnameSurvival==1), 'predict'] = 1
combined_data.loc[(combined_data.Category=='woman') & (combined_data.SurnameSurvival==0), 'predict'] = 0

In [51]:
#Submitting to Kaggle
submission = combined_data[891:1309][['PassengerId', 'predict']]
submission.rename(columns = {'PassengerId': 'PassengerId', 'predict': 'Survived'}, inplace=True)
submission.to_csv('titanic_name.csv', index=False)