This exercise is based on the instructions in the pandas tutorial "How to manipulate textual data": https://pandas.pydata.org/docs/getting_started/intro_tutorials/10_text_data.html

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the Titanic passenger table from the CSV file in the working directory.
titanic = pd.read_csv(filepath_or_buffer='titanic.csv')

In [3]:
# Preview the first five rows of the freshly loaded frame.
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Show every passenger name in lowercase.
# The result is only displayed; the 'Name' column itself is not modified.
titanic.loc[:, 'Name'].str.lower()

0                                braund, mr. owen harris
1      cumings, mrs. john bradley (florence briggs th...
2                                 heikkinen, miss. laina
3           futrelle, mrs. jacques heath (lily may peel)
4                               allen, mr. william henry
                             ...                        
886                                montvila, rev. juozas
887                         graham, miss. margaret edith
888             johnston, miss. catherine helen "carrie"
889                                behr, mr. karl howell
890                                  dooley, mr. patrick
Name: Name, Length: 891, dtype: object

In [6]:
# Derive a "Surname" column: the part of each name that precedes the comma.
titanic['Surname'] = titanic['Name'].str.split(',').str[0]

In [7]:
# Preview the first five rows; the new 'Surname' column is appended last.
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Allen


In [8]:
# Reorder the columns so that 'Surname' sits directly before 'Name'.
# Every existing column is listed, so nothing is dropped.
titanic = titanic[[
    'PassengerId',
    'Survived',
    'Pclass',
    'Surname',
    'Name',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Ticket',
    'Fare',
    'Cabin',
    'Embarked',
]]

In [9]:
# Preview the first five rows to check the new column order.
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Surname,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Braund,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,Cumings,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Heikkinen,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,Futrelle,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,Allen,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Show the remainder of each name after the comma (title and given names).
# Display only; nothing is stored back on the frame.
titanic['Name'].str.split(',').str[1]

0                                  Mr. Owen Harris
1       Mrs. John Bradley (Florence Briggs Thayer)
2                                      Miss. Laina
3               Mrs. Jacques Heath (Lily May Peel)
4                                Mr. William Henry
                          ...                     
886                                    Rev. Juozas
887                           Miss. Margaret Edith
888                 Miss. Catherine Helen "Carrie"
889                                Mr. Karl Howell
890                                    Mr. Patrick
Name: Name, Length: 891, dtype: object

In [12]:
# Tally how many names do / do not contain "Countess";
# the True row is the number of countesses on board.
(
    titanic['Name']
    .str.contains('Countess')
    .value_counts()
)

False    890
True       1
Name: Name, dtype: int64

In [15]:
# Display the full record(s) of the passenger(s) whose name contains "Countess".
titanic.loc[titanic['Name'].str.contains('Countess')]

Unnamed: 0,PassengerId,Survived,Pclass,Surname,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
759,760,1,1,Rothes,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,0,0,110152,86.5,B77,S


In [17]:
# Which passenger has the longest name?
# idxmax() gives the row label of the greatest name length; look up that one cell.
titanic.at[titanic['Name'].str.len().idxmax(), 'Name']

'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'

In [18]:
# Which passenger has the shortest name?
# idxmin() gives the row label of the smallest name length; look up that one cell.
titanic.at[titanic['Name'].str.len().idxmin(), 'Name']

'Lam, Mr. Ali'

In [21]:
# Add a 'sex-short' column that abbreviates 'Sex': male -> M, female -> F.
# The whole replaced Series is assigned. Truncating it with .head() first
# would leave every row after the first five as NaN, because column
# assignment aligns on the index and fills missing labels with NaN.
titanic['sex-short'] = titanic['Sex'].replace({'male': 'M', 'female': 'F'})

In [22]:
# Preview the first five rows, including the 'sex-short' column.
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Surname,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,sex-short
0,1,0,3,Braund,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,M
1,2,1,1,Cumings,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,F
2,3,1,3,Heikkinen,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,F
3,4,1,1,Futrelle,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,F
4,5,0,3,Allen,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,M
