## Data Wrangling

In [1]:
import pandas as pd

# Remote location of the Titanic passenger CSV.
url = 'https://tinyurl.com/titanic-csv'

# Download the CSV and parse it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

In [2]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


In [3]:
# Build the small example frame in one constructor call. `columns` pins the
# column order explicitly instead of relying on dict ordering.
dataframe = pd.DataFrame(
    {
        'Name': ['Matt', 'Paige'],
        'Age': [32, 30],
        'Ninja': [True, False],
    },
    columns=['Name', 'Age', 'Ninja'],
)

# Display the frame.
dataframe

Unnamed: 0,Name,Age,Ninja
0,Matt,32,True
1,Paige,30,False


In [4]:
# The new row, as a Series keyed by the frame's column names.
new_member = pd.Series(['Nelson', 31, False], index=['Name', 'Age', 'Ninja'])

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# its replacement. The Series is wrapped in a one-row DataFrame so it is stacked
# as a row, and ignore_index=True renumbers the result 0..n-1 as append did.
# Like append, this returns a new frame and leaves `dataframe` unchanged.
pd.concat([dataframe, pd.DataFrame([new_member])], ignore_index=True)

Unnamed: 0,Name,Age,Ninja
0,Matt,32,True
1,Paige,30,False
2,Nelson,31,False


In [5]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(1313, 6)

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max); the output
# below covers the Age, Survived and SexCode columns.
df.describe()

Unnamed: 0,Age,Survived,SexCode
count,756.0,1313.0,1313.0
mean,30.397989,0.342727,0.351866
std,14.259049,0.474802,0.477734
min,0.17,0.0,0.0
25%,21.0,0.0,0.0
50%,28.0,0.0,0.0
75%,39.0,1.0,1.0
max,71.0,1.0,1.0


In [7]:
# Positional slice: rows at positions 1 through 3 (the end bound, 4, is excluded).
df.iloc[1:4]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


In [8]:
# Select every passenger aged 22 by using Age as the index and looking the value
# up by label. The re-indexed frame is chained rather than assigned back to `df`,
# so the frame displayed by the cells above is not altered by running this cell.
# The Age column itself is kept because the index is built from the Series.
age_22 = df.set_index(df['Age']).loc[22]

In [9]:
# Preview the first five passengers of the age-22 selection.
age_22.head(n=5)

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22.0,"Bowerman, Miss Elsie Edith",1st,22.0,female,1,1
22.0,"Frolicher, Miss Marguerite",1st,22.0,female,1,1
22.0,"Gibson, Miss Dorothy",1st,22.0,female,1,1
22.0,"Ostby, Miss Helen Raghnild",1st,22.0,female,1,1
22.0,"Payne, Mr Vivian Ponsonby",1st,22.0,male,0,0


In [10]:
# Re-index the age-22 selection by its Sex values. From here on `df` refers to
# this subset, not to the full data set.
df = age_22.set_index(keys=age_22['Sex'])

# Label lookup on the new index: every row whose index value is 'female'.
female = df.loc['female']

In [11]:
# Display the rows selected under the 'female' index label.
female

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,"Bowerman, Miss Elsie Edith",1st,22.0,female,1,1
female,"Frolicher, Miss Marguerite",1st,22.0,female,1,1
female,"Gibson, Miss Dorothy",1st,22.0,female,1,1
female,"Ostby, Miss Helen Raghnild",1st,22.0,female,1,1
female,"Cook, Mrs Selena Rogers",2nd,22.0,female,0,1
female,"del Carlo, Mrs Sebastiano (Argenia Genovese)",2nd,22.0,female,1,1
female,"Karnes, Mrs J Frank (Claire Bennett)",2nd,22.0,female,0,1
female,"LaRoche, Mrs Joseph (Juliet)",2nd,22.0,female,1,1
female,"Connolly, Miss Kate",3rd,22.0,female,1,1
female,"Dahlberg, Miss Gerda Ulrika",3rd,22.0,female,0,1


In [12]:
# Rows that satisfy all three conditions: male, aged 22, and Survived == 0.
df.loc[
    (df['Sex'] == 'male')
    & (df['Age'] == 22)
    & (df['Survived'] == 0)
]

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
male,"Payne, Mr Vivian Ponsonby",1st,22.0,male,0,0
male,"Giles, Mr Ralph",2nd,22.0,male,0,0
male,"Smith (Schmidt), Mr Augustus",2nd,22.0,male,0,0
male,"Troupiansky, Mr Moses Aaron",2nd,22.0,male,0,0
male,"Waelens, Mr Achille",2nd,22.0,male,0,0
male,"Barton, Mr David",3rd,22.0,male,0,0
male,"Berglund, Mr Karl Ivar Sven",3rd,22.0,male,0,0
male,"Braund, Mr Owen Harris",3rd,22.0,male,0,0
male,"Brobek, Mr Karl Rudolf",3rd,22.0,male,0,0
male,"Davies, Mr Evan",3rd,22.0,male,0,0


In [13]:
# Map each Sex value to a display label; returns a new Series, `df` is untouched.
df['Sex'].replace({'female': 'Woman', 'male': 'Man'}).head(n=5)

Sex
female    Woman
female    Woman
female    Woman
female    Woman
male        Man
Name: Sex, dtype: object

In [14]:
# Show the frame with the PClass column relabelled; rename returns a new frame,
# so `df` keeps its original column name.
column_labels = {'PClass': 'Passenger Class'}
df.rename(columns=column_labels).head(n=5)

Unnamed: 0_level_0,Name,Passenger Class,Age,Sex,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,"Bowerman, Miss Elsie Edith",1st,22.0,female,1,1
female,"Frolicher, Miss Marguerite",1st,22.0,female,1,1
female,"Gibson, Miss Dorothy",1st,22.0,female,1,1
female,"Ostby, Miss Helen Raghnild",1st,22.0,female,1,1
male,"Payne, Mr Vivian Ponsonby",1st,22.0,male,0,0


In [15]:
# Spell out the passenger class using regular-expression replacement.
# The nested dict restricts the substitution to the PClass column, and the ^...$
# anchors require the whole cell to match. Without both, the patterns would be
# applied as substring replacements to every string column, including Name.
df.replace(
    {'PClass': {r'^1st$': 'First', r'^2nd$': 'Second', r'^3rd$': 'Third'}},
    regex=True,
).head(10)

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,"Bowerman, Miss Elsie Edith",First,22.0,female,1,1
female,"Frolicher, Miss Marguerite",First,22.0,female,1,1
female,"Gibson, Miss Dorothy",First,22.0,female,1,1
female,"Ostby, Miss Helen Raghnild",First,22.0,female,1,1
male,"Payne, Mr Vivian Ponsonby",First,22.0,male,0,0
female,"Cook, Mrs Selena Rogers",Second,22.0,female,0,1
female,"del Carlo, Mrs Sebastiano (Argenia Genovese)",Second,22.0,female,1,1
male,"Giles, Mr Ralph",Second,22.0,male,0,0
female,"Karnes, Mrs J Frank (Claire Bennett)",Second,22.0,female,0,1
female,"LaRoche, Mrs Joseph (Juliet)",Second,22.0,female,1,1


In [16]:
# `df` was reassigned to a filtered, re-indexed subset in the cells above, so
# the full data set is read again from the same remote CSV.
url = 'https://tinyurl.com/titanic-csv'

df = pd.read_csv(filepath_or_buffer=url)

In [17]:
# Summary statistics for the Age column.
# Each line is formatted into ONE string before printing. Passing the label and
# the value as two arguments rendered as a tuple, e.g. ('Maximum: ', 71.0), in the
# recorded output (print as a statement), and would insert a second space after
# the colon when print is a function.
print('Maximum: {}'.format(df['Age'].max()))
print('Minimum: {}'.format(df['Age'].min()))
print('Mean: {}'.format(df['Age'].mean()))
print('Count: {}'.format(df['Age'].count()))

('Maximum: ', 71.0)
('Minimum: ', 0.17)
('Mean: ', 30.397989417989415)
('Count: ', 756)


In [18]:
# Work on an independent (deep) copy so later steps cannot modify `df`.
data = df.copy(deep=True)

In [19]:
# Dimensions of the working copy as a (rows, columns) tuple.
data.shape

(1313, 6)

In [20]:
# Number of non-missing values in each column; a count below the row total
# (as for Age in the output below) means that column has missing entries.
data.count()

Name        1313
PClass      1313
Age          756
Sex         1313
Survived    1313
SexCode     1313
dtype: int64

In [21]:
# Distinct values that occur in the Sex column.
data['Sex'].unique()

array(['female', 'male'], dtype=object)

In [22]:
# Number of rows for each distinct Sex value, most frequent first.
data['Sex'].value_counts()

male      851
female    462
Name: Sex, dtype: int64

In [23]:
# Number of rows for each passenger class.
# NOTE(review): the output below includes one row whose PClass is '*' —
# presumably a placeholder for an unknown class; confirm before using these counts.
data['PClass'].value_counts()

3rd    711
1st    322
2nd    279
*        1
Name: PClass, dtype: int64

In [24]:
# Number of distinct PClass values. The value counts shown earlier list four
# entries, one of which is '*', so the result is 4 rather than 3.
data['PClass'].nunique()

4

In [25]:
# Preview the first five rows whose Age is missing.
data.loc[data['Age'].isnull()].head(n=5)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
12,"Aubert, Mrs Leontine Pauline",1st,,female,1,1
13,"Barkworth, Mr Algernon H",1st,,male,1,0
14,"Baumann, Mr John D",1st,,male,0,0
29,"Borebank, Mr John James",1st,,male,0,0
32,"Bradley, Mr George",1st,,male,1,0


In [26]:
# Show the frame without the SexCode column; drop returns a new frame, so
# `data` itself still has the column.
data.drop(labels='SexCode', axis='columns').head(n=5)

Unnamed: 0,Name,PClass,Age,Sex,Survived
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0
4,"Allison, Master Hudson Trevor",1st,0.92,male,1


In [28]:
# Preview the first five rows whose Sex is anything other than 'female'.
data.loc[data['Sex'].ne('female')].head(n=5)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0
5,"Anderson, Mr Harry",1st,47.0,male,1,0
7,"Andrews, Mr Thomas, jr",1st,39.0,male,0,0
9,"Artagaveytia, Mr Ramon",1st,71.0,male,0,0


In [29]:
# Remove rows that are exact duplicates across all columns, then preview five.
data.drop_duplicates().head(n=5)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


In [30]:
# One row per distinct Sex value: the first occurrence is kept, later ones dropped.
data.drop_duplicates(subset=['Sex'], keep='first')

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0


In [31]:
# One row per distinct Sex value; keep='last' retains the final occurrence of
# each value instead of the first.
data.drop_duplicates(subset=['Sex'], keep='last')

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1307,"Zabour, Miss Tamini",3rd,,female,0,1
1312,"Zimmerman, Leo",3rd,29.0,male,0,0


In [34]:
# Mean of every numeric column for each Sex value.
# numeric_only=True keeps the text columns (Name, PClass) out of the aggregation.
# pandas 2.0+ no longer drops such columns silently and raises a TypeError when
# asked to average them.
data.groupby('Sex').mean(numeric_only=True)

Unnamed: 0_level_0,Age,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,29.396424,0.666667,1.0
male,31.014338,0.166863,0.0


In [35]:
# Mean age for each (Sex, Survived) combination.
data.groupby(by=['Sex', 'Survived'])['Age'].mean()

Sex     Survived
female  0           24.901408
        1           30.867143
male    0           32.320780
        1           25.951875
Name: Age, dtype: float64

In [40]:
# Print the first five passenger names in upper case, one per line.
for name in data['Name'].iloc[:5]:
    shouted = name.upper()
    print(shouted)

ALLEN, MISS ELISABETH WALTON
ALLISON, MISS HELEN LORAINE
ALLISON, MR HUDSON JOSHUA CREIGHTON
ALLISON, MRS HUDSON JC (BESSIE WALDO DANIELS)
ALLISON, MASTER HUDSON TREVOR


In [53]:
# The first five passenger names as a plain Python list.
data['Name'].iloc[:5].tolist()

['Allen, Miss Elisabeth Walton',
 'Allison, Miss Helen Loraine',
 'Allison, Mr Hudson Joshua Creighton',
 'Allison, Mrs Hudson JC (Bessie Waldo Daniels)',
 'Allison, Master Hudson Trevor']

In [57]:
def upperCase(x):
    """Return ``x`` converted to upper case.

    ``x`` is any object that provides an ``upper()`` method (a string, in this
    notebook); whatever that method returns is passed straight back.
    """
    uppercased = x.upper()
    return uppercased

In [58]:
# Upper-case every passenger name with the helper, then show the first five.
data["Name"].apply(upperCase).iloc[:5]

0                     ALLEN, MISS ELISABETH WALTON
1                      ALLISON, MISS HELEN LORAINE
2              ALLISON, MR HUDSON JOSHUA CREIGHTON
3    ALLISON, MRS HUDSON JC (BESSIE WALDO DANIELS)
4                    ALLISON, MASTER HUDSON TREVOR
Name: Name, dtype: object

In [59]:
# For each Sex group, count the non-missing values in every column.
data.groupby('Sex').apply(pd.DataFrame.count)

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,462,462,288,462,462,462
male,851,851,468,851,851,851
