# Initial Data Exploration

This notebook was used to explore variables in the dataset and make necessary modifications. 

Once complete, the inital data download and all useful modifications were combined in a function and included in the accompanying dataworkflow package.

# Setup Data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
path = '/Users/jco/Desktop/data_science/Udacity/project_2/Titanic/train.csv'

titanic = pd.read_csv(path)

# Explore Data

__Dimension/Type__

In [3]:
titanic.shape

(891, 12)

In [4]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


__Sample__

In [5]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


__Summary Stats__

In [6]:
#Numerical Summary
titanic.describe() 

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
#Categorical Summary
cat_data = titanic.dtypes[titanic.dtypes == 'object'].index

titanic[cat_data].describe()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Cor, Mr. Liudevit",male,CA. 2343,C23 C25 C27,S
freq,1,577,7,4,644


# Evaluate Variables

__PassangerID__

* Full set of data (891)
* Unique numerical values for every passenger
* Does not appear relevant for analysis 

_Decision_: __Delete__

In [8]:
titanic['PassengerId'].describe()

count    891.000000
mean     446.000000
std      257.353842
min        1.000000
25%      223.500000
50%      446.000000
75%      668.500000
max      891.000000
Name: PassengerId, dtype: float64

__Survived__

* Full set of data (891)
* Categorical values stored as numbers - 1(lived), 0(Died)
* Key field for analysis

Decision: __Modify__ convert to category, replace nums with text (Died, Lived)

In [9]:
titanic['Survived'].describe()

count    891.000000
mean       0.383838
std        0.486592
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: Survived, dtype: float64

__Pclass__

* Full set of data
* Categorical Data stored as numerical 1=1st Class, 2=2nd Class, 3=3rd Class
* Interesting variable for analysis

Decision: __Modify__ convert to ordered category, replace nums with text (1st, 2nd, 3rd)

In [10]:
titanic['Pclass'].describe()

count    891.000000
mean       2.308642
std        0.836071
min        1.000000
25%        2.000000
50%        3.000000
75%        3.000000
max        3.000000
Name: Pclass, dtype: float64

__Name__

* No missing values
* All unique 
* Seperate variables exist for identifying passangers w/ family members
* Does not appear relevant for analysis 

Decision: __Delete__

In [11]:
titanic['Name'].describe()

count                   891
unique                  891
top       Cor, Mr. Liudevit
freq                      1
Name: Name, dtype: object

__Sex__

* No missing values
* Stored as object vs. category
* Interesting variable for analysis

Decision: __Modify__ keep values, convert dtype to category

In [12]:
titanic['Sex'].describe()

count      891
unique       2
top       male
freq       577
Name: Sex, dtype: object

In [13]:
titanic['Sex'].dtype

dtype('O')

__Age__

* Missing data (177, 714/891)
* Contains estimates for ages under 1 
* interesting var, can also be used to create alternate interesting vars

Decision: __Transform__ keep current vals, keep missing data empyt for now, Create new variable for Child, Adults(>18), and Unknown 

In [14]:
titanic['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

__SibSp__

* Variable for # of sibling or spouses aboard
* No missing values
* Would be interesting to use for identification of passengers with family

Decision: __Merge__ combine with Parch to create new variables identifying passengers traveling with family and how many

In [15]:
titanic['SibSp'].describe()

count    891.000000
mean       0.523008
std        1.102743
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: SibSp, dtype: float64

__Parch__

* variable for # of parents or childeren aboard
* no missing values
* Would be interesting to use for identification of passengers w/ fam

Decision: __Merge__ combine with SibSp to create new var identifying passengers w/ fam & count

In [16]:
titanic['Parch'].describe()

count    891.000000
mean       0.381594
std        0.806057
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: Parch, dtype: float64

__Ticket__

* No missing values
* Mix of unique and duplicates (681 uni, 310 dupes)
* Mix of alphanumeric and numeric values
* Does not appear relevant for analysis 

Decision: __Delete__

In [17]:
titanic['Ticket'].describe()

count          891
unique         681
top       CA. 2343
freq             7
Name: Ticket, dtype: object

In [18]:
titanic['Ticket'].head(15)

0            A/5 21171
1             PC 17599
2     STON/O2. 3101282
3               113803
4               373450
5               330877
6                17463
7               349909
8               347742
9               237736
10             PP 9549
11              113783
12           A/5. 2151
13              347082
14              350406
Name: Ticket, dtype: object

__Fare__

* No missing values 
* Relevent for analysis 

Decision: __Keep__

In [19]:
titanic['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

__Cabin__

* Missing large number of values (687, 204/891)
* Mix of unique(147) and duplicated(57) data
* Appears to be too little data for analysis

Decision: __Delete__

In [20]:
titanic['Cabin'].describe()

count             204
unique            147
top       C23 C25 C27
freq                4
Name: Cabin, dtype: object

In [21]:
titanic['Cabin'].head(15)

0      NaN
1      C85
2      NaN
3     C123
4      NaN
5      NaN
6      E46
7      NaN
8      NaN
9      NaN
10      G6
11    C103
12     NaN
13     NaN
14     NaN
Name: Cabin, dtype: object

__Embarked__

* Missing 2 values
* Categorical stored as Object
* Useful for analysis

Decision: __Modify__ keep values, update dtype to category

In [22]:
titanic['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [23]:
titanic['Embarked'].dtype

dtype('O')

# Update/Add Variables

__Numerical Category Data__

In [24]:
#Survived
new_survived = pd.Categorical(titanic['Survived'])
new_survived = new_survived.rename_categories(['Died', 'Lived'])

new_survived.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,549,0.616162
Lived,342,0.383838


In [25]:
#Pclass
new_pclass = pd.Categorical(titanic['Pclass'], ordered=True)
new_pclass = new_pclass.rename_categories(['1st','2nd','3rd'])

new_pclass.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
1st,216,0.242424
2nd,184,0.20651
3rd,491,0.551066


In [26]:
#Replace with categorical data
titanic['Survived'] = new_survived
titanic['Pclass'] = new_pclass

#sanity check
print (titanic['Survived'].dtype, titanic['Pclass'].dtype)

category category


__Update dtype__

In [27]:
titanic['Sex'] = pd.Series(titanic['Sex'], dtype='category')
titanic['Embarked'] = pd.Series(titanic['Embarked'], dtype='category')

#sanity check
print(titanic['Sex'].dtype, titanic['Embarked'].dtype)

category category


In [28]:
titanic['Sex'].unique()
titanic['Sex'].cat.categories

Index(['female', 'male'], dtype='object')

__Create New Variables__

In [29]:
#Family Count
titanic['FamilyTot'] = titanic['SibSp'] + titanic['Parch']

In [30]:
#Single or Family Variable
bins = [-1,0,np.inf]
labels = ['Single','Family']
fam_status = pd.cut(titanic['FamilyTot'], bins, labels=labels)

fam_status.describe() #sanity check

count        891
unique         2
top       Single
freq         537
Name: FamilyTot, dtype: object

In [31]:
#Add FamStatus
titanic['FamStatus'] = fam_status

In [32]:
#Adult (>18) or Child (<18) Var
bins = [0, 17, 1000, np.inf]
labels = ['Child', 'Adult', 'Unknown']
age_groups = pd.cut(titanic['Age'], bins, labels=labels) 

age_groups.describe() #sanity check

count       714
unique        2
top       Adult
freq        601
Name: Age, dtype: object

In [33]:
#Add age_groups
titanic['age_group'] = age_groups

In [34]:
titanic.head() #sanity check

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilyTot,FamStatus,age_group
0,1,Died,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,Family,Adult
1,2,Lived,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Family,Adult
2,3,Lived,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,Single,Adult
3,4,Lived,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,Family,Adult
4,5,Died,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,Single,Adult


# Delete Variables

In [35]:
# Remove PassengerId, Name, Ticket, Cabin

titanic.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [36]:
titanic.head() #sanity

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilyTot,FamStatus,age_group
0,Died,3rd,male,22.0,1,0,7.25,S,1,Family,Adult
1,Lived,1st,female,38.0,1,0,71.2833,C,1,Family,Adult
2,Lived,3rd,female,26.0,0,0,7.925,S,0,Single,Adult
3,Lived,1st,female,35.0,1,0,53.1,S,1,Family,Adult
4,Died,3rd,male,35.0,0,0,8.05,S,0,Single,Adult
