# Titanic Data

## Summary Statistics

In [None]:
%matplotlib inline

#Importing Modules
from pandas import DataFrame, Series
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the Titanic training set into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('../datasets/titanic/train.csv') 


In [None]:
# Data diagnostics

# It is a good idea to start with a question about what might affect the target variable you are trying to predict.
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Summary (count, unique, top, freq) restricted to object-dtype columns.
df.describe(include=['O'])

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Mean of the Survived column.
# NOTE(review): presumably Survived is coded 0/1, making this the overall survival rate — confirm.
df['Survived'].mean()

In [None]:
# groupby splits the frame by the selected variable so a statistic can be computed per group;
# here: mean of Survived for each value of Sex.
df.groupby('Sex')['Survived'].mean() #Groupby groups dataframe with selected variables so you can perform statistics on each group

In [None]:
# Number of rows in each Sex group (size counts rows, including any missing values).
df.groupby('Sex')['Survived'].size() #Size counts the number in each group

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# Non-numeric columns are dropped explicitly: pandas >= 2.0 no longer skips them
# silently and raises ValueError when it meets string data.
df.select_dtypes(include=[np.number]).corr()

In [None]:
# Mean of Survived for each value of Pclass.
df.groupby('Pclass')['Survived'].mean()

In [None]:
# Categorize the numerical Fare column into bins so it can be used as a groupby key.
# pd.cut builds right-closed intervals, so without include_lowest=True a fare of
# exactly 0 would fall outside the first bin and become NaN (and then be dropped
# from every groupby on Fare_bins).
df['Fare_bins'] = pd.cut(df['Fare'], bins=[0, 20, 50, 80, 1000], include_lowest=True)

# Mean of Survived within each fare bin. observed=False keeps every bin in the
# result (the long-standing default) and avoids the deprecation warning for
# categorical groupers.
df.groupby('Fare_bins', observed=False)['Survived'].mean()

In [None]:
# Mean of Survived for every Sex x fare-bin combination.
# Fare_bins is categorical; observed=False states the historical default explicitly
# (keep all category combinations) instead of relying on a deprecated implicit default.
df.groupby(['Sex', 'Fare_bins'], observed=False)['Survived'].mean()

In [None]:
# Several statistics at once for every Sex x fare-bin combination:
# mean of Survived, group size, and sample standard deviation.
# String names are used instead of np.mean/np.size/np.std: passing numpy callables
# to agg is deprecated, and the strings select the same pandas implementations.
df.groupby(['Sex', 'Fare_bins'], observed=False)['Survived'].agg(['mean', 'size', 'std'])

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
def countInfs(series):
    """Count the infinite values (+inf or -inf) in a single column.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect.

    Returns
    -------
    int
        Number of entries equal to +inf or -inf. Non-numeric columns always
        yield 0. NaN and large-but-finite values are not counted.
    """
    # Any numeric dtype qualifies (float32, int32, ...), not only int64/float64.
    if not pd.api.types.is_numeric_dtype(series):
        return 0
    # Test for actual infinities rather than a magnitude threshold, and use the
    # vectorized sum instead of Python's element-by-element builtin sum.
    return int(np.isinf(series.astype('float64')).sum())

In [None]:
# Infinite-value count for every numeric column (countInfs is applied column by column).
df.select_dtypes(include='number').apply(countInfs)

In [None]:
# Frequency table: how many rows carry each distinct SibSp value, most frequent first.
df['SibSp'].value_counts() #Tabulates counts of each unique value

## Data Visualizations

In [None]:
# Histogram of Age over all passengers, drawn through the Axes that pandas returns.
age_ax = df['Age'].hist(bins=20)
age_ax.set_title('Distribution of All Ages')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Counts')


In [None]:
# Overlaid, semi-transparent age histograms, one per value of Sex.
# Each histogram is labelled as it is drawn. Calling plt.legend(labels=[...])
# without handles pairs the labels with the first artists on the axes, which for
# histograms are two bars of the *same* group, so both legend entries would show
# the same colour.
fig, ax = plt.subplots()
for sex, ages in df.groupby('Sex')['Age']:
    ages.hist(bins=20, alpha=0.5, ax=ax, label=sex.capitalize())
ax.legend()
ax.set_xlabel('Age')
ax.set_ylabel('Counts')
ax.set_title('Distribution of Ages by Female and Male')

In [None]:
# Density curve of Age for each Sex group, drawn on the same axes.
age_by_sex = df.groupby('Sex')['Age']
age_by_sex.plot(kind='density')
# groupby iterates its keys in sorted order, so the first curve is the female group.
plt.legend(labels=['Female','Male'])
plt.title('Distribution of Ages by Female and Male')

In [None]:
# Scatterplot of Fare against Age, coloured by passenger class.
colors = ['blue','green','yellow']  # index 0 -> Pclass 1, 1 -> Pclass 2, 2 -> Pclass 3

# Take x, y and the colours from the SAME subset of rows. Previously the colours
# were built only from rows with a known Age while x/y used every row, so the
# colour sequence was shorter than the data and matplotlib rejects the call.
known_age = df[df['Age'].notnull()]
point_colors = [colors[pclass - 1] for pclass in known_age['Pclass']]

plt.scatter(known_age['Age'], known_age['Fare'], c=point_colors, alpha=0.5)
plt.xlabel('Age')
plt.ylabel('Fare Price')
plt.title('Scatterplot of Fare Price vs. Age Colored by Class')


In [None]:
# Subplots: two stacked bar charts keyed by port of embarkation.
fig, axes = plt.subplots(2,1)

# Top panel: number of passengers per Embarked value.
df.Embarked.value_counts().plot(ax=axes[0],kind='bar')
axes[0].set_title("Number of Passengers per Location")
axes[0].set_ylabel("Counts")

# Bottom panel: mean Age per Embarked value. The y-axis shows an age, not a
# proportion, so it is labelled accordingly.
df.groupby('Embarked')['Age'].mean().plot(ax=axes[1],kind='bar')
axes[1].set_title("Mean Age per Location")
axes[1].set_xlabel("Location")
axes[1].set_ylabel("Mean Age")

# Keep the lower title from colliding with the upper panel's tick labels.
fig.tight_layout()

## Prettier Data Visualizations - Seaborn

In [None]:
!pip install seaborn

In [None]:
import seaborn as sns
# If seaborn is missing, install it into the kernel's environment with `%pip install seaborn`.

# Apply seaborn's "white" theme; the same matplotlib/pandas plotting code below
# then picks up the new styling.
sns.set_style("white")

fig, axes = plt.subplots(2,1)

# Top panel: number of passengers per Embarked value.
df.Embarked.value_counts().plot(ax=axes[0],kind='bar')
axes[0].set_title("Number of Passengers per Location")
axes[0].set_ylabel("Counts")

# Bottom panel: mean Age per Embarked value. The y-axis shows an age, not a
# proportion, so it is labelled accordingly.
df.groupby('Embarked')['Age'].mean().plot(ax=axes[1],kind='bar')
axes[1].set_title("Mean Age per Location")
axes[1].set_xlabel("Location")
axes[1].set_ylabel("Mean Age")

# Keep the lower title from colliding with the upper panel's tick labels.
fig.tight_layout()

In [None]:
# Jittered strip plot of Age per Embarked value, split by Sex.
# stripplot returns the matplotlib Axes it drew on; the title is set through it
# because the old `sns.plt` alias was removed from seaborn and raises AttributeError.
strip_ax = sns.stripplot(x="Embarked", y="Age", hue='Sex', data=df, jitter=True)
strip_ax.set_title("Strip Plot with Seaborn");


## Visualizing Missing Data

In [None]:
!pip install missingno

In [None]:
# missingno is imported here, after the install cell above has made it available.
import missingno as msno
# Draw missingno's matrix view of df to visualize where values are missing.
msno.matrix(df)