<a href="https://colab.research.google.com/github/jmarcano101/data110/blob/main/Week5_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Default title text
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the Titanic passenger table from its raw GitHub URL and preview the first rows.
titanic_csv_url = 'https://raw.githubusercontent.com/Reben80/Data110-32213/main/dataset/titanic.csv'
titanic_df = pd.read_csv(titanic_csv_url)
titanic_df.head()

In [None]:
# Baseline histogram of passenger ages; no bins argument, so matplotlib's default is used.
plt.hist(x=titanic_df['age'])
plt.show()

# Histograms depend on the chosen bin width

In [None]:
# Same age histogram, split into 60 narrow bins.
plt.hist(x=titanic_df['age'], bins=60)
plt.show()

In [None]:
# Same age histogram, split into 30 bins.
plt.hist(x=titanic_df['age'], bins=30)
plt.show()

In [None]:
# Same age histogram, split into 15 bins.
plt.hist(x=titanic_df['age'], bins=15)
plt.show()

In [None]:
# Same age histogram, split into only 5 wide bins.
plt.hist(x=titanic_df['age'], bins=5)
plt.show()

In [None]:
# Presentation-quality age histogram: ggplot theme, labelled axes, reduced x ticks.
# Note: plt.style.use changes matplotlib's global settings, so the theme also
# applies to figures created after this cell.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(titanic_df['age'], bins=15, color='skyblue', edgecolor='white')
ax.set_title('Age Distribution of Titanic Passengers')
ax.set_xlabel('Age(years)')
ax.set_ylabel('count')
ax.grid(axis='x')
ax.set_xticks([0, 20, 40, 60])
ax.tick_params(axis='x', colors='gray')  # gray ticks and tick labels on the x-axis
plt.show()


# Alternative to histogram: Kernel density estimate (KDE)

In [None]:
# Kernel density estimate of passenger ages with seaborn's default settings (outline only).
age_series = titanic_df['age']
sns.kdeplot(age_series)
plt.show()


In [None]:
# Filled KDE of passenger ages.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
sns.kdeplot(titanic_df['age'], fill=True)
plt.show()


In [None]:
# Filled, semi-transparent red KDE of passenger ages with title and axis labels.
plt.figure(figsize=(10, 6))
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
sns.kdeplot(titanic_df['age'], fill=True, color="red", alpha=0.7)
plt.title('Kernel Density Estimate of Age for Titanic Passengers')
plt.xlabel('Age')
plt.ylabel('Density')
plt.show()



Histograms show raw counts, whereas KDEs show proportions: the total area under a KDE curve equals 1 (counts vs. density).

In [None]:
# KDE with a very small bandwidth multiplier (bw_adjust=0.1): the curve follows
# the data closely and looks jagged.
plt.figure(figsize=(10, 6))
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
sns.kdeplot(titanic_df['age'], bw_adjust=0.1, fill=True, color="red", alpha=1)
plt.title('Kernel Density Estimate of Age for Titanic Passengers')
plt.xlabel('Age')
plt.ylabel('Density')
plt.show()


In [None]:
# KDE with bw_adjust=0.4: a larger bandwidth multiplier than 0.1, giving a smoother curve.
plt.figure(figsize=(10, 6))
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
sns.kdeplot(titanic_df['age'], bw_adjust=0.4, fill=True, color="red", alpha=1)
plt.title('Kernel Density Estimate of Age for Titanic Passengers')
plt.xlabel('Age')
plt.ylabel('Density')
plt.show()


In [None]:
# KDE with bw_adjust=0.7: slightly narrower than seaborn's default bandwidth (multiplier 1).
plt.figure(figsize=(10, 6))
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
sns.kdeplot(titanic_df['age'], bw_adjust=0.7, fill=True, color="red", alpha=1)
plt.title('Kernel Density Estimate of Age for Titanic Passengers')
plt.xlabel('Age')
plt.ylabel('Density')
plt.show()


In [None]:
# KDE with bw_adjust=2: the bandwidth is doubled, producing a heavily smoothed curve.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(titanic_df['age'], bw_adjust=2, fill=True, color="red", alpha=1, ax=ax)
ax.set_title('Kernel Density Estimate of Age for Titanic Passengers')
ax.set_xlabel('Age')
ax.set_ylabel('Density')
plt.show()


## Careful: Are bars stacked or overlapping?

In [None]:


# Split the ages by sex. Boolean masks on the 'sex' column select the rows, and
# .loc pulls out the matching 'age' values as one Series per group.
# ages_male and ages_female are reused by the later histogram cells.
is_male = titanic_df['sex'] == 'male'
is_female = titanic_df['sex'] == 'female'
ages_male = titanic_df.loc[is_male, 'age']
ages_female = titanic_df.loc[is_female, 'age']

# Number of histogram bins (adjust as needed)
bins = 15

# Passing a list of two Series draws the groups as side-by-side bars in each bin
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist([ages_male, ages_female], bins, label=['Male', 'Female'])

# Title, axis labels and legend
ax.set_title('Age Distribution by Sex on the Titanic')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.legend()

# Render the figure
plt.show()


In [None]:
# Stacked variant: female counts are drawn on top of male counts in each of the 15 bins,
# so the total bar height is the combined passenger count.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist([ages_male, ages_female], bins=15, stacked=True, edgecolor='white', label=['Male', 'Female'])

# Labels, legend, grid and x ticks
ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.set_title('Age Distribution of Titanic Passengers by Sex')
ax.legend()
ax.grid(axis='x')
ax.set_xticks([0, 20, 40, 60])

# Render the figure
plt.show()



In [None]:

# Stacked histogram of ages by sex using 20 bins
plt.figure(figsize=(10, 6))  # Adjust figure size as desired
plt.hist(
    [ages_male, ages_female],
    bins=20,
    stacked=True,
    label=['Male', 'Female'],
    edgecolor='white',
)

# Customize the plot
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Age Distribution of Titanic Passengers by Sex')
plt.legend()
plt.grid(axis='x')
plt.xticks([0, 20, 40, 60])
plt.tick_params(axis='x', colors='gray')  # Set x-axis tick colors to gray

# Show the plot explicitly, as every other plotting cell in this notebook does
plt.show()


In [None]:
# Age pyramid: age runs along the vertical axis, male counts extend to the left
# of zero and female counts to the right.
plt.figure(figsize=(10, 6))

# One common age range for both calls, so the two histograms share identical
# bin edges and the mirrored bars line up row by row.
age_range = (titanic_df['age'].min(), titanic_df['age'].max())

# Male histogram: a weight of -1 per passenger makes the counts negative, which
# draws the horizontal bars on the left side of the zero line.
plt.hist(ages_male, bins=30, range=age_range, orientation='horizontal',
         label='Male', alpha=0.5, color='blue', weights=[-1] * len(ages_male))

# Female histogram on the right side
plt.hist(ages_female, bins=30, range=age_range, orientation='horizontal',
         label='Female', alpha=0.5, color='red')

# Add labels and title
plt.xlabel('Number of Passengers')
plt.ylabel('Age')
plt.title('Age Pyramid of Titanic Passengers')
plt.legend(loc='upper right')

# Label the count axis with absolute values so the left side does not show
# negative passenger counts.
count_ticks = [-60, -40, -20, 0, 20, 40, 60]
plt.xticks(count_ticks, [str(abs(tick)) for tick in count_ticks])

plt.show()


## Alternatively: KDEs showing proportions of total

In [None]:

plt.figure(figsize=(10, 6))

# KDE for all passengers (`fill=True` replaces the deprecated `shade=True` keyword)
sns.kdeplot(titanic_df['age'], bw_adjust=0.7, fill=True, color="red", alpha=1, label='Total')

# Male overlay is disabled (commented out) in this cell
#sns.kdeplot(titanic_df[titanic_df['sex'] == 'male']['age'], bw_adjust=0.7, fill=True, color="blue", alpha=0.7, label='Male')

plt.title('Kernel Density Estimate of Age for Titanic Passengers')
plt.xlabel('Age')
plt.ylabel('Density')
plt.legend() # Add a legend to identify the curve
plt.show()


In [None]:
plt.figure(figsize=(10, 6))

# KDE for all passengers (`fill=True` replaces the deprecated `shade=True` keyword)
sns.kdeplot(titanic_df['age'], bw_adjust=0.7, fill=True, color="red", alpha=1, label='Total')

# KDE for female passengers. The row filter now matches the 'female' legend label;
# previously the rows were filtered on 'male' while the label said 'female'.
# NOTE(review): each kdeplot call is normalised on its own, so this curve has unit
# area rather than being scaled to the female share of all passengers.
sns.kdeplot(titanic_df[titanic_df['sex'] == 'female']['age'], bw_adjust=0.7, fill=True, color="pink", alpha=0.7, label='female')

plt.title('Kernel Density Estimate of Age for Titanic Passengers')
plt.xlabel('Age')
plt.ylabel('Density')
plt.legend() # Add a legend to distinguish the curves
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
# KDE for all passengers (`fill=True` replaces the deprecated `shade=True` keyword)
sns.kdeplot(titanic_df['age'], bw_adjust=0.7, fill=True, color="black", alpha=1, label='Total')
# KDE for male passengers
sns.kdeplot(titanic_df[titanic_df['sex'] == 'male']['age'], bw_adjust=0.7, fill=True, color="red", alpha=0.7, label='male')

# KDE for female passengers
sns.kdeplot(titanic_df[titanic_df['sex'] == 'female']['age'], bw_adjust=0.7, fill=True, color="skyblue", alpha=0.7, label='female')

# NOTE(review): the three kdeplot calls are normalised independently, so each
# curve has unit area; the male/female curves are not scaled to their share of
# the total.
plt.title('Kernel Density Estimate of Age for Titanic Passengers')
plt.xlabel('Age')
plt.ylabel('Density')
plt.legend() # Add a legend to distinguish the curves
plt.show()

## Pie Chart

In [None]:
# Pie chart of passenger counts by sex, with each slice annotated by its percentage.
sex_counts = titanic_df['sex'].value_counts()  # counts per category, largest first
labels = sex_counts.index  # category names in the same order as the counts
plt.figure(figsize=(8, 8))  # Optional: specifies the figure size
# autopct formats each slice's share of the total, e.g. '64.8%'
plt.pie(sex_counts, labels=labels, autopct='%1.1f%%')
plt.title('Distribution of Passengers by Sex')  # Adds a title to the pie chart
plt.legend()
plt.show()

In [None]:
# Load the 'titanic' example dataset through seaborn's load_dataset and bind it to DT.
DT=sns.load_dataset('titanic')