In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the seaborn "tips" dataset and derive the tip as a percentage of the
# bill in one expression, so re-running this cell always rebuilds the frame
# from the freshly loaded data instead of mutating an existing one.
tips_df = sns.load_dataset('tips').assign(
    tip_percent=lambda df: (df['tip'] / df['total_bill'] * 100).round(2)
)

# Show which days are present before aggregating by them.
print(tips_df['day'].unique())

# Per-day bill statistics: number of bills and mean bill for each day.
days_data = tips_df.groupby('day', observed=True)['total_bill']
mean_data = days_data.agg(['count', 'mean'])

# idxmax() returns the index label (the day) of the largest mean bill.
highest_bill_day = mean_data['mean'].idxmax()
highest_mean_bill = mean_data.loc[highest_bill_day, 'mean']
print(f"Day with highest mean bill: {highest_bill_day} (${highest_mean_bill:.2f})")

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

Day with highest mean bill: Sun ($21.41)


In [25]:
# Number of bills recorded at each meal time, as a one-column frame.
time_counts = (
    tips_df
    .groupby('time', observed=True)
    .size()
    .to_frame('count')
)
lunch_count = time_counts.loc['Lunch', 'count']
dinner_count = time_counts.loc['Dinner', 'count']

# time x smoker count tables (rows: time, columns: smoker).
smoker_time = (
    tips_df
    .groupby(['time', 'smoker'], observed=True)
    .size()
    .unstack(fill_value=0)
)
smoker_totals = (
    tips_df
    .groupby('time', observed=True)['smoker']
    .value_counts()
    .unstack()
)
total_by_time = smoker_totals.sum(axis='columns')
smoker_by_time = smoker_totals.loc[:, 'Yes']

# Put the bill count and the smoker count for each meal time side by side.
combined_df = pd.DataFrame(
    dict(total_count=time_counts['count'], smokers=smoker_by_time)
)

# Share of bills at each meal time that came from smokers, in percent.
combined_df['smoker_percent'] = (
    combined_df['smokers']
    .div(combined_df['total_count'])
    .mul(100)
    .round(2)
)

print(f"Combined analysis:\n{combined_df}")

Combined analysis:
        total_count  smokers  smoker_percent
time                                        
Lunch            68       23           33.82
Dinner          176       70           39.77


In [30]:
# Box plot of the tip amount for each sex, written to disk rather than shown inline.
plt.figure(figsize=(10, 6))
sns.boxplot(data=tips_df, x='sex', y='tip').set(
    xlabel='Sex',
    ylabel='Tip Amount ($)',
    title='Distribution of Tips by Sex',
)
plt.savefig('./3_4_boxplot_tip_sex.png', dpi=300, bbox_inches='tight')
plt.close()

# Tip amounts split by sex, for the outlier counts that follow.
is_male = tips_df['sex'] == 'Male'
is_female = tips_df['sex'] == 'Female'
male_tips = tips_df.loc[is_male, 'tip']
female_tips = tips_df.loc[is_female, 'tip']

def count_outliers(data, whis=1.5):
    """Count the values of ``data`` lying outside the IQR fences.

    The fences are ``Q1 - whis * IQR`` and ``Q3 + whis * IQR``, where ``Q1``
    and ``Q3`` are the 0.25 and 0.75 quantiles of ``data`` and
    ``IQR = Q3 - Q1``. Values exactly on a fence are not counted.

    Parameters
    ----------
    data : pandas.Series
        Numeric observations; must provide ``.quantile`` and elementwise
        comparison.
    whis : float, default 1.5
        Multiple of the IQR used to place the fences. The default keeps the
        previous hard-coded behaviour.

    Returns
    -------
    int
        Number of values strictly below the lower fence or strictly above
        the upper fence.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - whis * iqr
    upper = q3 + whis * iqr
    # Summing the boolean mask counts the hits without building the filtered Series.
    is_outlier = (data < lower) | (data > upper)
    return int(is_outlier.sum())

# Run the same outlier count over each sex's tip amounts.
male_outliers, female_outliers = map(count_outliers, (male_tips, female_tips))

print(f"Male outliers: {male_outliers}")
print(f"Female outliers: {female_outliers}")

Male outliers: 6
Female outliers: 1


In [None]:
# Exclusive upper bound on tip_percent for this plot. It drives both the row
# filter and the figure title so the two cannot drift apart.
MAX_TIP_PERCENT = 70

tips_filtered = tips_df[tips_df['tip_percent'] < MAX_TIP_PERCENT]

# Box plot of tip percentage for each sex, written to disk rather than shown inline.
plt.figure(figsize=(10, 6))
sns.boxplot(x='sex', y='tip_percent', data=tips_filtered)
plt.xlabel('Sex')
plt.ylabel('Tip Percentage (%)')
plt.title(f'Distribution of Tip Percentage by Sex (< {MAX_TIP_PERCENT}%)')
plt.savefig('./3_5_boxplot_tip_percent_by_sex.png', dpi=300, bbox_inches='tight')
plt.close()

# Single .loc lookup (row mask + column) instead of chained indexing.
male_tip_percentage = tips_filtered.loc[tips_filtered['sex'] == 'Male', 'tip_percent']
female_tip_percentage = tips_filtered.loc[tips_filtered['sex'] == 'Female', 'tip_percent']

# NOTE(review): the double underscore in these two names looks like a typo;
# left unchanged because cells outside this view may reference them.
male__percentage_outliers = count_outliers(male_tip_percentage)
female__percentage_outliers = count_outliers(female_tip_percentage)

# NOTE(review): the heading mentions symmetry, but only outlier counts are
# computed in this cell - confirm whether a symmetry measure is still intended.
# Plain string literal: there is nothing to interpolate, so no f-prefix.
print("Outlier and symmetry analysis:")
print(f"Male outliers: {male__percentage_outliers}")
print(f"Female outliers: {female__percentage_outliers}")



Outlier and symmetry analysis:
  Male outliers: 1
  Female outliers: 6
