In [119]:
import numpy as np
import pandas as pd                 # library for dataframes
import datetime                     # library for dates
import matplotlib.pyplot as plt     # library for plotting
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
import matplotlib.font_manager as font_manager
import seaborn as sns
from helper_function import preprocessing
from analyse_average_washing_time import *
from tueplots import bundles
from tueplots.constants.color import rgb
from scipy.stats import ttest_ind
from matplotlib.lines import Line2D

In [120]:
# Global look-and-feel for matplotlib / seaborn (cosmetic only)
# Initial font choice; note that the serif settings further down replace
# the font family chosen here.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Microsoft Sans Serif'
plt.rcParams.update({
    'axes.labelsize': 18,          # x and y axis labels
    'xtick.labelsize': 17,         # x tick labels
    'ytick.labelsize': 17,         # y tick labels
    'legend.fontsize': 17,         # legend entries
    'font.size': 17,               # default text size
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
})
sns.set_style('whitegrid')
sns.set_style({'font.family': 'serif', 'font.serif': 'Times New Roman'})

# Location of the input data set
data_path = '../data/anonymized.csv'
# Directory the figures are written to
plot_path = '../plots/'

In [121]:
# Constants shared by the cells below
# dpi used when saving figures; lower it for faster, smaller exports
resolution = 600
# Date tick format used on time axes
date_format = '%d.%m.%Y'
date_form = DateFormatter(date_format)
# German weekday abbreviations (as they appear in the data) -> English abbreviations
day_mapping = {'Mo': 'Mon', 'Di': 'Tue', 'Mi': 'Wed', 'Do': 'Thu', 'Fr': 'Fri', 'Sa': 'Sat', 'So': 'Sun'}
# Monday-first ordering in both languages, derived from the mapping above
custom_order = list(day_mapping.keys())
custom_order_eng = list(day_mapping.values())

In [122]:
# Read the raw booking table from disk ...
original_data = pd.read_csv(filepath_or_buffer=data_path)
# ... and run it through the project's preprocessing helper
data = preprocessing(date_format=None, data_frame=original_data)
data

Unnamed: 0,weekday,date,start_time,end_time,duration(in h),pseudonym,sex,machine
0,Mo,2022-12-05,12:00,13:00,1.0,F1,f,2
1,Mo,2022-12-05,13:00,14:00,1.0,F2,f,2
2,Di,2022-12-06,12:00,13:00,1.0,M1,m,1
3,Di,2022-12-06,13:00,14:30,1.5,M2,m,1
4,Di,2022-12-06,14:30,15:30,1.0,M3,m,1
...,...,...,...,...,...,...,...,...
2341,Mo,2023-12-04,10:30,11:30,1.0,M36,m,2
2342,Mo,2023-12-04,11:30,12:30,1.0,M7,m,2
2343,Mo,2023-12-04,15:00,16:00,1.0,M1,m,2
2344,Mo,2023-12-04,17:20,18:20,1.0,M44,m,1


In [123]:
def sum_usage_per_day(df):
    '''
    Total booked duration for every date present in the data.
    :param df (pd.DataFrame): frame with a 'date' column and a numeric
        'duration(in h)' column
    :return (pd.Series): summed 'duration(in h)' per date, indexed by 'date'
    '''
    durations = df['duration(in h)']
    return durations.groupby(df['date']).sum()

In [124]:
# Booked hours per date
daily_usage = sum_usage_per_day(data)
# Overall mean of the daily totals
average = daily_usage.mean()
# Rolling mean over 7 consecutive entries of daily_usage (i.e. 7 recorded dates)
moving_average = daily_usage.rolling(window=7).mean()
# Rolling mean (same window) of the absolute distance to the overall mean
deviation = (average - daily_usage).abs().rolling(window=7).mean()

In [125]:
# First and last booking date in the data set; used as the x-range of the
# yearly histogram. Derived from the data instead of a hard-coded row label
# (the former `data['date'][2345]`) so the cell keeps working when the number
# of rows or the index changes.
start_date = data['date'].min()
end_date = data['date'].max()
# Lecture-free periods that are shaded in the yearly plot
xmas_start = datetime.datetime(2022, 12, 23)
xmas_end = datetime.datetime(2023, 1, 7)
# Summer semester (lecture period)
ss_start = datetime.datetime(2023, 4, 17)
ss_end = datetime.datetime(2023, 7, 29)
# NOTE(review): 29 May 2023 was Whit Monday, so this span looks like the
# Pentecost break rather than Easter -- confirm before renaming.
easter_start = datetime.datetime(2023, 5, 29)
easter_end = datetime.datetime(2023, 6, 3)
# `ws_end` lies before `ws_start` on purpose: it closes the winter semester
# that was already running when the data begins, while `ws_start` opens the
# following one. The plot shades ws_end -> ss_start and ss_end -> ws_start.
ws_start = datetime.datetime(2023, 9, 16)
ws_end = datetime.datetime(2023, 2, 11)

In [126]:
# Alpha (transparency) of the shaded spans that mark the lecture-free periods
opacity = 0.2

In [None]:
fig, ax = plt.subplots(figsize=(17, 10))
# Shade the lecture-free periods: light blue for the short breaks inside a
# semester, light green for the breaks between semesters. Only the first span
# of each colour gets a label so the legend lists each category once.
ax.axvspan(xmin=xmas_start, xmax=xmas_end, alpha=opacity, color=rgb.tue_lightblue, label='Midterm break')
ax.axvspan(xmin=easter_start, xmax=easter_end, alpha=opacity, color=rgb.tue_lightblue)
ax.axvspan(xmin=ws_end, xmax=ss_start, color=rgb.tue_lightgreen, alpha=opacity, label='Semester break')
ax.axvspan(xmin=ss_end, xmax=ws_start, color=rgb.tue_lightgreen, alpha=opacity)
# Format the date ticks as dd.mm.YYYY
ax.xaxis.set_major_formatter(date_form)
# Histogram of bookings over the observed period (one count per row of `data`)
# NOTE(review): the bars count bookings per bin while the lines below show
# booked hours per day -- confirm both are meant to share one y-axis.
n, bins, patches = ax.hist(data['date'], bins=300, range=(start_date, end_date), width=1.0)
# Place a major tick every fourth week (interval=4)
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=4))
# Overlay the 7-entry moving average and the overall mean of the daily usage
ax.plot(moving_average, label='Moving average')
ax.axhline(y=average, color='blue', linestyle='--', label='Average')
# Rotate the date labels so they do not overlap
fig.autofmt_xdate()
fig.legend()
fig.savefig(f'{plot_path}data-for-year.png', dpi=resolution)

  fig.autofmt_xdate()


In [None]:
# Number of bookings per (sex, weekday) and per weekday overall
bookings_by_weekday = data.groupby(['weekday'])
occupancy_per_weekday = data.groupby(['sex', 'weekday']).size()
all_occupancy = bookings_by_weekday.size()
# Per-sex slices as two-column frames: 'weekday' and 'occurrences'
female_occupancy = occupancy_per_weekday.loc['f'].reset_index(name='occurrences')
male_occupancy = occupancy_per_weekday.loc['m'].reset_index(name='occurrences')

In [None]:
# Absolute number of bookings per weekday and sex, in long format
# (columns: Day, Gender, Count) for the twin-axis bar chart.

# Count bookings per (sex, weekday); one column per sex, rows in Monday-first order
counts_by_weekday = (
    data.groupby(['sex', 'weekday']).size()
    .unstack(level=0)
    .fillna(0)
    .astype(int)
    .loc[custom_order]
)

# Long format: one row per (weekday, sex) pair
absolute_values = counts_by_weekday.reset_index().melt(id_vars='weekday', var_name='Gender', value_name='Count')

# Ordered categorical keeps the Monday-first order when plotting
absolute_values['weekday'] = pd.Categorical(absolute_values['weekday'], categories=custom_order, ordered=True)
absolute_values = absolute_values.rename(columns={'weekday': 'Day'})

# Translate the German abbreviations by renaming the categories; calling
# `replace` on a categorical column for this purpose is deprecated in pandas.
absolute_values['Day'] = absolute_values['Day'].cat.rename_categories(day_mapping)

# Drop bookings of unknown sex. The explicit copy makes the result an
# independent frame, so the assignment below does not raise a
# SettingWithCopyWarning.
absolute_values = absolute_values[absolute_values['Gender'] != 'u'].copy()
absolute_values['Gender'] = absolute_values['Gender'].replace({'f': 'Females', 'm': 'Males'})

In [None]:
# Turn the raw counts into shares that sum to one within each group
for counts in (male_occupancy, female_occupancy):
    counts['occurrences'] = counts['occurrences'] / counts['occurrences'].sum()
all_occupancy = (all_occupancy / all_occupancy.sum()).reset_index(name='occurrences')

# Sort every frame Monday-first by making 'weekday' an ordered categorical
for shares in (male_occupancy, female_occupancy, all_occupancy):
    shares['weekday'] = pd.Categorical(shares['weekday'], categories=custom_order, ordered=True)
    shares.sort_values('weekday', inplace=True)

# Wide table with English day names; relies on all three frames being in
# Monday-first order after the sort above
df = pd.DataFrame(
    {
        'Day': custom_order_eng,
        'Males': male_occupancy['occurrences'].tolist(),
        'Females': female_occupancy['occurrences'].tolist(),
        'All': all_occupancy['occurrences'].tolist(),
    },
    index=custom_order,
)

# Long format with a 'Gender' column, shares expressed in percent
df_melted = df.melt(id_vars=['Day'], value_vars=['Females', 'Males', 'All'], var_name='Gender', value_name='Percentage')
df_melted['Percentage'] = df_melted['Percentage'] * 100

In [None]:
# Weekday bar chart: relative shares on the left axis, absolute booking
# counts on a twin right axis.
fig, ax = plt.subplots(figsize=(16, 10))

# Left axis: share of bookings per weekday in percent. The target axes is
# passed explicitly instead of relying on matplotlib's "current axes" state.
sns.barplot(x='Day', y='Percentage', hue='Gender', data=df_melted,
            palette={'Females': rgb.tue_red, 'Males': rgb.tue_blue, 'All': rgb.tue_green},
            alpha=0.5, ax=ax)
# Reference line: share each weekday would have under a uniform distribution
ax.axhline(y=100/7, color='blue', linestyle='--')
ax.set_yticks([0.0, 2, 4, 6, 8, 10, 12, 14.3, 16])

# Right axis: absolute counts. The y-limit is set well above the largest
# tick, which keeps these bars in the lower part of the figure.
ax2 = ax.twinx()
ax2.set_ylim([0, 400])
ax2.set_yticks([0, 30, 50, 70, 90, 110, 130, 150, 170, 190])
sns.barplot(x='Day', y='Count', hue='Gender', data=absolute_values, alpha=0.5,
            palette={'Females': rgb.tue_red, 'Males': rgb.tue_blue}, ax=ax2)
# The legend of the left axis already covers both genders
count_legend = ax2.get_legend()
if count_legend is not None:
    count_legend.remove()
ax2.set_ylabel('Absolute numbers (accumulated)')
fig.savefig(f'{plot_path}Data_per_weekday.png', dpi=resolution)

In [None]:
# Switch to the tueplots ICML 2022 layout for the remaining figures ...
icml_layout = bundles.icml2022(column='full', nrows=1, ncols=2, usetex=False)
plt.rcParams.update(icml_layout)
# ... then override font sizes and family on top of that bundle
plt.rcParams.update({
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 8,
    'font.family': 'Times New Roman',
})

In [None]:
# German weekday abbreviations in Monday-first order
weekdays = ['Mo', 'Di', 'Mi', 'Do', 'Fr', 'Sa', 'So']

# Abbreviation -> full English name (used for subplot titles)
weekdays_dict = dict(zip(
    weekdays,
    ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
))

# Per-weekday containers for the female counts; the string values are
# placeholders that a later cell overwrites with count arrays
weekdays_data_dict_female = dict(zip(
    weekdays,
    [
        "time_sum_monday_w",
        "time_sum_tuesday_w",
        "time_sum_wednesday_w",
        "time_sum_thursday_w",
        "time_sum_friday_w",
        "time_sum_saturday_w",
        "time_sum_sunday_w",
    ],
))

# Same structure for the male counts
weekdays_data_dict_male = dict(zip(
    weekdays,
    [
        "time_sum_monday_m",
        "time_sum_tuesday_m",
        "time_sum_wednesday_m",
        "time_sum_thursday_m",
        "time_sum_friday_m",
        "time_sum_saturday_m",
        "time_sum_sunday_m",
    ],
))

In [None]:
def _start_slot_counts(start_times):
    '''
    Count bookings per quarter-hour slot of the day.
    :param start_times: iterable of 'HH:MM' strings
    :return (np.ndarray): 96 integer counts, slot = hour * 4 + minute // 15
    '''
    counts = np.zeros(96, dtype=int)
    for stamp in start_times:
        parts = stamp.split(':')
        counts[int(parts[0]) * 4 + int(parts[1]) // 15] += 1
    return counts


# A booking is identified by person, date and start time; drop repeated rows
data = data.drop_duplicates(subset=['pseudonym', 'date', 'start_time'])
men_data = data[data['sex'] == 'm']
women_data = data[data['sex'] == 'f']

# Totals over all weekdays, accumulated in the loop below
all_data_men = np.zeros(96, dtype=int)
all_data_women = np.zeros(96, dtype=int)

# Replace the placeholder entries of both dictionaries by the slot counts of
# the respective weekday and add them to the overall totals
for day in weekdays:
    weekdays_data_dict_female[day] = _start_slot_counts(
        women_data.loc[women_data['weekday'] == day, 'start_time'])
    weekdays_data_dict_male[day] = _start_slot_counts(
        men_data.loc[men_data['weekday'] == day, 'start_time'])

    all_data_men = all_data_men + weekdays_data_dict_male[day]
    all_data_women = all_data_women + weekdays_data_dict_female[day]

In [None]:
# One row per weekday. The x-axis holds two 24 h panels of 96 quarter-hour
# slots each: female start times in slots 0-95, male start times in slots
# 96-191, separated by a black vertical line at x=96.


def _weighted_slot_stats(counts):
    '''
    Mean and standard deviation of the slot index, weighted by the counts.
    :param counts (np.ndarray): bookings per slot
    :return: (mean, std) in slot units, or None when there are no bookings
        (np.average would raise on all-zero weights)
    '''
    if counts.sum() == 0:
        return None
    slots = np.arange(len(counts))
    mean_slot = np.average(slots, weights=counts)
    std_slot = np.sqrt(np.average((slots - mean_slot) ** 2, weights=counts))
    return mean_slot, std_slot


def _slot_to_clock(slot):
    '''Format a quarter-hour slot index as 'HH:MM'.'''
    return f"{slot // 4:02d}:{(slot % 4) * 15:02d}"


intervals = list(range(192))
fig, ax = plt.subplots(nrows=7, figsize=(10, 25), sharex=False, dpi=300, zorder=4)
zeroes_array = np.zeros(96, dtype=int)

# One tick per hour (4 slots); every second hour is labelled, and the clock
# restarts at 00:00 for the second panel. The final tick stays unlabelled.
time_stamps = np.linspace(0, 192, 49)
time_labels = [f"{hour % 24:02d}:00" if hour % 2 == 0 and hour < 48 else "" for hour in range(49)]

ax[6].set_xlabel("Time")
ax[3].set_ylabel("Frequency")
plt.subplots_adjust(hspace=0.45, bottom=0.2)

y_pos = [2, 4, 6, 8, 10, 12, 14]

# Set x- and y-axis
for subplot in ax:
    subplot.set_xlim(0, 192)
    subplot.set_ylim(0, 14)
    subplot.set_xticks(time_stamps)
    subplot.set_xticklabels(labels=time_labels, rotation=30)
    subplot.set_yticks(np.arange(0, 13, step=2))
    # Manual horizontal grid lines
    for yc in y_pos:
        subplot.axhline(y=yc, color='grey', linewidth=0.6, zorder=2)
    # Divider between the female and the male panel
    subplot.axvline(x=96, color='black', linewidth=1, zorder=4)

# Creates the plots for all weekdays
for index, day in enumerate(weekdays_data_dict_female):
    panel = ax[index]
    female_counts = weekdays_data_dict_female[day]
    male_counts = weekdays_data_dict_male[day]

    panel.bar(intervals, np.concatenate((zeroes_array, male_counts), axis=0), label='Set1', width=0.9, color=rgb.tue_blue, align='edge', zorder=3, alpha=0.5)
    panel.bar(intervals, np.concatenate((female_counts, zeroes_array), axis=0), label='Set1', width=0.9, color=rgb.tue_red, align='edge', zorder=3, alpha=0.5)

    # Mean start time (dashed line) with its standard deviation as horizontal
    # error bar. The spread is the count-weighted std of the slot index, i.e.
    # it is measured along the time axis; the former np.std(counts) measured
    # the spread of the bar heights instead.
    for counts, offset, colour in ((male_counts, 96, rgb.tue_blue), (female_counts, 0, rgb.tue_red)):
        stats = _weighted_slot_stats(counts)
        if stats is None:
            # No bookings for this group on this weekday: nothing to mark
            continue
        mean_slot, std_slot = stats
        centre = np.round(mean_slot) + offset
        panel.errorbar(x=centre, y=np.average(counts), xerr=std_slot, linewidth=2, capsize=6, zorder=4, color=colour, ecolor=colour)
        panel.axvline(x=centre, color=colour, linewidth=1, zorder=4, linestyle='dashed')

    # Clock times of the busiest slot(s); computed for inspection only, they
    # are not drawn in the figure
    max_times_woman = [_slot_to_clock(slot) for slot in np.flatnonzero(female_counts == np.max(female_counts)).tolist()]
    plural_w = "Time" if len(max_times_woman) == 1 else "Times"

    max_times_man = [_slot_to_clock(slot) for slot in np.flatnonzero(male_counts == np.max(male_counts)).tolist()]
    plural_m = "Time" if len(max_times_man) == 1 else "Times"

    panel.set_title(f"{weekdays_dict[day]}")

# Shared legend, built once and attached to the first subplot only
custom_lines = [Line2D([0], [0], color=rgb.tue_red, lw=2, alpha=0.5),
                Line2D([0], [0], color=rgb.tue_red, lw=2, linestyle='dashed'),
                Line2D([0], [0], color=rgb.tue_red, lw=2),
                Line2D([0], [0], color=rgb.tue_blue, lw=2, alpha=0.5),
                Line2D([0], [0], color=rgb.tue_blue, lw=2, linestyle='dashed'),
                Line2D([0], [0], color=rgb.tue_blue, lw=2)]
ax[0].legend(custom_lines, ['Female Students', 'Female Average', 'Female Std.', 'Male Students', 'Male Average', 'Male Std.'], prop={'size': 7}, loc='upper left')

fig.savefig(f'{plot_path}daily_distribution.png')

In [None]:
# Independent two-sample t-test on the per-slot start counts of men vs. women
data_men, data_women = all_data_men.tolist(), all_data_women.tolist()
result = ttest_ind(all_data_men, all_data_women)
# Report only the p-value
print(result.pvalue)

In [None]:
# Start-time distribution summed over all weekdays. Same layout as the
# per-weekday figure: female counts in slots 0-95, male counts in slots 96-191.
fig, ax = plt.subplots(nrows=1, figsize=(10, 3), sharex=False, dpi=300, zorder=5)

y_pos = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

ax.set_xlim(0, 192)
ax.set_ylim(0, 50)
ax.set_xticks(time_stamps)
ax.set_xticklabels(labels=time_labels, rotation=30)
ax.set_yticks(np.arange(0, 50, step=10))
# Faint manual horizontal grid lines
for yc in y_pos:
    ax.axhline(y=yc, color='grey', linewidth=0.6, zorder=2, alpha=0.2)
# Divider between the female and the male panel
ax.axvline(x=96, color='black', linewidth=1, zorder=4)

ax.set_xlabel("Time")
ax.set_ylabel("Frequency")

ax.bar(intervals, np.concatenate((zeroes_array, all_data_men), axis=0), label='Set1', width=0.9, color=rgb.tue_blue, align='edge', zorder=3, alpha=0.5)
ax.bar(intervals, np.concatenate((all_data_women, zeroes_array), axis=0), label='Set1', width=0.9, color=rgb.tue_red, align='edge', zorder=3, alpha=0.5)

# Clock times of the busiest slot(s); computed for inspection only, they are
# not drawn in the figure
max_times_woman_all = [f"{slot // 4:02d}:{(slot % 4) * 15:02d}"
                       for slot in np.flatnonzero(all_data_women == np.max(all_data_women)).tolist()]
max_times_man_all = [f"{slot // 4:02d}:{(slot % 4) * 15:02d}"
                     for slot in np.flatnonzero(all_data_men == np.max(all_data_men)).tolist()]

# Count-weighted mean start slot per group, rounded to a whole slot for plotting
slot_index = np.arange(96)
men_mean_slot = np.average(slot_index, weights=all_data_men)
women_mean_slot = np.average(slot_index, weights=all_data_women)
men_average = np.round(men_mean_slot)
women_average = np.round(women_mean_slot)

# Count-weighted standard deviation of the start slot. This is the spread
# along the time axis, which is what a horizontal error bar has to show; the
# former np.std(counts) measured the spread of the bar heights instead.
men_time_std = np.sqrt(np.average((slot_index - men_mean_slot) ** 2, weights=all_data_men))
women_time_std = np.sqrt(np.average((slot_index - women_mean_slot) ** 2, weights=all_data_women))

ax.axvline(x=women_average, color=rgb.tue_red, linewidth=1, zorder=4, linestyle='dashed')
ax.errorbar(x=women_average, y=np.average(all_data_women), xerr=women_time_std, linewidth=1, capsize=6, zorder=4, color=rgb.tue_red, ecolor=rgb.tue_red)
ax.axvline(x=men_average+96, color=rgb.tue_blue, linewidth=1, zorder=4, linestyle='dashed')
ax.errorbar(x=men_average+96, y=np.average(all_data_men), xerr=men_time_std, linewidth=1, capsize=6, zorder=4, color=rgb.tue_blue, ecolor=rgb.tue_blue)

# Hand-built legend handles: translucent = bars, dashed = mean, solid = std
custom_lines = [Line2D([0], [0], color=rgb.tue_red, lw=2, alpha=0.5),
                Line2D([0], [0], color=rgb.tue_red, lw=2, linestyle='dashed'),
                Line2D([0], [0], color=rgb.tue_red, lw=2),
                Line2D([0], [0], color=rgb.tue_blue, lw=2, alpha=0.5),
                Line2D([0], [0], color=rgb.tue_blue, lw=2, linestyle='dashed'),
                Line2D([0], [0], color=rgb.tue_blue, lw=2)]
ax.legend(custom_lines, ['Female Students', 'Female Average', 'Female Std.', 'Male Students', 'Male Average', 'Male Std.'], prop={'size': 10}, loc='upper left')

fig.savefig(f'{plot_path}averaged_distribution.png')

In [None]:
# Print the general overview of individuals per group
print_number_of_individual_females_males_and_undefined()
# Build one frame per sex with the per-person statistics
df_individual_females, df_individual_males = create_data_for_individuals()
# The cleaning lady ('Putzfrau') is excluded from the analysis
df_individual_females = df_individual_females.drop('Putzfrau')

# Outlier filter: keep people with a positive active time and a weekly
# average below the threshold
threshold = 10
keep_females = (df_individual_females['active_time_in_weeks'] > 0) & (df_individual_females['avg_hours_per_week'] < threshold)
keep_males = (df_individual_males['active_time_in_weeks'] > 0) & (df_individual_males['avg_hours_per_week'] < threshold)
df_individual_females = df_individual_females[keep_females]
df_individual_males = df_individual_males[keep_males]

print_min_and_max_active_times(df_individual_females, df_individual_males)

# Plain numpy arrays for the plotting and testing helpers
avg_hours_per_week_female = np.array(df_individual_females['avg_hours_per_week'])
active_time_in_weeks_female = np.array(df_individual_females['active_time_in_weeks'])
avg_hours_per_week_male = np.array(df_individual_males['avg_hours_per_week'])
active_time_in_weeks_male = np.array(df_individual_males['active_time_in_weeks'])

# Figures
plot_avg_washing_time(avg_hours_per_week_female, avg_hours_per_week_male, resolution)
plot_avg_washing_time_against_total_active_time(
    avg_hours_per_week_female, avg_hours_per_week_male,
    active_time_in_weeks_female, active_time_in_weeks_male,
    resolution)
plot_comparison_of_avg_washing_time(avg_hours_per_week_female, avg_hours_per_week_male, resolution)

# Permutation tests on the difference in mean and in standard deviation
p_value_mean_difference = permutation_test(avg_hours_per_week_female, avg_hours_per_week_male, mean_difference)
print("p-value mean difference:", p_value_mean_difference)
p_value_std_difference = permutation_test(avg_hours_per_week_female, avg_hours_per_week_male, std_difference)
print("p-value std difference", p_value_std_difference)
