In [2]:
import pandas as pd
import altair as alt
import math
import scipy.stats

In [3]:
# Load the survey export; the path is relative to the notebook's working directory.
datasetPath = "Group2_Dataset.csv"
data = pd.read_csv(datasetPath)

In [4]:
# Remove the row whose index label is 0 (the first row read from the CSV).
data = data.drop(index=0)

Converting the measurement columns (times, accuracies, and comfort scores) from strings to numbers, since they are read from the CSV as text

In [5]:
# Columns that hold measurements and therefore need a numeric dtype
# before any arithmetic or statistics are run on them.
columnNames = [
    "LightModeTime",
    "DarkModeTime",
    "LightModeAccuracy",
    "DarkModeAccuracy",
    "OverallDarkComfortScore",
    "OverallLightComfortScore",
]

# Convert every listed column in one pass (pd.to_numeric is applied column by column).
data[columnNames] = data[columnNames].apply(pd.to_numeric)

Note: the first row was dropped (in the `data.drop(0)` cell above) because it contains the question descriptions from Qualtrics rather than a participant's responses.

In [7]:
# Per-participant paired differences, always computed as dark mode minus light mode.
data = data.assign(
    time_diff=lambda frame: frame["DarkModeTime"] - frame["LightModeTime"],
    accuracy_diff=lambda frame: frame["DarkModeAccuracy"] - frame["LightModeAccuracy"],
    comfort_diff=lambda frame: frame["OverallDarkComfortScore"] - frame["OverallLightComfortScore"],
)

In [21]:
# Mean and sample standard deviation of each measure, used below as the
# reference values for the 3-standard-deviation outlier bounds.

# Task completion time
darkModeTimeMean, darkModeTimeSTD = data["DarkModeTime"].mean(), data["DarkModeTime"].std()
lightModeTimeMean, lightModeTimeSTD = data["LightModeTime"].mean(), data["LightModeTime"].std()

# Typing accuracy
darkModeAccuracyMean, darkModeAccuracySTD = data["DarkModeAccuracy"].mean(), data["DarkModeAccuracy"].std()
lightModeAccuracyMean, lightModeAccuracySTD = data["LightModeAccuracy"].mean(), data["LightModeAccuracy"].std()

# Overall comfort score
darkModeComfortMean, darkModeComfortSTD = data["OverallDarkComfortScore"].mean(), data["OverallDarkComfortScore"].std()
lightModeComfortMean, lightModeComfortSTD = data["OverallLightComfortScore"].mean(), data["OverallLightComfortScore"].std()

Removing outliers (rows with any value more than 3 standard deviations away from that measure's mean):

In [22]:
# Each measure paired with the (mean, standard deviation) computed for it
# before any rows were removed, so every bound uses the same reference values.
outlierReference = {
    "DarkModeTime": (darkModeTimeMean, darkModeTimeSTD),
    "LightModeTime": (lightModeTimeMean, lightModeTimeSTD),
    "DarkModeAccuracy": (darkModeAccuracyMean, darkModeAccuracySTD),
    "LightModeAccuracy": (lightModeAccuracyMean, lightModeAccuracySTD),
    "OverallDarkComfortScore": (darkModeComfortMean, darkModeComfortSTD),
    "OverallLightComfortScore": (lightModeComfortMean, lightModeComfortSTD),
}

# Keep only rows within mean +/- 3 SD for every measure; both bounds are inclusive.
for measureName, (center, spread) in outlierReference.items():
    lowerBound = center - 3*spread
    upperBound = center + 3*spread
    data = data[data[measureName].between(lowerBound, upperBound)]


In [59]:
# Display the frame after outlier filtering, including the derived *_diff columns.
data

Unnamed: 0,ParticipantId,Experimenter,Informed Consent,LightModeTime,LightModeAccuracy,DarkModeTime,DarkModeAccuracy,Comfortability_1,Comfortability_2,Comfortability_3,Comfortability_4,Comfortability_5,OverallDarkComfortScore,OverallLightComfortScore,time_diff,accuracy_diff,comfort_diff
1,1,Team,Yes,76,0.96,75,0.97,Somewhat agree,Somewhat disagree,Agree,Agree,Somewhat disagree,7,13,-1,0.01,-6
2,2,Team,Yes,62,0.97,60,0.96,Agree,Agree,Agree,Agree,Agree,14,10,-2,-0.01,4
3,3,Team,Yes,77,0.96,72,0.97,Strongly agree,Strongly agree,Strongly agree,Strongly agree,Agree,16,12,-5,0.01,4
4,4,Team,Yes,76,0.97,98,0.92,Somewhat disagree,Somewhat agree,Agree,Neither agree nor disagree,Somewhat agree,9,9,22,-0.05,0
5,5,Team,Yes,72,0.96,77,0.96,Strongly agree,Strongly agree,Agree,Agree,Agree,16,10,5,0.0,6
6,6,Team,Yes,86,0.96,88,0.96,Strongly agree,Somewhat agree,Strongly agree,Strongly agree,Neither agree nor disagree,12,14,2,0.0,-2
7,7,Team,Yes,56,0.97,54,0.96,Somewhat disagree,Somewhat disagree,Strongly agree,Strongly agree,Somewhat disagree,5,15,-2,-0.01,-10
8,8,Team,Yes,53,0.95,53,0.94,Strongly agree,Agree,Strongly agree,Strongly agree,Disagree,11,16,0,-0.01,-5
9,9,Team,Yes,68,0.96,79,0.97,Neither agree nor disagree,Disagree,Somewhat disagree,Strongly agree,Somewhat agree,7,9,11,0.01,-2
10,10,Team,Yes,71,0.96,73,0.95,Strongly agree,Strongly agree,Somewhat agree,Somewhat agree,Strongly agree,17,7,2,-0.01,10


# Hypothesis 1: Using Dark Mode will result in lower task completion times

$H_0: \mu_D = \mu_L$

$H_A: \mu_D < \mu_L$

## Assumption Checks

In [23]:
# Shapiro-Wilk normality test on the paired time differences (dark - light).
timeDifferences = data["time_diff"]
scipy.stats.shapiro(timeDifferences)

ShapiroResult(statistic=0.9228260517120361, pvalue=0.035977207124233246)

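The normality assumption was met: the Shapiro-Wilk p-value of 0.036 is greater than our Bonferroni-adjusted $\alpha$ of 0.017 ($0.05 / 3$), so we fail to reject the null hypothesis that the time differences are normally distributed.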

In [49]:
# Histogram of the paired time differences: binned x axis, row count on y.
timeDiffAxis = alt.X("time_diff", bin=True, title="Change in time completion")
timeDiffHistogram = alt.Chart(data, title="Histogram of change in time of completion").mark_bar()
timeDiffHistogram.encode(x=timeDiffAxis, y="count()")

# Descriptive Statistics

In [25]:
# Recompute mean and sample standard deviation of the completion times
# from the current (outlier-filtered) frame.
lightModeTimeMean, lightModeTimeSTD = data["LightModeTime"].mean(), data["LightModeTime"].std()
darkModeTimeMean, darkModeTimeSTD = data["DarkModeTime"].mean(), data["DarkModeTime"].std()

# Report both conditions to two decimal places.
print("Dark: mean = %.2f, SD = %.2f" % (darkModeTimeMean, darkModeTimeSTD))
print("Light: mean = %.2f, SD = %.2f" % (lightModeTimeMean, lightModeTimeSTD))

Dark: mean = 70.17, SD = 16.41
Light: mean = 68.21, SD = 14.59


# One-Tailed Paired Samples T-test

In [58]:
# Paired-samples t-test; 'less' makes it one-tailed with the alternative
# that the mean dark-mode time is below the mean light-mode time.
scipy.stats.ttest_rel(
    data["DarkModeTime"],
    data["LightModeTime"],
    alternative='less',
)

Ttest_relResult(statistic=1.4816444479936035, pvalue=0.9251988711048994)

Since we have an overall $\alpha$ of 0.05 and we are running 3 total tests, we need to correct for multiple comparisons by doing Bonferroni's correction to find an adjusted $\alpha$ of $0.05 / 3$. The p-value of 0.925 is above our corrected $\alpha$ of 0.017, so we do not have statistically significant results.

# Effect Size

In [27]:
# Cohen's d for paired samples: mean of the differences divided by their
# sample standard deviation.
timeDiffSeries = data["time_diff"]
cohen_d_time = timeDiffSeries.mean() / timeDiffSeries.std()
cohen_d_time

0.27513446682832354

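This indicates only a small effect: the value of 0.275 falls between 0.2 and 0.5, which is conventionally classified as the small range. Because the differences are computed as dark minus light, the positive sign means dark-mode times were slightly longer on average, which is the opposite of the hypothesized direction.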

# Confidence Intervals

In [37]:
# 95% confidence interval for the mean paired time difference, centred on the
# sample mean with the standard error of the mean as the scale.
# The confidence level is passed positionally: SciPy renamed this parameter
# from `alpha` to `confidence`, and newer releases reject the `alpha=` keyword,
# whereas the positional form works on both old and new versions.
# NOTE(review): this is a normal-approximation interval; a t-based interval
# (scipy.stats.t.interval with df = n - 1) would be slightly wider for a small
# sample — confirm which one the write-up intends.
bounds_time = scipy.stats.norm.interval(0.95, loc=data.time_diff.mean(), scale=scipy.stats.sem(data.time_diff))
bounds_time

(-0.6345282751497667, 4.5655627579083875)

This tells us that we can be 95% confident that the true mean difference (dark mode minus light mode) in the time taken to type 100 words lies between -0.63 and 4.57 units; that is, about 95% of intervals constructed this way would contain the true mean difference. This lines up with the fact that we didn't reject the null hypothesis (since we didn't get a statistically significant result) because 0 is within this range, meaning there could be no difference in dark and light mode typing times. If there's no difference, then the mean time for dark mode isn't less than the mean time for light mode.

# Data Visualization

In [52]:
# Bar chart of each participant's light-mode completion time.
lightTimeBars = alt.Chart(data).mark_bar()
lightTimeBars.encode(x='ParticipantId', y='LightModeTime')

In [53]:
# Bar chart of each participant's dark-mode completion time.
darkTimeBars = alt.Chart(data).mark_bar()
darkTimeBars.encode(x='ParticipantId', y='DarkModeTime')

# Hypothesis 2: Using Dark Mode will result in higher accuracy scores

$H_0: \mu_{AD} = \mu_{AL}$

$H_A: \mu_{AD} > \mu_{AL}$

# Assumption Checks

In [31]:
# Shapiro-Wilk normality test on the paired accuracy differences (dark - light).
accuracyDifferences = data["accuracy_diff"]
scipy.stats.shapiro(accuracyDifferences)

ShapiroResult(statistic=0.9397556185722351, pvalue=0.09880173206329346)

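The normality assumption was met: the Shapiro-Wilk p-value of 0.099 is greater than our Bonferroni-adjusted $\alpha$ of 0.017, so we fail to reject the null hypothesis that the accuracy differences are normally distributed.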

In [32]:
# Histogram of the paired accuracy differences: binned x axis, row count on y.
accuracyDiffAxis = alt.X("accuracy_diff", bin=True, title="Change in typing accuracy")
accuracyDiffHistogram = alt.Chart(data, title="Histogram of change in typing accuracy").mark_bar()
accuracyDiffHistogram.encode(x=accuracyDiffAxis, y="count()")

# Descriptive Statistics

In [33]:
# Recompute mean and sample standard deviation of the accuracy scores
# from the current (outlier-filtered) frame.
lightModeAccuracyMean, lightModeAccuracySTD = data["LightModeAccuracy"].mean(), data["LightModeAccuracy"].std()
darkModeAccuracyMean, darkModeAccuracySTD = data["DarkModeAccuracy"].mean(), data["DarkModeAccuracy"].std()

# Report both conditions to two decimal places.
print("Dark: mean = %.2f, SD = %.2f" % (darkModeAccuracyMean, darkModeAccuracySTD))
print("Light: mean = %.2f, SD = %.2f" % (lightModeAccuracyMean, lightModeAccuracySTD))

Dark: mean = 0.94, SD = 0.03
Light: mean = 0.94, SD = 0.03


# One-Tailed Paired Sample T-Test

In [35]:
# Paired-samples t-test; 'greater' makes it one-tailed with the alternative
# that the mean dark-mode accuracy exceeds the mean light-mode accuracy.
scipy.stats.ttest_rel(
    data["DarkModeAccuracy"],
    data["LightModeAccuracy"],
    alternative='greater',
)

Ttest_relResult(statistic=-0.07304358872384731, pvalue=0.5288546328856495)

The p-value of 0.529 is greater than our adjusted $\alpha$ of 0.017, so we do not have statistically significant results and we fail to reject the null hypothesis that there is no difference in the mean accuracy scores between light and dark mode.

# Effect Size

In [36]:
# Cohen's d for paired samples: mean of the differences divided by their
# sample standard deviation.
accuracyDiffSeries = data["accuracy_diff"]
cohen_d_accuracy = accuracyDiffSeries.mean() / accuracyDiffSeries.std()
cohen_d_accuracy

-0.013563853909740315

This means there is essentially no effect on accuracy: the magnitude of Cohen's d for the accuracy is 0.014, which is well below the 0.2 threshold for even a small effect.

# Confidence Intervals

In [39]:
# 95% confidence interval for the mean paired accuracy difference, centred on
# the sample mean with the standard error of the mean as the scale.
# The confidence level is passed positionally: SciPy renamed this parameter
# from `alpha` to `confidence`, and newer releases reject the `alpha=` keyword,
# whereas the positional form works on both old and new versions.
# NOTE(review): this is a normal-approximation interval; a t-based interval
# (scipy.stats.t.interval with df = n - 1) would be slightly wider for a small
# sample — confirm which one the write-up intends.
bounds_accuracy = scipy.stats.norm.interval(0.95, loc=data.accuracy_diff.mean(), scale=scipy.stats.sem(data.accuracy_diff))
bounds_accuracy

(-0.009597517132945267, 0.008907861960531475)

This tells us that we can be 95% confident that the true mean difference in accuracy scores (dark mode minus light mode) lies between -0.0096 and 0.0089 units; that is, about 95% of intervals constructed this way would contain the true mean difference. This lines up with the fact that we didn't reject the null hypothesis (since we didn't get a statistically significant result) because 0 is within this range, meaning there could be no difference in dark and light mode accuracy scores. If there's no difference, then the mean accuracy for dark mode isn't greater than the mean accuracy for light mode.

# Data Visualization

In [54]:
# Bar chart of each participant's light-mode accuracy.
lightAccuracyBars = alt.Chart(data).mark_bar()
lightAccuracyBars.encode(x='ParticipantId', y='LightModeAccuracy')

In [55]:
# Bar chart of each participant's dark-mode accuracy.
darkAccuracyBars = alt.Chart(data).mark_bar()
darkAccuracyBars.encode(x='ParticipantId', y='DarkModeAccuracy')

# Hypothesis 3: Dark Mode will have a higher perceived comfort score

$H_0: \mu_{CD} = \mu_{CL}$

$H_A: \mu_{CD} > \mu_{CL}$

In [40]:
# Shapiro-Wilk normality test on the paired comfort-score differences (dark - light).
comfortDifferences = data["comfort_diff"]
scipy.stats.shapiro(comfortDifferences)

ShapiroResult(statistic=0.9796977639198303, pvalue=0.8304286599159241)

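The normality assumption was met: the Shapiro-Wilk p-value of 0.83 is greater than our Bonferroni-adjusted $\alpha$ of 0.017, so we fail to reject the null hypothesis that the comfort-score differences are normally distributed.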

In [42]:
# Histogram of the paired comfort-score differences: binned x axis, row count on y.
comfortDiffAxis = alt.X("comfort_diff", bin=True, title="Change in perceived comfort score")
comfortDiffHistogram = alt.Chart(data, title="Histogram of change in perceived comfort score").mark_bar()
comfortDiffHistogram.encode(x=comfortDiffAxis, y="count()")

# Descriptive Statistics

In [43]:
# Recompute mean and sample standard deviation of the comfort scores
# from the current (outlier-filtered) frame.
lightModeComfortMean, lightModeComfortSTD = data["OverallLightComfortScore"].mean(), data["OverallLightComfortScore"].std()
darkModeComfortMean, darkModeComfortSTD = data["OverallDarkComfortScore"].mean(), data["OverallDarkComfortScore"].std()

# Report both conditions to two decimal places.
print("Dark: mean = %.2f, SD = %.2f" % (darkModeComfortMean, darkModeComfortSTD))
print("Light: mean = %.2f, SD = %.2f" % (lightModeComfortMean, lightModeComfortSTD))

Dark: mean = 11.97, SD = 4.14
Light: mean = 10.83, SD = 3.61


# One-Tailed Paired Sample T-Test

In [44]:
# Paired-samples t-test; 'greater' makes it one-tailed with the alternative
# that the mean dark-mode comfort score exceeds the mean light-mode score.
scipy.stats.ttest_rel(
    data["OverallDarkComfortScore"],
    data["OverallLightComfortScore"],
    alternative='greater',
)

Ttest_relResult(statistic=0.8853336897019269, pvalue=0.19175846073217295)

The p-value of 0.19 is greater than our adjusted $\alpha$ of 0.017, so we do not have statistically significant results and we fail to reject the null hypothesis that there is no difference in the mean perceived comfort scores between light and dark mode.

# Effect Size

In [45]:
# Cohen's d for paired samples: mean of the differences divided by their
# sample standard deviation.
comfortDiffSeries = data["comfort_diff"]
cohen_d_comfort = comfortDiffSeries.mean() / comfortDiffSeries.std()
cohen_d_comfort

0.1644023389087364

This means there is at most a very small effect on perceived comfort score: the magnitude of Cohen's d for the perceived comfort score is 0.164, which falls just below the 0.2 threshold conventionally used for a small effect.

# Confidence Intervals

In [46]:
# 95% confidence interval for the mean paired comfort-score difference, centred
# on the sample mean with the standard error of the mean as the scale.
# The confidence level is passed positionally: SciPy renamed this parameter
# from `alpha` to `confidence`, and newer releases reject the `alpha=` keyword,
# whereas the positional form works on both old and new versions.
# NOTE(review): this is a normal-approximation interval; a t-based interval
# (scipy.stats.t.interval with df = n - 1) would be slightly wider for a small
# sample — confirm which one the write-up intends.
bounds_comfort = scipy.stats.norm.interval(0.95, loc=data.comfort_diff.mean(), scale=scipy.stats.sem(data.comfort_diff))
bounds_comfort

(-1.3812364505222565, 3.657098519487774)

This tells us that we can be 95% confident that the true mean difference in perceived comfort scores (dark mode minus light mode) lies between -1.38 and 3.66 units; that is, about 95% of intervals constructed this way would contain the true mean difference. This lines up with the fact that we didn't reject the null hypothesis (since we didn't get a statistically significant result) because 0 is within this range, meaning there could be no difference in dark and light mode perceived comfort scores. If there's no difference, then the mean perceived comfort score for dark mode isn't greater than the mean perceived comfort score for light mode.

# Data Visualization

In [56]:
# Bar chart of each participant's overall light-mode comfort score.
lightComfortBars = alt.Chart(data).mark_bar()
lightComfortBars.encode(x='ParticipantId', y='OverallLightComfortScore')

In [57]:
# Bar chart of each participant's overall dark-mode comfort score.
darkComfortBars = alt.Chart(data).mark_bar()
darkComfortBars.encode(x='ParticipantId', y='OverallDarkComfortScore')