In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw experiment export. The location lives in a named constant so it
# is easy to spot and repoint on another machine.
EXPERIMENT_CSV = "~/Desktop/thesis/sim/experiment/data/Adams_experiment.csv"
df = pd.read_csv(EXPERIMENT_CSV)

In [None]:
# Raw data: preview of the first 24 rows, before any filtering

df.head(n=24)

In [None]:
## FILTERING

# Keep only participants in the gains condition (cond == 1) and only the
# columns that the rest of the notebook works with.
analysis_columns = ["subject", "word", "s2_value", "in.cs", "order"]
in_gains_condition = df["cond"].eq(1)
data = df.loc[in_gains_condition, analysis_columns]

In [None]:
## ENRICHING

In [None]:
# create num_eval column: number of rows per subject that are flagged in "in.cs"
evaluated_rows = data[data["in.cs"] == True]
num_eval = evaluated_rows.groupby("subject")["word"].count().rename("num_eval")

# could also create a num_eval column using the max of order
# (the column is selected before aggregating so max() only ever sees "order")
num_eval_using_max_order = data.groupby("subject")["order"].max()

# Put both counts on the same subject index before comparing: `==` between two
# Series with different labels raises ValueError, which would happen as soon as
# one subject has no flagged row at all. Subjects missing on either side
# compare as unequal and are therefore excluded below.
aligned_max_order = num_eval_using_max_order.reindex(num_eval.index)
counts_agree = num_eval == aligned_max_order
print(counts_agree.sum())

# where these are different, exclude that data: only agreeing subjects are kept
# in num_eval, and the (inner) merge drops every other subject from data
num_eval = num_eval[counts_agree]
data = data.merge(num_eval, right_on="subject", left_on="subject")


In [None]:
# Keep only the evaluated entries, i.e. the rows whose "in.cs" flag is True
was_evaluated = data["in.cs"]
data = data[was_evaluated]

In [None]:
# Best option evaluated so far: running maximum of s2_value within each
# subject, taken in evaluation order. The result is assigned back by index,
# so the row order of `data` itself does not change.
ordered = data.sort_values(["subject", "order"])
data["highest_value_so_far"] = ordered.groupby("subject")["s2_value"].cummax()

In [None]:
# True for every evaluation except the subject's last one
# (the last one is the row where order equals num_eval)
data["did_continue_eval"] = data["order"] != data["num_eval"]

In [None]:
# Exclude subjects that only went through the months in calendar order
# (JANUARY evaluated first, FEBRUARY second, ...).
# This step is currently disabled: the code sits inside a string literal, so
# none of it executes. Remove the surrounding triple quotes to apply the
# exclusion; leave it as is if you want to look at all subjects.
# NOTE(review): after the left merge, subjects without any matching month word
# would get NaN in "is_in_order"; `~` on such a column may not behave as a
# boolean negation - confirm before enabling.
"""
annual_order = ["JANUARY", "FEBRUARY", "MARCH", "APRIL", "MAY", "JUNE", "JULY", "AUGUST", "SEPTEMBER", "OCTOBER", "NOVEMBER", "DECEMBER"]
annual_order_df = pd.DataFrame.from_dict({"word": annual_order})
annual_order_df["annual_order"] = annual_order_df.index + 1

with_order = data.merge(annual_order_df)[["subject", "word", "order", "annual_order"]]
with_order["is_in_order"] = with_order["order"] == with_order["annual_order"]
subjects_stuck_to_annual_order = with_order.groupby("subject")[["is_in_order"]].all().reset_index()
data = data.merge(subjects_stuck_to_annual_order, how="left")
data = data[~data["is_in_order"]].sort_values(["subject", "order"])
"""

In [None]:
# Add column indicating the rank of the word considered (based on its s2_value).
# Rank 1 is the word with the highest mean s2_value.

word_values = data.groupby("word")["s2_value"].mean()

# After sorting and reset_index() the positional index is 0, 1, 2, ... in
# descending order of mean value, so index + 1 is the rank.
word_rank = (
    word_values.sort_values(ascending=False)
    .reset_index()
    .drop(columns="s2_value")
    .assign(rank=lambda ranked: ranked.index + 1)
)

# The merge already yields a column called "rank"; no renaming is needed
# (there is no "index" column to rename).
data = data.merge(word_rank).sort_values("subject")

In [None]:
# Keep evaluations up to and including the 12th; higher order numbers are dropped
within_first_twelve = data["order"] <= 12
data = data[within_first_twelve]

In [None]:
# Filtered and enriched data: preview of the first 24 rows

data.head(n=24)

In [None]:
# Summary stats: mean of the best value seen so far, split by whether the
# subject went on to evaluate another option
by_continuation = data.groupby("did_continue_eval")
by_continuation["highest_value_so_far"].mean()

In [None]:
# How many months have which value?
# Distribution of all options (words) over their mean s2_value,
# e.g. "2 options with value < 5".

word_values = data.groupby("word")["s2_value"].mean()
lowest_value = min(word_values)
highest_value = max(word_values)

# 10 equally spaced edges between the smallest and largest mean value
bins = np.linspace(lowest_value, highest_value, 10)

plt.hist(word_values, bins=bins)
plt.title("Distribution across 12 months")
plt.show()

In [None]:
# For a specific slice in evaluation (fixed number
# of options already considered), how does the value
# of the next option compare for those who
# stopped after evaluating this option vs those
# who continued?

# Count subjects directly. Averaging every column per subject just to count the
# groups would also try to average the string "word" column, which raises a
# TypeError on pandas >= 2.0.
print(f"total number of subjects {data['subject'].nunique()}")

# The bin edges depend only on the overall range of the per-word mean values,
# not on the evaluation number, so they are computed once for all plots.
word_values = data.groupby("word")["s2_value"].mean()
minimum = min(word_values)
maximum = max(word_values)
bins = np.linspace(minimum, maximum, 13)

for order_filter in range(1, 6):
    filtered_data = data[data["order"] == order_filter]

    continued = filtered_data["did_continue_eval"]
    non_last_eval = filtered_data.loc[continued, "s2_value"]
    last_eval = filtered_data.loc[~continued, "s2_value"]

    print(f"Number of data points: {len(filtered_data.index)}")

    # Each observation is weighted by 1 / group size, so the bars show shares of
    # the group rather than raw counts and the two groups can be compared
    # even when their sizes differ.
    plt.hist(non_last_eval, bins, alpha=0.5, label='continued evaluating', weights=np.ones(len(non_last_eval)) / len(non_last_eval))
    plt.hist(last_eval, bins, alpha=0.5, label='stopped', weights=np.ones(len(last_eval)) / len(last_eval))
    plt.legend(loc='upper left')
    plt.xlabel("Value of the action")
    plt.ylabel("Percentage of situations")
    plt.title(f"At evaluation number {order_filter}")
    plt.show()

In [None]:
# Now do the same thing for rank

# One unit-wide bin per rank: edges 1, 2, ..., number_of_words + 1.
# This does not depend on the evaluation number, so it is built once.
word_rank = data.groupby("word")["rank"].mean()
bins = np.linspace(1, len(word_rank) + 1, len(word_rank) + 1)

for order_filter in range(1, 6):
    filtered_data = data[data["order"] == order_filter]

    continued = filtered_data["did_continue_eval"]
    non_last_eval = filtered_data.loc[continued, "rank"]
    last_eval = filtered_data.loc[~continued, "rank"]

    print(f"Number of data points: {len(filtered_data.index)}")

    # Each observation is weighted by 1 / group size, so the bars show shares of
    # the group rather than raw counts.
    plt.hist(non_last_eval, bins, alpha=0.5, label='continued evaluating', weights=np.ones(len(non_last_eval)) / len(non_last_eval))
    plt.hist(last_eval, bins, alpha=0.5, label='stopped', weights=np.ones(len(last_eval)) / len(last_eval))
    plt.legend(loc='upper left')
    plt.xlabel("Rank of option considered")
    plt.ylabel("Percentage of situations")
    plt.title(f"At evaluation number {order_filter}")
    plt.show()

In [None]:
# Across all options evaluated in different orders,
# what was the distribution? sorted by rank

# Number of evaluations per rank
rank_and_count = data.groupby("rank")["word"].count().rename("count").reset_index()

# Word-to-rank lookup. Only the "rank" column is aggregated, so the mean is
# never attempted on any other (possibly non-numeric) column.
word_and_rank = data.groupby("word")["rank"].mean().reset_index()

# Joined on "rank"; bars are labelled with the word and ordered by its rank
to_plot = rank_and_count.merge(word_and_rank).sort_values("rank")

fig = plt.figure(figsize=(12, 6))
plt.bar(to_plot["word"], height=to_plot["count"])
plt.title("Frequency of the rank of different options evaluated")
plt.xlabel("Rank")
plt.ylabel("Frequency")
plt.show()

In [None]:
# For a specific number of options already evaluated,
# what was the distribution? sorted by rank

# NOTE(review): range(1, 12) covers evaluation numbers 1..11, while the data
# keeps order values up to 12 - confirm whether the 12th should be plotted too.
for order in range(1, 12):

    # Rows for this evaluation number, selected once and reused below
    at_this_order = data[data["order"] == order]

    # Number of evaluations per rank at this evaluation number
    rank_and_count = at_this_order.groupby("rank")["word"].count().rename("count").reset_index()

    # Word-to-rank lookup. Only the "rank" column is aggregated, so the mean is
    # never attempted on any other (possibly non-numeric) column.
    word_and_rank = at_this_order.groupby("word")["rank"].mean().reset_index()

    # Joined on "rank"; bars are labelled with the word and ordered by its rank
    to_plot = rank_and_count.merge(word_and_rank).sort_values("rank")

    fig = plt.figure(figsize=(12, 6))
    plt.bar(to_plot["word"], height=to_plot["count"])
    plt.title(f"Frequency of the rank of different options evaluated for the {order} option evaluated")
    plt.xlabel("Rank")
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# It looks like a lot of people are still going through the months in order.
# Weren't those supposed to be sorted out?
# Check: list the evaluations of every subject who evaluated FEBRUARY second.

is_february_second = (data["order"] == 2) & (data["word"] == "FEBRUARY")
february_second_subjects = data.loc[is_february_second, ["subject"]]
february_second_subjects.merge(data).sort_values(["subject", "order"]).head(24)

# They were sorted out, but many subjects who did not stick to the order
# perfectly still followed it partly, see below.

In [None]:
# Each line is the average for all subjects with the same total number
# of options evaluated. The y axis specifies the average s2 values for
# each number of options already evaluated. The lines are each one
# longer than the next because those with 6 options evaluated have
# one more datapoint than those that only evaluated 5 in total.

to_plot = data.groupby(["order", "num_eval"]).agg({"s2_value": "mean"}).reset_index()

# Keep only rows that have an order. notna() is required here: a `!=` comparison
# against NaN is True for every row (NaN is unequal to everything, itself
# included), and the `np.NaN` alias no longer exists in NumPy 2.0.
to_plot = to_plot[to_plot["order"].notna()]

sns.relplot(data=to_plot, x="order", y="s2_value", kind="line", hue="num_eval", height=8, aspect=11/8)
print("Value of options over the course of deliberation, grouped by number of options evaluated")

# It looks like we do have quite a considerable bump at the end of each line,
# which is strong support for the dynamic theory: the overall downward-sloping
# (flat in this case) trend is superseded by the fact that subjects stopped
# because they had found something good.

# I attribute the almost across-the-board dip at the second option evaluated
# to February being frequently evaluated second (see previous charts).

In [None]:
# Same line plot as for s2_value, but showing the mean rank of the option
# at each evaluation number, one line per total number of options evaluated

mean_rank_by_position = data.groupby(["order", "num_eval"])["rank"].mean()
to_plot = mean_rank_by_position.reset_index()
sns.relplot(data=to_plot, x="order", y="rank", kind="line", hue="num_eval", height=8, aspect=11/8)
print("Rank of options over the course of deliberation, grouped by number of options evaluated")

In [None]:
# Quantify how much subjects gain in value with the last option they evaluate,
# compared with the option evaluated just before it

rank_or_s2value = "s2_value"

# Mean value per (evaluation number, total number of options evaluated)
to_plot = data.groupby(["order", "num_eval"])[rank_or_s2value].mean().reset_index()

# Rows where the evaluation number equals the total are the final evaluations
is_final = to_plot["order"] == to_plot["num_eval"]
final_option = to_plot[is_final].drop(columns=["order"]).rename(columns={rank_or_s2value: "final"})

# Attach each group's final value to all of its rows, then keep the
# second-to-last evaluation of every group
to_plot_with_final = to_plot.merge(final_option, how="left")
is_second_to_last = to_plot_with_final["order"] == to_plot_with_final["num_eval"] - 1
second_to_last_option = to_plot_with_final[is_second_to_last]

final_value = second_to_last_option["final"]
previous_value = second_to_last_option[rank_or_s2value]

# Relative change from the second-to-last to the final option, in percent
percentage_increase = second_to_last_option.assign(percentage_increase=(final_value / previous_value - 1) * 100)
plt.bar(x=percentage_increase["num_eval"], height=percentage_increase["percentage_increase"])
plt.xlabel("Total number of options evaluated")
plt.ylabel(f"Percentage difference in {rank_or_s2value}")
plt.show()

# Absolute change; the column keeps the name "percentage_increase" although it
# holds a plain difference here
difference = second_to_last_option.assign(percentage_increase=final_value - previous_value)
plt.bar(x=difference["num_eval"], height=difference["percentage_increase"])
plt.xlabel("Total number of options evaluated")
plt.ylabel(f"Absolute difference in {rank_or_s2value}")
plt.show()

In [None]:
# Same comparison of the final option against the second-to-last one, but for
# rank instead of s2_value

rank_or_s2value = "rank"

# Mean rank per (evaluation number, total number of options evaluated)
to_plot = data.groupby(["order", "num_eval"])[rank_or_s2value].mean().reset_index()

# Rows where the evaluation number equals the total are the final evaluations
is_final = to_plot["order"] == to_plot["num_eval"]
final_option = to_plot[is_final].drop(columns=["order"]).rename(columns={rank_or_s2value: "final"})

# Attach each group's final rank to all of its rows, then keep the
# second-to-last evaluation of every group
to_plot_with_final = to_plot.merge(final_option, how="left")
is_second_to_last = to_plot_with_final["order"] == to_plot_with_final["num_eval"] - 1
second_to_last_option = to_plot_with_final[is_second_to_last]

final_value = second_to_last_option["final"]
previous_value = second_to_last_option[rank_or_s2value]

# Relative change from the second-to-last to the final option, in percent
percentage_increase = second_to_last_option.assign(percentage_increase=(final_value / previous_value - 1) * 100)
plt.bar(x=percentage_increase["num_eval"], height=percentage_increase["percentage_increase"])
plt.xlabel("Total number of options evaluated")
plt.ylabel(f"Percentage difference in {rank_or_s2value}")
plt.show()

# Absolute change; the column keeps the name "percentage_increase" although it
# holds a plain difference here
difference = second_to_last_option.assign(percentage_increase=final_value - previous_value)
plt.bar(x=difference["num_eval"], height=difference["percentage_increase"])
plt.xlabel("Total number of options evaluated")
plt.ylabel(f"Absolute difference in {rank_or_s2value}")
plt.show()