In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from IPython.display import Markdown
from plotly import graph_objects as go
from scipy import stats

# Global seaborn/matplotlib styling: dark grid, 1.5x font scale, 7x5 inch figures.
sns.set_theme(style='darkgrid', font_scale = 1.5,
              rc={'figure.figsize':(7,5)})

# Seed the shared generator explicitly so that every draw made through `rng`
# is identical after Restart Kernel -> Run All. Change SEED to get a different
# (but still repeatable) stream.
SEED = 42
rng = np.random.default_rng(SEED)

In [None]:
# Read the census table and append a boolean `Barbie` column that is True
# on rows whose `movie` value equals "Barbie".
census = pd.read_csv("movie_census.csv").assign(
    Barbie=lambda frame: frame["movie"] == "Barbie"
)
census

In [None]:
# Mean of the `Barbie` column over every row of the census table.
actual_barbie = census.loc[:, "Barbie"].mean()
actual_barbie

In [None]:
# Keep rows with 18 <= age <= 22 (both ends inclusive), then draw 10 of them
# without replacement. The draw goes through the notebook's shared `rng`
# instead of the hidden global random state, so all randomness in the
# notebook is controlled from one generator.
undergrads = (
    census[census["age"].between(18, 22)]
    .sample(n=10, replace=False, random_state=rng)
)
undergrads["Barbie"].mean()

In [None]:
# Row count of the `undergrads` sample.
undergrads.shape[0]

In [None]:
# Size of the `undergrads` sample as a percentage of the census row count.
print("Percent of Berkeley:", (undergrads.shape[0] / census.shape[0]) * 100)

In [None]:
# Keep rows with age >= 65 and draw 100 of them (without replacement, the
# pandas default). The draw goes through the notebook's shared `rng` instead
# of the hidden global random state, so all randomness in the notebook is
# controlled from one generator.
elderly = census.loc[census["age"] >= 65].sample(n=100, random_state=rng)
elderly["Barbie"].mean()

In [None]:
# Row count of the `elderly` sample.
elderly.shape[0]

In [None]:
# Size of the `elderly` sample as a percentage of the census row count.
print("Percent of Berkeley:", (elderly.shape[0] / census.shape[0]) * 100)

In [None]:
# Average every numeric column within each (age, wears_birkenstocks) group,
# then turn the group keys back into ordinary columns.
votes_by_barbie = (
    census
    .groupby(["age", "wears_birkenstocks"])
    .mean(numeric_only=True)
    .reset_index()
)

votes_by_barbie

In [None]:
# One point per row of `votes_by_barbie`: age on x, the group's mean `Barbie`
# value on y, coloured by `wears_birkenstocks`.
# (`px` is imported in the notebook's top import cell.)
px.scatter(
    votes_by_barbie,
    x="age",
    y="Barbie",
    color="wears_birkenstocks",
    title="Preferences by Demographics",
)

In [None]:
# Simple random sample of n census rows, drawn without replacement through
# the notebook's shared `rng` rather than the hidden global random state.
n = 2000
random_sample = census.sample(n=n, replace=False, random_state=rng)

# Sample estimate of the Barbie fraction.
random_sample["Barbie"].mean()

In [None]:
# Show the census-wide Barbie mean again, for comparison with the sample
# estimate displayed by the previous cell.
actual_barbie

In [None]:
# Simple random sample of n census rows, drawn without replacement through
# the notebook's shared `rng`.
n = 800
random_sample = census.sample(n=n, replace=False, random_state=rng)

sample_barbie = random_sample["Barbie"].mean()
# Relative error of the sample estimate with respect to the census value.
# (The original referenced a misspelled name here and raised NameError.)
err = abs(sample_barbie - actual_barbie) / actual_barbie

# `Markdown` is imported in the notebook's top import cell.
Markdown(f"**Actual** = {actual_barbie:.4f}, **Sample** = {sample_barbie:.4f}, "
        f"**Err** = {100*err:.2f}%.")

In [None]:
# Repeat the size-n poll nrep times and record the Barbie fraction of each
# sample. Every draw advances the shared `rng`, so repetitions differ from
# one another while all randomness stays on the notebook's single generator.
nrep = 1000
n = 800
poll_result = []
for _ in range(nrep):
    random_sample = census.sample(n=n, replace=False, random_state=rng)
    poll_result.append(random_sample["Barbie"].mean())

In [None]:
# Density-normalised histogram of the simulated poll results, with a dashed
# orange line at the census value. (`px`, `go` and `stats` are imported in
# the notebook's top import cell.)
fig = px.histogram(poll_result, histnorm='probability density', nbins=50)
fig.add_vline(x=actual_barbie, line_width=3, line_dash="dash", line_color="orange")
fig.update_layout(showlegend=False)

# Overlay a Gaussian kernel density estimate, evaluated on 100 evenly spaced
# points between the smallest and largest simulated result.
x = np.linspace(min(poll_result), max(poll_result), 100)
poll_kde = stats.gaussian_kde(poll_result)
fig.add_trace(
    go.Scatter(
        x=x,
        y=poll_kde(x),
        mode='lines',
        line=dict(color='red', width=3),
    )
)

In [None]:
# Seaborn version of the same picture: density histogram with a KDE curve,
# plus a dashed orange line at the census value drawn on the same Axes.
hist_ax = sns.histplot(poll_result, stat='density', kde=True)
hist_ax.axvline(actual_barbie, color='orange', linestyle='dashed', linewidth=2)

In [None]:
poll_result = pd.Series(poll_result)
# Fraction of simulated polls whose Barbie share exceeds 0.5. Taking the mean
# of the boolean mask divides by the actual number of results, so the answer
# stays correct if the number of repetitions is changed.
(poll_result > 0.5).mean()

In [None]:
# One multinomial draw: 100 trials split across three categories with
# probabilities 0.6 / 0.3 / 0.1. Uses the notebook's shared `rng` generator
# rather than NumPy's legacy global random state.
rng.multinomial(100, [0.60, 0.3, 0.1])

In [None]:
# Twenty independent multinomial draws (one per output row), each with 100
# trials and category probabilities 0.6 / 0.3 / 0.1. Uses the notebook's
# shared `rng` generator rather than NumPy's legacy global random state.
rng.multinomial(100, [0.60, 0.30, 0.10], size=20)