In [1]:
"""
More on rpy2 options
https://ipython.org/ipython-doc/2/config/extensions/rmagic.html
"""
import rpy2
%load_ext rpy2.ipython



# 1. DEVORESTAT9 5.E.037.
A particular brand of dishwasher soap is sold in three sizes: 30 oz, 40 oz, and 70 oz. Twenty percent of all purchasers select a 30-oz box, 50% select a 40-oz box, and the remaining 30% choose a 70-oz box. Let X1 and X2 denote the package sizes selected by two independently selected purchasers.


### (a) Determine the sampling distribution of $\bar{X}$.

In [2]:
# Share of purchasers (value) choosing each package size in oz (key),
# as given in the problem statement.
p1_dict = {
    30: .2,
    40: .5,
    70: .3,
}

print("Dict given")
for size_and_share in p1_dict.items():
    # Each item is a (size, share) pair; unpacking prints "size share".
    print(*size_and_share)

Dict given
30 0.2
40 0.5
70 0.3


In [3]:
import itertools
import numpy as np
import functools
import pandas as pd

from typing import Tuple

np.set_printoptions(suppress=True)


COLUMN_NAMES_GIVEN = [
    "Pair", "P(Pair)", "X bar (Sample Mean)", "S^2 (Variance)"]


def create_tables_pd_df_and_np_array(dict_given, combination_size, column_names_given=COLUMN_NAMES_GIVEN) -> Tuple[pd.DataFrame, np.ndarray]:
    """Tabulate every ordered sample of `combination_size` draws from `dict_given`.

    Parameters
    ----------
    dict_given : dict
        Maps each possible value to its probability.
    combination_size : int
        Number of independent draws per sample. The sample variance divides
        by `combination_size - 1`, so a size of 1 yields a division by zero.
    column_names_given : list of str
        Column labels for the returned DataFrame (four columns).

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The same table twice: as a DataFrame and as an object-dtype array.
        Each row is [sample tuple, product of the draw probabilities,
        sample mean, sample variance (n - 1 denominator)].
    """
    list_row = []

    # Use the Cartesian product (not combinations / combinations_with_replacement):
    # the draws are independent, so the same value may repeat, e.g. (X, X), and
    # both orderings (X, Y) and (Y, X) are distinct outcomes that each carry
    # their own probability.
    # Iterate over the dictionary that was passed in, not a notebook global.
    for pair_keys in itertools.product(dict_given, repeat=combination_size):

        key_mean = np.mean(pair_keys)  # Sample mean of the drawn values

        # Independence: the joint probability is the product of the marginals
        probability = functools.reduce(
            lambda x, y: x*y, [dict_given[key] for key in pair_keys])

        # Sample variance with the n - 1 denominator
        variance = np.sum(
            [(key_current - key_mean)**2 for key_current in pair_keys]) / (len(pair_keys) - 1)

        list_row.append([pair_keys, probability, key_mean, variance])

    # dtype=object keeps the sample tuple as a single cell in the first column
    np_array_temp = np.array(list_row, dtype=object)

    pd_df_temp = pd.DataFrame(np_array_temp, columns=column_names_given)

    return pd_df_temp, np_array_temp


# column_index=2 is the mean part of the table
def sum_table_based_on_column(table, column_index=2):
    """Collapse rows that share the same value in `column_index`, summing their probabilities.

    Parameters
    ----------
    table : tuple
        (DataFrame, object ndarray) pair; only the array is read for the
        data, the DataFrame supplies the column labels.
    column_index : int
        Column whose value identifies a group (default 2, the sample mean).

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        One row per distinct group value, in order of first appearance.
        Column 1 holds the summed probability; every other column keeps the
        values of the first row seen for that group.

    The input table is left unmodified.
    """
    pd_df_temp = table[0]  # The pd df

    np_array_temp = table[1]  # The np array

    dict_temp = {}  # Group value -> accumulated row

    for row in np_array_temp:

        group_value = row[column_index]

        if group_value not in dict_temp:
            # Copy: iterating a 2-D array yields views, so accumulating into
            # the row itself would overwrite the caller's table.
            dict_temp[group_value] = row.copy()
            continue

        dict_temp[group_value][1] += row[1]

    # You need to make a list because np.fromiter does not work with dtype=object
    np_array_temp_new = np.array(list(dict_temp.values()), dtype=object)

    pd_df_temp_new = pd.DataFrame(
        np_array_temp_new, columns=pd_df_temp.columns)

    return pd_df_temp_new, np_array_temp_new

In [4]:
# Echo the given distribution, then build and show the table of all ordered
# samples of two purchasers, followed by the table collapsed by sample mean.
print("Dict given (Size, Percent of purchasers)")
for size_oz, share in p1_dict.items():
    print(f"{size_oz}, {share}")
print("\n")

# Full table: one row per ordered pair of package sizes.
p1a_tables = create_tables_pd_df_and_np_array(p1_dict, 2)
p1a_pd_df, p1a_np_array = p1a_tables

print("Pandas DF of dict given (package sizes selected by two independently selected purchasers)", end="\n\n")
print(p1a_pd_df, end="\n\n\n")

# Collapsed table: rows sharing a sample mean are merged, probabilities summed.
p1a_tables_simplified = sum_table_based_on_column(p1a_tables)
p1a_pd_df_simplified, p1a_np_array_simplified = p1a_tables_simplified

print("Pandas DF of dict given but simplified by adding the row's index 1 based on the row's index 2 \n(package sizes selected by two independently selected purchasers)", end="\n\n")
print(p1a_pd_df_simplified, end="\n\n\n")

Dict given (Size, Percent of purchasers)
30, 0.2
40, 0.5
70, 0.3


Pandas DF of dict given (package sizes selected by two independently selected purchasers)

       Pair P(Pair) X bar (Sample Mean) S^2 (Variance)
0  (30, 30)    0.04                  30              0
1  (30, 40)     0.1                  35             50
2  (30, 70)    0.06                  50            800
3  (40, 30)     0.1                  35             50
4  (40, 40)    0.25                  40              0
5  (40, 70)    0.15                  55            450
6  (70, 30)    0.06                  50            800
7  (70, 40)    0.15                  55            450
8  (70, 70)    0.09                  70              0


Pandas DF of dict given but simplified by adding the row's index 1 based on the row's index 2 
(package sizes selected by two independently selected purchasers)

       Pair P(Pair) X bar (Sample Mean) S^2 (Variance)
0  (30, 30)    0.04                  30              0
1  (30, 40)     0.

#### Calculate $E(\bar{X})$ in oz
$E(\bar{X})$ =

In [5]:
# Expected value of the sample mean: sum over rows of P(pair) * X bar.
# Columns are selected by position with .iloc; plain integer keys on a
# label-indexed row (row[1]) rely on a positional fallback that pandas has
# deprecated.
# NOTE(review): this rebinds p1a_np_array, which an earlier cell used for the
# full table array; the name is kept in case later cells depend on it.
p1a_np_array = (p1a_pd_df_simplified.iloc[:, 1]
                * p1a_pd_df_simplified.iloc[:, 2]).to_numpy(dtype=float)
print(np.vstack(p1a_np_array))
print()

p1a_expected_value = np.sum(p1a_np_array)

print(f"E(X) = {p1a_expected_value} oz")

[[ 1.2]
 [ 7. ]
 [ 6. ]
 [10. ]
 [16.5]
 [ 6.3]]

E(X) = 47.0 oz


#### Compare $E(\bar{X})$ to $\mu$. (Expected sample mean to population mean)
<img src="images/5_3_and_5_4_problem_1a_solution.png" alt="5_3_and_5_4_problem_1a_solution.png" style="float:left;">

### (b) Determine the sampling distribution of the sample variance $S^2$.
$E(S^2)$ =

In [6]:
# Re-display the collapsed table from part (a); part (b) reuses its
# P(Pair) and S^2 columns.
print("Recall")
print("Pandas DF of dict given but simplified by adding the row's index 1 based on the row's index 2 \n(package sizes selected by two independently selected purchasers)")
print()
print(p1a_pd_df_simplified)
print()

Recall
Pandas DF of dict given but simplified by adding the row's index 1 based on the row's inedx 2 
(package sizes selected by two independently selected purchasers)

       Pair P(Pair) X bar (Sample Mean) S^2 (Variance)
0  (30, 30)    0.04                  30              0
1  (30, 40)     0.2                  35             50
2  (30, 70)    0.12                  50            800
3  (40, 40)    0.25                  40              0
4  (40, 70)     0.3                  55            450
5  (70, 70)    0.09                  70              0



In [7]:
# Expected value of the sample variance: sum over rows of P(pair) * S^2.
# Columns are selected by position with .iloc; plain integer keys on a
# label-indexed row (row[1]) rely on a positional fallback that pandas has
# deprecated.
p1b_np_array = (p1a_pd_df_simplified.iloc[:, 1]
                * p1a_pd_df_simplified.iloc[:, 3]).to_numpy(dtype=float)
print(np.vstack(p1b_np_array))
print()

p1b_expected_value = np.sum(p1b_np_array)

print(f"E(S^2) = {p1b_expected_value}")

[[  0.]
 [ 10.]
 [ 96.]
 [  0.]
 [135.]
 [  0.]]

E(S^2) = 241.0


#### Compare E(S^2) to σ^2. (Sample variance to Population variance)
<img src="images/5_3_and_5_4_problem_1b_solution.png" alt="5_3_and_5_4_problem_1b_solution.png" style="float:left;">

# Know before Problem 2
<img src="images/5_3_and_5_4_sample_mean_and_sample_total.png" alt="5_3_and_5_4_sample_mean_and_sample_total.png" style="float:left;">

# 2. DEVORESTAT9 5.E.046.

Young's modulus is a quantitative measure of stiffness of an elastic material. Suppose that for aluminum alloy sheets of a particular type, its mean value and standard deviation are 70 GPa and 1.6 GPa, respectively (values given in the article "Influence of Material Properties Variability on Springback and Thinning in Sheet Stamping Processes: A Stochastic Analysis" (Intl. J. of Advanced Manuf. Tech., 2010: 117–134)).


In [8]:
# Population mean and standard deviation of Young's modulus, as given in the
# problem statement.
p2_mean = 70  # GPa
p2_SD = 1.6  # GPa

### (a) If $\bar{X}$ is the sample mean Young's modulus for a random sample of n = 64 sheets, where is the sampling distribution of $\bar{X}$ centered, and what is the standard deviation of the $\bar{X}$ distribution?

In [9]:
# Sample size (number of sheets) for part (a).
p2a_n = 64

#### E(X bar) =

In [10]:
# The sampling distribution of X bar is centered at the population mean, so
# the value is taken directly from p2_mean (it does not depend on n).
p2a_sample_expected_value = p2_mean
print(f"E(X bar) = {p2a_sample_expected_value}")

E(X bar) = 70


#### σ_(X bar)=

In [11]:
# Standard deviation of X bar: population SD divided by sqrt(n).
p2a_sample_SD = p2_SD / p2a_n**(1/2)
print(f"σ_(X bar) = {p2a_sample_SD}")

σ_(X bar) = 0.2


### (b) Answer the questions posed in part (a) for a sample size of n = 256 sheets.

In [12]:
# Sample size (number of sheets) for part (b).
p2b_n = 256

#### E(X bar) =

In [13]:
# Same center as in part (a): the expected value of X bar is the population
# mean regardless of the sample size.
p2b_sample_expected_value = p2_mean
print(f"E(X bar) = {p2b_sample_expected_value}")

E(X bar) = 70


#### σ_(X bar)=

In [14]:
# Standard deviation of X bar for the larger sample: population SD / sqrt(n).
p2b_sample_SD = p2_SD / p2b_n**(1/2)
print(f"σ_(X bar) = {p2b_sample_SD}")

σ_(X bar) = 0.1


### (c) For which of the two random samples, the one of part (a) or the one of part (b), is $\bar{X}$ more likely to be within 1 GPa of 70 GPa? Explain your reasoning.
<img src="images/5_3_and_5_4_problem_2c_solution.png" alt="5_3_and_5_4_problem_2c_solution.png" style="float:left;">

# Know before Problem 3
<img src="images/5_3_and_5_4_sample_mean_and_sample_total.png" alt="5_3_and_5_4_sample_mean_and_sample_total.png" style="float:left;">

# 3. DEVORESTAT9 5.E.049.S.
There are 43 students in an elementary statistics class. On the basis of years of experience, the instructor knows that the time needed to grade a randomly chosen first examination paper is a random variable with an expected value of 5 min and a standard deviation of 4 min. (Round your answers to four decimal places.)


In [15]:
# Values from the problem statement.
p3_students = 43  # number of exam papers to grade
p3_EV = 5  # expected grading time per paper, in minutes
p3_SD = 4  # standard deviation of grading time per paper, in minutes

# Expected total grading time in minutes: number of papers * expected time each.
print("Time approximately will be done")
print(p3_students*p3_EV)

Time approximately will be done
215


### (a) If grading times are independent and the instructor begins grading at 6:50 P.M. and grades continuously, what is the (approximate) probability that he is through grading before the 11:00 P.M. TV news begins?


In [16]:
def time_string_to_time_minutes(string_given):
    """Convert a colon-separated clock string such as "6:50" to total minutes.

    Only the first two colon-separated fields are read: the first is treated
    as hours and the second as minutes. Any further fields are ignored.
    """
    fields = string_given.split(":")
    hours, minutes = int(fields[0]), int(fields[1])
    return hours * 60 + minutes


# Grading window for part (a): start and end clock times and their gap in minutes.
p3a_time_start_string, p3a_time_end_string = "6:50", "11:00"
p3a_time_start = time_string_to_time_minutes(p3a_time_start_string)
p3a_time_end = time_string_to_time_minutes(p3a_time_end_string)
p3a_delta_time = p3a_time_end - p3a_time_start

print(f"Time difference (in minutes) between {p3a_time_start_string} and {p3a_time_end_string} is")
print(p3a_delta_time, end="\n\n")

# Average time per paper that exactly fills the window (250 / 43).
p3a_sample_mean = p3a_delta_time / p3_students
print("Sample mean", p3a_sample_mean, sep="\n", end="\n\n")

# Standard deviation of the sample mean: population SD / sqrt(n) = 4 / sqrt(43).
p3a_sample_SD = p3_SD / p3_students ** 0.5
print("Sample SD", p3a_sample_SD, sep="\n", end="\n\n")

Time difference (in minutes) between 6:50 and 11:00 is
250

Sample mean
5.813953488372093

Sample SD
0.6099942813304187



In [17]:
%%R -i p3a_sample_mean -i p3_EV -i p3a_sample_SD

# pnorm(q, mean, sd) returns the normal cumulative probability P(X <= q).
# All four calls below evaluate the same probability in different forms.

# V1 (numbers copied from the Python output above)
print(pnorm(5.813953488372093, 5, 0.6099942813304187))

# V2 (variables passed in from Python via -i)
print(pnorm(p3a_sample_mean, p3_EV, p3a_sample_SD))

# V3 (standardized first, then the one-argument standard normal form)
print(pnorm((p3a_sample_mean - p3_EV) / p3a_sample_SD))

# Alternative form using the sample total: mean n * mu and SD sqrt(n * sigma^2).
# In R, ^ binds tighter than *, so 43 * 4 ^ 2 is 43 * 16.
print(pnorm((250 - 43 * 5) / sqrt(43 * 4 ^ 2)))

[1] 0.9089575
[1] 0.9089575
[1] 0.9089575
[1] 0.9089575


### (b) If the sports report begins at 11:10, what is the probability that he misses part of the report if he waits until grading is done before turning on the TV?


In [18]:
# Grading window for part (b): same start time as part (a), later end time.
p3b_time_end_string = "11:10"
p3b_time_end = time_string_to_time_minutes(p3b_time_end_string)

print(
    f"Time difference (in minutes) between {p3a_time_start_string} and {p3b_time_end_string} is")
p3b_delta_time = p3b_time_end - p3a_time_start
print(p3b_delta_time)
print()

# Average time per paper that exactly fills the window (260 / 43).
p3b_sample_mean = p3b_delta_time / p3_students
print("Sample mean")
print(p3b_sample_mean)
print()

# Standard deviation of the sample mean: population SD / sqrt(n) = 4 / sqrt(43).
p3b_sample_SD = (p3_SD / (p3_students**(1/2)))
print("Sample SD (Same as above)")
# Print the value computed in this cell, not the part (a) variable.
print(p3b_sample_SD)
print()

Time difference (in minutes) between 6:50 and 11:10 is
260

Sample mean
6.046511627906977

Sample SD (Same as above)
0.6099942813304187



In [19]:
%%R -i p3b_sample_mean -i p3_EV -i p3b_sample_SD

# He misses part of the report when grading runs past the cutoff, so the
# answer is the right tail: 1 - pnorm(...).

# V1 (numbers copied from the Python output above)
print(1 - pnorm(6.046511627906977, 5, 0.6099942813304187))

# V2 (variables passed in from Python via -i)
print(1 - pnorm(p3b_sample_mean, p3_EV, p3b_sample_SD))

# V3 (standardized first, then the one-argument standard normal form)
print(1 - pnorm((p3b_sample_mean - p3_EV) / p3b_sample_SD))

# Alternative form using the sample total: mean n * mu and SD sqrt(n * sigma^2).
print(1 - pnorm((260 - 43 * 5) / sqrt(43 * 4 ^ 2)))

[1] 0.04311682
[1] 0.04311682
[1] 0.04311682
[1] 0.04311682


# Know before Problem 4
<img src="images/5_3_and_5_4_sample_mean_and_sample_total.png" alt="5_3_and_5_4_sample_mean_and_sample_total.png" style="float:left;">

# 4. DEVORESTAT9 5.E.047.S.
Young's modulus is a quantitative measure of stiffness of an elastic material. Suppose that for metal sheets of a particular type, its mean value and standard deviation are 70 GPa and 2.2 GPa, respectively. Suppose the distribution is normal. (Round your answers to four decimal places.)


In [20]:
# Population mean and standard deviation of Young's modulus for problem 4,
# as given in the problem statement.
p4_population_mean = 70  # GPa
p4_population_sd = 2.2  # GPa

### (a) Calculate $P(69 \le \bar{X} \le 71)$ when n = 9.

In [21]:
# Interval bounds for the sample mean (GPa) and the sample size for part (a).
p4a_lower_bound = 69
p4a_upper_bound = 71
p4a_n = 9

In [22]:
# Standard deviation of the sample mean: population SD / sqrt(n) = 2.2 / sqrt(9).
p4a_sample_sd = p4_population_sd / (p4a_n)**(1/2)
print(p4a_sample_sd)

0.7333333333333334


In [23]:
%%R -i  p4a_sample_sd -i p4a_lower_bound -i p4a_upper_bound -i p4_population_mean

# Probability of the interval = CDF at the upper bound minus CDF at the lower bound.

# V1 (numbers copied from the Python cells above)
print(pnorm(71, 70, 0.7333333333333334) - pnorm(69, 70, 0.7333333333333334))

# V2 (variables passed in from Python via -i)
print(pnorm(p4a_upper_bound, p4_population_mean, p4a_sample_sd) -
      pnorm(p4a_lower_bound, p4_population_mean, p4a_sample_sd))

[1] 0.827318
[1] 0.827318


### (b) How likely is it that the sample mean diameter exceeds 71 when n = 25?

In [24]:
# Sample size and the threshold (GPa) the sample mean must exceed in part (b).
p4b_n = 25
p4b_upper_bound = 71

In [25]:
# Standard deviation of the sample mean: population SD / sqrt(n) = 2.2 / sqrt(25).
p4b_sample_sd = p4_population_sd / (p4b_n)**(1/2)

print("Sample SD")
print(p4b_sample_sd)

Sample SD
0.44000000000000006


In [26]:
%%R -i  p4b_sample_sd -i p4b_upper_bound -i p4_population_mean

# "Exceeds 71" is the upper tail: 1 - P(X bar <= 71).

# V1 (numbers copied from the Python cells above)
print(1 - pnorm(71, 70, 0.44000000000000006))

# V2 (variables passed in from Python via -i; uses the part (b) threshold
# rather than borrowing the part (a) upper bound)
print(1 - pnorm(p4b_upper_bound, p4_population_mean, p4b_sample_sd))

[1] 0.01152131
[1] 0.01152131


# Know before Problem 5
<img src="images/5_3_and_5_4_sample_mean_and_sample_total.png" alt="5_3_and_5_4_sample_mean_and_sample_total.png" style="float:left;">

<a href="https://youtu.be/2tuBREK_mgE?t=201"> <h3> Standardization Formula</h3> </a>
<img src="images/simple_learning_pro_standardization_formula.png" alt="simple_learning_pro_standardization_formula.png" style="float:left;">

# 5. DEVORESTAT9 5.E.047.S.
Suppose the sediment density (g/cm³) of a randomly selected specimen from a certain region is normally distributed with mean 2.61 and standard deviation 0.81.


In [27]:
# Population mean and standard deviation of sediment density, as given in the
# problem statement.
p5_population_mean = 2.61
p5_population_sd = 0.81

### (a) If a random sample of 25 specimens is selected, what is the probability that the sample average sediment density is at most 3.00? Between 2.61 and 3.00? (Round your answers to four decimal places.)

In [28]:
# Sample size and the threshold for the sample average in part (a).
p5a_sample_size = 25
p5a_avg_sediment_density = 3

# Bounds for the "between 2.61 and 3.00" question: the upper bound is the
# threshold above, the lower bound is the population mean.
# NOTE(review): the R cells visible below pass p5a_avg_sediment_density and
# p5_population_mean directly rather than these two names.
p5a_upper_bound = p5a_avg_sediment_density
p5a_lower_bound = p5_population_mean

In [29]:
# Standard deviation of the sample mean: population SD / sqrt(n).
p5a_sample_sd = p5_population_sd / (p5a_sample_size)**(1/2)

print("Sample SD")
print(p5a_sample_sd)

Sample SD
0.162


#### at most 3.00 

In [30]:
%%R -i p5a_avg_sediment_density -i p5_population_mean -i p5a_sample_sd

# "At most 3.00" is the cumulative probability P(X bar <= 3), i.e. pnorm directly.

# V1 (numbers, with the standard deviation of the sample mean written out as sigma / sqrt(n))
print(pnorm(3, 2.61, .81 / sqrt(25)))

# V2 (numbers, with that standard deviation already evaluated)
print(pnorm(3, 2.61, 0.162))

# V3 (variables passed in from Python via -i)
print(pnorm(p5a_avg_sediment_density, p5_population_mean, p5a_sample_sd))

[1] 0.9919669
[1] 0.9919669
[1] 0.9919669


#### between 2.61 and 3.00

In [31]:
%%R -i p5a_avg_sediment_density -i p5_population_mean -i p5a_sample_sd

# Probability of the interval = CDF at the upper bound minus CDF at the lower bound.
# The lower bound is the mean itself, so the subtracted term is 0.5.

# V1 (numbers, with the standard deviation of the sample mean written out as sigma / sqrt(n))
print(pnorm(3, 2.61, .81/sqrt(25)) - pnorm(2.61, 2.61, .81/sqrt(25)))

# V2 (numbers, with that standard deviation already evaluated)
print(pnorm(3, 2.61, 0.162) - pnorm(2.61, 2.61, 0.162))

# V3 (variables passed in from Python via -i)
print(pnorm(p5a_avg_sediment_density, p5_population_mean, p5a_sample_sd) -
      pnorm(p5_population_mean, p5_population_mean, p5a_sample_sd))

[1] 0.4919669
[1] 0.4919669
[1] 0.4919669


### (b) How large a sample size would be required to ensure that the first probability in part (a) is at least 0.99? (Round your answer up to the nearest whole number.)


In [32]:
# Minimum probability required in part (b).
p5b_probability = .99

In [33]:
%%R -i p5b_probability

# Reminder: qnorm is the inverse of pnorm. qnorm(p) returns the z-score whose
# cumulative probability is p, and pnorm of that z-score returns p again.
# NOTE(review): the printed labels hard-code .99 and 2.326348; they only match
# the computed values while p5b_probability is 0.99.

qnorm_result <- qnorm(p5b_probability)
cat("qnorm(.99) =", qnorm_result)
cat("\n")

pnorm_result <- pnorm(qnorm_result)
cat("pnorm(2.326348) =", pnorm_result)

qnorm(.99) = 2.326348
pnorm(2.326348) = 0.99

<p>
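You want to solve for n, so standardize the sample mean and set its z-score equal to the required quantile: <br> qnorm(probability) = (observation - population_mean) / (population_sd / sqrt(n)), which rearranges to n = ((population_sd * qnorm(probability)) / (observation - population_mean))^2 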
</p>

In [34]:
%%R -i p5_population_sd -i p5b_probability -i p5a_avg_sediment_density -i p5_population_mean

# Smallest n with P(X bar <= x) >= p:
#   (x - mu) / (sigma / sqrt(n)) >= qnorm(p)
#   n >= (sigma * qnorm(p) / (x - mu))^2

# V1 (numbers)
print(((.81 * qnorm(.99)) / (3-2.61)) ^ 2)

# V2 (variables passed in from Python via -i)
n_required <- ((p5_population_sd * qnorm(p5b_probability)) /
               (p5a_avg_sediment_density-p5_population_mean)) ^ 2
print(n_required)

# The question asks for the answer rounded up to the nearest whole number:
# a sample size must be an integer, and rounding down would fall short of p.
print(ceiling(n_required))

[1] 23.3448
[1] 23.3448
