In [11]:
import random

def roll_the_dice(n_simulations = 1000):
    '''Two unbiased, six sided, dice are thrown one time and the sum of the
    faces is observed (so if you rolled a 3 and a 1, the sum is 4). A
    simulation estimates probability that the total score is an even number
    or a number greater than 7.  This is an estimated probability, based on
    rolling the two dice n_simulations times.

    FOR FULL POINTS, THE FUNCTION MUST PERFORM A SIMULATION.

    Parameters
    ----------
    n_simulations: float
        Number of rolls of the dice

    Returns
    -------
    float
    '''
    
    for i in range(n_simulations):
           
        die1 = random.randint(1,6+1)
        die2 = random.randint(1,6+1)
        die_sum = die1 + die2
    
        count = 0
        if die_sum%2 == 0 or die_sum > 7:
            count += 1
        
        probability = float(count / n_simulations)
    
    return probability

In [12]:
roll_the_dice(1000)

0.001

In [14]:
import scipy.stats as stats

def calculate_t_test(sample1, sample2, type_I_error_rate):
    '''Evaluates whether the two samples come from a population with the same
    population mean.  Returns a tuple containing the p-value for
    the pair of samples and True or False depending if the p-value is
    considered significant at the provided Type I Error Rate (i.e. false
    positive rate, i.e. alpha).

    You may use imports for this question.

    Parameters
    ----------
    sample1, sample2: NumPy array, NumPy array
    type_I_error_rate: float

    Returns
    -------
    float, boolean
    '''
    p_val = stats.ttest_ind(sample1, sample2)
    
    if p_val < type_I_error_rate:
        significant = True
    
    return p_val, significant

In [51]:
import numpy as np

def add_column(arr, col):
    '''Return a numpy array containing arr with col added as a final column.
    You can assume that the number of rows in arr is the same as the length
    of col.

    FOR FULL POINTS DON'T USE LOOPS OR LIST COMPREHENSIONS.

    Parameters
    ----------
    arr: NumPy Array (2-dimensional)
    col: NumPy array

    Returns
    -------
    NumPy Array (2-dimensional)

    >>>add_column(np.array([[1, 2], [3, 4]]), np.array([5, 6)))
    np.array([[1, 2, 5], [3, 4, 6]])
    '''
    new_arr = np.concatenate((arr, col.T), axis=1)
    return new_arr

In [53]:
#add_column(np.array([[1,2],[3,4]]), np.array([[5,6]]))

In [58]:
def only_positive(arr):
    '''Return a numpy array containing only the rows from arr where all
    the values in that row are positive.

    FOR FULL POINTS USE NUMPY METHODS AND NO LOOPS OR LIST COMPREHENSIONS.

    Parameters
    ----------
    arr: NumPy Array (2-dimensional)

    Returns
    -------
    NumPy Array (2-dimensional)

    >>>np.array([[1, -1, 2],
                    [3, 4, 2],
                    [-8, 4, -4]])
    np.array([[3, 4, 2]])
    '''
    
    pos_arr = arr[np.min(arr, 1) > 0]
    
    return pos_arr

In [59]:
# only_positive([[1, -1, 2],
#                     [3, 4, 2],
#                     [-8, 4, -4]])

In [61]:
def df_to_numpy(df, y_column):
    '''Convert the column defined y_column into a NumPy array (y) and
    convert the rest of df into a 2 dimensional NumPy array (X).
    Returns (X, y).

    Parameters
    ----------
    df: Pandas DataFrame
    y_column: string

    Returns
    -------
    NumPy array, NumPy array

    Example
    -------
    >>> df = pd.DataFrame([[1,3,5],[2,4,6]], columns = ['a','b','c'])
    >>> df_to_numpy(df, 'c')
    np.array([[1, 3], [2, 4]]), np.array([5, 6])
    '''
    
    arr1 = df.drop(y_column, axis=1)
    arr2 = df[y_column]

    return arr1.values, arr2.values
    

In [63]:
# df_to_numpy([[1,3,5],[2,4,6]], 'c')

In [15]:
def pandas_query(df):
    '''Returns a DataFrame containing the average size of each university
    type ordered by average size in ascending order.

    Function assumes the input DataFrame contains these columns:
        name, address, Website, Type, Size

    Parameters
    ----------
    df: Pandas DataFrame

    Returns
    -------
    Pandas DataFrame
    '''
    
    df = groupby('Type')['Size'].mean().sort_values(by='Size')
    
    return df

A coin is biased at 0.6 in favor of heads. What is the probability of flipping 8 or more heads in 10 flips of this coin?

In [10]:
from scipy.stats import binom

x = binom.pmf(8,10,0.6) + binom.pmf(9,10,0.6) + binom.pmf(10,10,0.6)
x

0.16728975359999998

What is P(x=T | y=b)?

In [7]:
P = 0.1 / (0.1 + 0.15) 
P

0.4

A particular hockey team has a long-standing record of 540 wins, 60 ties and 400 losses.

In 65% of the games they won, they had a lead at the start of the second period.
In 45% of the games they tied, they had a lead at the start of the second period.
In 39% of the games they lost, they held a lead at the start of the second period.
a) Knowing that they now have a lead at the start of the second period, what is the probability that they will win this game?

After taking an intro probability class (and dozing off during part of it), the coach has come to the conclusion that a hockey game is essentially a bernoulli trial, with an outcome governed by a parameter p. Since p can be measured from the outcome of previous games, he has decided it doesn't matter what he does, so he benched his regular starters and is giving his less-experienced players more time on the ice.

b) What error has the coach made?

P(Win|Lead) = (P(Lead|Win) * P(Win)) / P(Lead) 
            = (0.65 * (540/(540+60+400)) / P(Lead) 
            = (0.65 * 0.54) / P(Lead)
            = 0.692

P(Lead) = P(Lead|Win) * P(Win) + P(Lead|Tie) * P(Tie) + P(Lead|Lose) * P(Lose) 
        = (0.6 * 0.54) + (0.45 * (60/1000)) + (0.39 * (400/1000)) 
        = 0.507
        
The coach made the mistake of assuming that the probability would not change with different players.

The total number of log-ins to a website for 20 different randomly selected users from 2018 are listed below (which you can treat as a sample from the population of all users of the website in 2018):

[10, 25, 12, 35, 14, 18, 16, 15, 22, 10, 9, 11, 49, 20, 15, 9, 18, 19, 20, 20]

a) What is the sample mean?

b) What is the sample variance? Note: This is not the same as the population variance.

a) 18.35

b) 91.50

The website owner from the previous question would like to know, with 95% confidence, an interval that contains the mean number of log-ins of all users of the website in 2019. Without actually calculating the confidence interval, name and briefly explain two techniques you'd use to calculate it.

You could use either the Central Limit Theorem or Bootstrapping. 

Central Limit Theorem - you would get the samples mean, number of samples and variance and using a significance level you could calculate the confidence interval.

Bootstrapping - you would draw with replacement from a sample and from each you would find the mean. Using those distributions you could find the confidence interval.