In [None]:
#PART_1

In [62]:
#1.	A company wants to test whether the average time taken to assemble a product has significantly decreased after implementing a new training program. 
#Prior to the training, the average assembly time was 35 minutes with a known population standard deviation of 5 minutes. 
#After the training, a random sample of 40 employees showed a mean assembly time of 33 minutes. 
#Can you help the company to decide whether the new training program is necessary?

In [64]:
import scipy.stats as t
# Inputs for the one-sample z-test (all times in minutes)
alpha = 0.05  # one-tailed significance level: only a decrease in assembly time is of interest
sample_size = 40  # number of employees sampled after the training
sample_mean_after = 33  # mean assembly time observed in the sample
population_mean_before = 35  # mean assembly time before the training
std_dev = 5  # known population standard deviation

In [66]:
# 1. Distributional assumption behind the test

# The variable name keeps its existing spelling because other cells may read it.
intial_assumption = "Population is normally distributed."
print(intial_assumption)

Population is normally distributed.


In [68]:
# 2. Null and alternative hypotheses (left-tailed test)

hypotheses = {}
# H0: the mean assembly time after training is still 35 minutes
hypotheses["null hypothesis"] = "H0: mu = 35"
# H1: the mean assembly time after training is below 35 minutes
hypotheses["alternative hypothesis"] = "H1: mu < 35"
print(hypotheses)

{'null hypothesis': 'H0: mu = 35', 'alternative hypothesis': 'H1: mu < 35'}


In [70]:
# 3. Test statistic

# z = (sample mean - hypothesised mean) / standard error of the mean,
# where the standard error uses the known population standard deviation.
standard_error = std_dev / (sample_size**0.5)
z_test_statistic = (sample_mean_after - population_mean_before) / standard_error
print(f"Test statistic: {z_test_statistic:}")

Test statistic: -2.5298221281347035


In [72]:
# 4. Critical value and decision rule

# critical_value is the (1 - alpha) quantile of the standard normal; for a
# left-tailed test the rejection boundary is its negation.
critical_value = t.norm.ppf(1 - alpha)
rejection_boundary = -critical_value
decision_rule = f"Reject H0 if Z < {rejection_boundary}"
print(f"Critical value: {critical_value:}")
print(f"Decision rule: {decision_rule}")

Critical value: 1.6448536269514722
Decision rule: Reject H0 if Z < -1.6448536269514722


In [74]:
# 5. p-value

# Left-tailed test: the p-value is the standard-normal probability of a value
# at or below the observed z statistic.
p_value = t.norm.cdf(z_test_statistic)
print(f"p-value: {p_value:}")

p-value: 0.005706018193000826


In [76]:
# 6. Draw a conclusion based on the significance level

# The verdict is derived from the computed p-value instead of being hard-coded,
# so it stays correct if the input data or alpha are changed.
# For the given data: z is about -2.53 (below the boundary of -1.645) and the
# p-value is about 0.0057 (below alpha = 0.05), so the null hypothesis is rejected.
if p_value < alpha:
    print("Reject the null hypothesis")
    print("The new training program has significantly decreased the average assembly time.")
else:
    print("Fail to reject the null hypothesis")
    print("There is not enough evidence that the new training program decreased the average assembly time.")

Reject the null hypothesis
The new training program has significantly decreased the average assembly time.


In [79]:
#2.	A university administrator wants to test whether graduate students at the institution study, on average, more than 25 hours per week. 
#To explore this, a random sample of 15 graduate students was surveyed, and the sample mean study time was found to be 27 hours per week, 
#with a sample standard deviation of 4.5 hours.

In [81]:
import scipy.stats as stats

# Inputs for the one-sample t-test (all times in hours per week)
alpha = 0.05          # assumed one-tailed significance level: only an increase in study time is of interest
sample_size = 15      # number of graduate students surveyed
sample_mean = 27      # mean study time observed in the sample
sample_std = 4.5      # sample standard deviation
population_mean = 25  # hypothesised mean study time

In [83]:
# 1. Initial assumption of population distribution

# Print the variable defined in this cell. The previous version printed
# `intial_assumption` (a differently spelled name from an earlier cell), which
# showed the wrong text and failed with NameError on a fresh kernel unless that
# earlier cell had been run.
initial_assumption = "The population is normally distributed."
print(initial_assumption)

Population is normally distributed.


In [85]:
# 2. Null and alternative hypotheses (right-tailed test)

hypotheses = {}
# H0: graduate students study 25 hours per week on average
hypotheses["null hypothesis"] = "H0: mu = 25"
# H1: graduate students study more than 25 hours per week on average
hypotheses["alternative hypothesis"] = "H1: mu > 25"
print(hypotheses)

{'null hypothesis': 'H0: mu = 25', 'alternative hypothesis': 'H1: mu > 25'}


In [87]:
# 3. Test statistic

# The sample is small (n = 15) and the population standard deviation is unknown,
# so a t-test is used: t = (sample mean - hypothesised mean) / estimated standard error.
standard_error = sample_std / (sample_size**0.5)
t_statistic = (sample_mean - population_mean) / standard_error
print(f"Test statistic (t): {t_statistic}")


Test statistic (t): 1.7213259316477407


In [89]:
# 4. Critical value and decision rule

# Right-tailed test: the critical value is the (1 - alpha) quantile of the
# t-distribution with n - 1 degrees of freedom.
df = sample_size - 1
upper_tail_quantile = 1 - alpha
critical_value = stats.t.ppf(upper_tail_quantile, df)
print(f"Critical value: {critical_value}")
print(f"Decision rule: Reject H0 if t > {critical_value}")

Critical value: 1.7613101357748562
Decision rule: Reject H0 if t > 1.7613101357748562


In [91]:
# 5. p-value

# Right-tailed test: the p-value is the t-distribution probability of a value
# above the observed statistic, i.e. one minus the CDF at t_statistic.
cdf_at_statistic = stats.t.cdf(t_statistic, df)
p_value = 1 - cdf_at_statistic
print(f"p-value: {p_value}")

p-value: 0.05360191367469436


In [93]:
# 6. Draw a conclusion based on the significance level

# The verdict is derived from the computed p-value instead of being hard-coded,
# so it stays correct if the input data or alpha are changed.
# For the given data: t is about 1.72 (slightly below the critical value of 1.761)
# and the p-value is about 0.054 (above alpha = 0.05), so we fail to reject H0.
if p_value < alpha:
    print("Reject the null hypothesis.")
    print("There is sufficient evidence to conclude that graduate students study, on average, more than 25 hours per week.")
else:
    print("Fail to reject the null hypothesis.")
    print("There is not enough evidence to conclude that graduate students study, on average, more than 25 hours per week.")

Fail to reject the null hypothesis.
There is not enough evidence to conclude that graduate students study, on average, more than 25 hours per week.


In [96]:
#PART_2

In [98]:
#Regression

In [100]:
#a.	Fit a simple linear regression model to the data, where the dependent variable (Y) is the exam score, and the independent variable (X) 
#is the hours of study.

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Observed data: one (hours studied, exam score) pair per student.
hours_of_study = [5, 5, 7, 3, 0, 5, 2, 7, 4, 2, 1, 6, 8, 5, 7, 8, 8, 8, 0, 8, 7, 8, 8, 0, 4, 6, 2, 7, 5, 4]
exam_scores = [52.1221, 52.1221, 72.1221, 32.1221, 2.122104, 52.1221, 22.1221, 72.1221, 42.1221, 22.1221,
               12.1221, 62.1221, 82.1221, 52.1221, 72.1221, 72.1221, 82.1221, 82.1221, 2.122104, 82.1221,
               72.1221, 82.1221, 82.1221, 2.122104, 42.1221, 62.1221, 22.1221, 82.1221, 52.1221, 42.1221]
data = pd.DataFrame({'Hours of Study': hours_of_study, 'Score': exam_scores})

# Predictor X (hours of study) as a single-column 2-D array; response y (score) as a 1-D array.
X = data['Hours of Study'].to_numpy().reshape(-1, 1)
y = data['Score'].to_numpy()

# Ordinary least-squares fit of Score on Hours of Study.
model = LinearRegression().fit(X, y)

intercept, slope = model.intercept_, model.coef_[0]
r_squared = model.score(X, y)  # coefficient of determination on the training data

report_lines = [
    "Simple Linear Regression Model:",
    f"Equation: Score = {intercept} + {slope} * Hours of Study",
    f"Intercept (β₀): {intercept}",
    f"Slope (β₁): {slope}",
    f"R-squared: {r_squared}",
]
for line in report_lines:
    print(line)

Simple Linear Regression Model:
Equation: Score = 2.3601970666666787 + 9.952380666666667 * Hours of Study
Intercept (β₀): 2.3601970666666787
Slope (β₁): 9.952380666666667
R-squared: 0.9904988659423376


In [102]:
#b.	What are the assumptions of simple linear regression, and do you think they hold for this dataset?

#Linearity: The relationship between the independent variable (hours of study) and the dependent variable (exam score) should be linear. 
#Independence of Errors: Observations should be independent, meaning the residuals (errors) are not correlated. 
#Homoscedasticity: The variance of errors should be constant across all levels of the independent variable (no pattern in residual variance). 
#Normality of Errors: The residuals should be approximately normally distributed. 
#No Multicollinearity: This applies to multiple linear regression but is irrelevant here, as we only have one independent variable.

In [104]:
#c.	How would outliers in the data affect the regression model?

#Outliers can skew the regression line, affecting the slope and intercept, increasing residual variance, and potentially violating 
#assumptions like homoscedasticity and normality, which reduces the model’s accuracy and reliability.

In [106]:
#d.	If you need to verify the model what will you need and what will you do?

#Residual plots to check assumptions of linearity, homoscedasticity, and normality of residuals.
#Q-Q plot to further assess the normality of residuals.
#Leverage plots to identify influential points or outliers.
#Cross-validation data: A separate dataset to test the model's predictive performance.

In [109]:
#PART3

In [111]:
#a.	Please formulate this problem as a linear programming problem in standard form. 
#b.	The problem is bounded by machine X or Y? If you have a budget to upgrade the machine, will you upgrade machine X or Y? 
    #Please explain your answer by exploring the math model. 


In [113]:
import pulp

# Production-planning LP: choose how many units of products A, B and C to make
# so that total output is maximised without exceeding the hours available on
# machines X and Y.

# Available machine hours (right-hand sides of the two capacity constraints).
MACHINE_X_HOURS = 100
MACHINE_Y_HOURS = 85
# Tolerance for deciding that a constraint is binding or that two duals are equal.
TOLERANCE = 1e-6

# Create the LP problem.
# PuLP does not permit spaces in a problem name (it warns and substitutes "_"),
# so the name is written with an underscore.
prob = pulp.LpProblem("Maximize_Production", pulp.LpMaximize)

# Define decision variables
# These represent the number of units to produce for each product
x1 = pulp.LpVariable("Product_A", lowBound=0)  # Product A
x2 = pulp.LpVariable("Product_B", lowBound=0)  # Product B
x3 = pulp.LpVariable("Product_C", lowBound=0)  # Product C

# Define the objective function
# We want to maximize the total production (sum of all products)
prob += x1 + x2 + x3, "Total Production"

# Define constraints
# Machine X constraint: 2 hours for A, 1 hour for B, 3 hours for C
prob += 2*x1 + x2 + 3*x3 <= MACHINE_X_HOURS, "Machine_X_Constraint"
# Machine Y constraint: 4 hours for A, 3 hours for B, 2 hours for C
prob += 4*x1 + 3*x2 + 2*x3 <= MACHINE_Y_HOURS, "Machine_Y_Constraint"

# Solve the problem
prob.solve()

# Print the results
print("Status:", pulp.LpStatus[prob.status])
print("\nOptimal Production:")
print("Product A:", x1.varValue)
print("Product B:", x2.varValue)
print("Product C:", x3.varValue)
print("\nTotal Production:", pulp.value(prob.objective))

# Check which machine is limiting production: hours consumed at the optimum
machine_x_usage = 2*x1.varValue + x2.varValue + 3*x3.varValue
machine_y_usage = 4*x1.varValue + 3*x2.varValue + 2*x3.varValue

print("\nMachine Usage:")
print(f"Machine X: {machine_x_usage:.2f} / {MACHINE_X_HOURS} hours")
print(f"Machine Y: {machine_y_usage:.2f} / {MACHINE_Y_HOURS} hours")

# A constraint is binding when the hours used equal the hours available
machine_x_binding = abs(machine_x_usage - MACHINE_X_HOURS) < TOLERANCE
machine_y_binding = abs(machine_y_usage - MACHINE_Y_HOURS) < TOLERANCE

# Determine which machine(s) are fully utilized
if machine_x_binding and machine_y_binding:
    print("\nBoth machines are fully utilized and limiting production.")
elif machine_x_binding:
    print("\nMachine X is fully utilized and limiting production.")
    print("Upgrading Machine X would be more beneficial.")
elif machine_y_binding:
    print("\nMachine Y is fully utilized and limiting production.")
    print("Upgrading Machine Y would be more beneficial.")
else:
    print("\nNeither machine is fully utilized. No upgrade is necessary.")

# Calculate shadow prices
# Shadow prices indicate how much the objective value would improve
# for a unit increase in the right-hand side of the constraint
machine_x_constraint = prob.constraints["Machine_X_Constraint"]
machine_y_constraint = prob.constraints["Machine_Y_Constraint"]
shadow_price_x = machine_x_constraint.pi
shadow_price_y = machine_y_constraint.pi

print("\nShadow Prices:")
print(f"Machine X: {shadow_price_x:.4f}")
print(f"Machine Y: {shadow_price_y:.4f}")

print("\nInterpretation:")
# Equal shadow prices are reported as a tie instead of defaulting to Machine Y
if abs(shadow_price_x - shadow_price_y) < TOLERANCE:
    print("Both machines have the same shadow price; upgrading either would yield the same increase in total production.")
elif shadow_price_x > shadow_price_y:
    print("Upgrading Machine X would yield a higher increase in total production.")
else:
    print("Upgrading Machine Y would yield a higher increase in total production.")


Status: Optimal

Optimal Production:
Product A: 0.0
Product B: 7.8571429
Product C: 30.714286

Total Production: 38.5714289

Machine Usage:
Machine X: 100.00 / 100 hours
Machine Y: 85.00 / 85 hours

Both machines are fully utilized and limiting production.

Shadow Prices:
Machine X: 0.1429
Machine Y: 0.2857

Interpretation:
Upgrading Machine Y would yield a higher increase in total production.




In [None]:
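#Both machine constraints are binding at the optimum (Machine X: 100/100 hours, Machine Y: 85/85 hours), so both limit production.
#Upgrading Machine Y is prioritized because it has the higher shadow price (0.2857 vs. 0.1429 for Machine X):
#each additional hour on Machine Y increases total production by about twice as much as an additional hour on Machine X,
#so addressing this constraint yields the larger immediate benefit in production capacity.
#Upgrading Machine X may be beneficial in the future, especially if production plans diversify or increase significantly.
#Thus, the focus should be on Machine Y to maximize production capabilities effectively.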



