In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from scipy import stats

### Q1

In [2]:
# Q1 data: read the cutlet measurements for the two units, then print the
# column / dtype / non-null summary to confirm there are no missing values.
df = pd.read_csv(filepath_or_buffer="Cutlets.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Unit A  35 non-null     float64
 1   Unit B  35 non-null     float64
dtypes: float64(2)
memory usage: 688.0 bytes


In [3]:
# Preview the first five rows of the Unit A / Unit B measurements.
df.head(n=5)

Unnamed: 0,Unit A,Unit B
0,6.809,6.7703
1,6.4376,7.5093
2,6.9157,6.73
3,7.3012,6.7878
4,7.4488,7.1522


### 2 Sample t-test

In [4]:
# Null hypothesis H0: There is no difference in mean diameter between the 2 units.
# Alternate hypothesis Ha: There is some difference in mean diameter between the 2 units.

# Independent two-sample, two-sided t-test on the Unit A and Unit B columns.
stats.ttest_ind(
    df["Unit A"],
    df["Unit B"],
    alternative="two-sided",
)

Ttest_indResult(statistic=0.7228688704678063, pvalue=0.4722394724599501)

In [5]:
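# p-value = 0.4722 is greater than alpha = 0.05, so we fail to reject H0.
# There is no statistically significant difference in diameter between Unit A and Unit B.
# (The p-value is not the probability that the diameters are the same.)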

### Q2

In [6]:
# Q2 data: read the per-laboratory turn around time samples, then print the
# column / dtype / non-null summary.
df_lb = pd.read_csv(filepath_or_buffer="LabTAT.csv")
df_lb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Laboratory 1  120 non-null    float64
 1   Laboratory 2  120 non-null    float64
 2   Laboratory 3  120 non-null    float64
 3   Laboratory 4  120 non-null    float64
dtypes: float64(4)
memory usage: 3.9 KB


In [7]:
# Preview the first five rows of the four laboratory columns.
df_lb.head(n=5)

Unnamed: 0,Laboratory 1,Laboratory 2,Laboratory 3,Laboratory 4
0,185.35,165.53,176.7,166.13
1,170.49,185.91,198.45,160.79
2,192.77,194.92,201.23,185.18
3,177.33,183.0,199.61,176.42
4,193.41,169.57,204.63,152.6


### ANOVA

In [8]:
# Null hypothesis H0: There is no difference in average TAT(turn around time) between 4 labs.
# Alternate hypothesis Ha: There is some difference between the average TAT(turn around time) of 4 labs.

In [9]:
# One-way ANOVA: each laboratory column is passed as a separate sample group.
stats.f_oneway(
    *(
        df_lb[lab]
        for lab in ("Laboratory 1", "Laboratory 2", "Laboratory 3", "Laboratory 4")
    )
)

F_onewayResult(statistic=118.70421654401437, pvalue=2.1156708949992414e-57)

In [10]:
# p-value = 2.1157e-57 is far below alpha = 0.05, so we reject H0.
# At least one laboratory's average TAT (turn around time) differs from the others.

### Q3

In [11]:
# Q3 data: read the observed male / female buyer counts per region, then print
# the column / dtype / non-null summary.
df_br = pd.read_csv(filepath_or_buffer="BuyerRatio.csv")
df_br.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Observed Values  2 non-null      object
 1   East             2 non-null      int64 
 2   West             2 non-null      int64 
 3   North            2 non-null      int64 
 4   South            2 non-null      int64 
dtypes: int64(4), object(1)
memory usage: 208.0+ bytes


In [12]:
# Display the whole frame: it has only two rows (Males, Females), one column per region.
df_br

Unnamed: 0,Observed Values,East,West,North,South
0,Males,50,142,131,70
1,Females,435,1523,1356,750


### Chi-Square test

In [13]:
# Null hypothesis H0: The male-to-female buyer proportions are the same across all regions.
# Alternate hypothesis Ha: The male-to-female buyer proportions differ in at least one region.

In [14]:
# Keep only the numeric region columns (East, West, North, South) as the
# observed contingency table. The label column is dropped by name rather than
# with the positional slice 1:6, whose end bound ran past the frame's five
# columns and only worked because iloc silently clips out-of-range slices.
dft = df_br.drop(columns="Observed Values")
dft

Unnamed: 0,East,West,North,South
0,50,142,131,70
1,435,1523,1356,750


In [15]:
# Chi-square test on the region-by-gender table; unpack the test statistic,
# p-value, degrees of freedom and expected frequencies.
stat, p, dof, expected = stats.chi2_contingency(observed=dft)

In [16]:
# Report the chi-square statistic, p-value, degrees of freedom and expected counts from the buyer-ratio test.
print(f"stat = {stat}, p = {p}, dof = {dof}, expected = {expected}")

stat = 1.595945538661058, p = 0.6603094907091882, dof = 3, expected = [[  42.76531299  146.81287862  131.11756787   72.30424052]
 [ 442.23468701 1518.18712138 1355.88243213  747.69575948]]


In [17]:
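# Since p-value (0.66) > alpha (0.05), we fail to reject the null hypothesis:
# there is no significant evidence that the male-female buyer proportions differ across regions.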

### Q4

In [18]:
# Q4 data: read the order-form status ("Error Free" / "Defective") recorded for
# each centre, then print the column / dtype / non-null summary.
dtc = pd.read_csv(filepath_or_buffer="Costomer+OrderForm.csv")
dtc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Phillippines  300 non-null    object
 1   Indonesia     300 non-null    object
 2   Malta         300 non-null    object
 3   India         300 non-null    object
dtypes: object(4)
memory usage: 9.5+ KB


In [19]:
# Preview the first five order-form records for each centre.
dtc.head(n=5)

Unnamed: 0,Phillippines,Indonesia,Malta,India
0,Error Free,Error Free,Defective,Error Free
1,Error Free,Error Free,Error Free,Defective
2,Error Free,Defective,Defective,Error Free
3,Error Free,Error Free,Error Free,Error Free
4,Error Free,Error Free,Defective,Error Free


In [20]:
# Getting values for contingency table.
#
# value_counts() orders its result by frequency, so unpacking it positionally
# only yields (error free, defective) while "Error Free" happens to be the more
# common label. Each count is looked up by its label instead, so a centre with
# more defective than error-free forms is not silently swapped, and a centre
# missing one of the labels gets a count of 0 instead of an unpacking error.

def _status_counts(column):
    """Return (error-free count, defective count) for one centre column of `dtc`."""
    counts = dtc[column].value_counts()
    return int(counts.get("Error Free", 0)), int(counts.get("Defective", 0))


e1, d1 = _status_counts("Phillippines")
e2, d2 = _status_counts("Indonesia")
e3, d3 = _status_counts("Malta")
e4, d4 = _status_counts("India")

In [21]:
# Preparing contingency table: the first row holds the e* counts (error free),
# the second row the d* counts (defective), with one column per centre in the
# order Phillippines, Indonesia, Malta, India.
data = np.array([e1, e2, e3, e4, d1, d2, d3, d4]).reshape(2, 4)
data

array([[271, 267, 269, 280],
       [ 29,  33,  31,  20]])

### Chi-Square

In [22]:
# Null hypothesis H0: Defective % is independent of the center's location.
# Alternate hypothesis Ha: Defective % depends on the center's location.

# Chi-square test of independence on the 2 x 4 table of counts built above.
stat_1, p_1, dof_1, expected_1 = stats.chi2_contingency(observed=data)

In [23]:
# Report the chi-square statistic, p-value, degrees of freedom and expected counts from the defective-rate test.
print(f"stat = {stat_1}, p = {p_1}, dof = {dof_1}, expected = {expected_1}")

stat = 3.858960685820355, p = 0.2771020991233135, dof = 3, expected = [[271.75 271.75 271.75 271.75]
 [ 28.25  28.25  28.25  28.25]]


In [24]:
# p-value (0.277) > alpha (0.05), so we fail to reject H0.
# There is no significant evidence that the defective % depends on the center's location.