# Date - 20.03.2019

# ANOVA - one way analysis of variance

In [3]:
import numpy as np
import pandas as pd
import os 
import sklearn

In [5]:
from pydataset import data

import scipy.stats as stats

from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [6]:
#Load the 'chickwts' dataset (one row per chick: weight and feed type) and preview the first rows

df = data('chickwts')
df.head()

Unnamed: 0,weight,feed
1,179,horsebean
2,160,horsebean
3,136,horsebean
4,227,horsebean
5,217,horsebean


In [8]:
#Number of observations in each feed group

observations_per_feed = df.groupby('feed').size()
observations_per_feed.reset_index()

Unnamed: 0,feed,0
0,casein,12
1,horsebean,10
2,linseed,12
3,meatmeal,11
4,soybean,14
5,sunflower,12


In [9]:
#Mean chick weight within each feed group (feed kept as a regular column)

df.groupby('feed', as_index=False)['weight'].mean()

Unnamed: 0,feed,weight
0,casein,323.583333
1,horsebean,160.2
2,linseed,218.75
3,meatmeal,276.909091
4,soybean,246.428571
5,sunflower,328.916667


In [10]:
#H0 : The feed has no impact on the weight of chickens
#H1 : The feed does impact the weight of chickens
#Alpha : 0.05

#Weight sample for one feed category: rows where feed is 'casein', 'weight' column only

casein = df.loc[df['feed'] == 'casein', 'weight']
casein

60    368
61    390
62    379
63    260
64    404
65    318
66    352
67    359
68    216
69    222
70    283
71    332
Name: weight, dtype: int64

In [11]:
#Weights of the chicks fed 'horsebean'
horsebean = df.loc[df['feed'] == 'horsebean', 'weight']
horsebean

1     179
2     160
3     136
4     227
5     217
6     168
7     108
8     124
9     143
10    140
Name: weight, dtype: int64

In [12]:
#Weights of the chicks fed 'linseed'
linseed = df.loc[df['feed'] == 'linseed', 'weight']
linseed

11    309
12    229
13    181
14    141
15    260
16    203
17    148
18    169
19    213
20    257
21    244
22    271
Name: weight, dtype: int64

In [13]:
#Weights of the chicks fed 'meatmeal'
meatmeal = df.loc[df['feed'] == 'meatmeal', 'weight']
meatmeal

49    325
50    257
51    303
52    315
53    380
54    153
55    263
56    242
57    206
58    344
59    258
Name: weight, dtype: int64

In [16]:
#Weights of the chicks fed 'soybean'
soybean = df.loc[df['feed'] == 'soybean', 'weight']
soybean

23    243
24    230
25    248
26    327
27    329
28    250
29    193
30    271
31    316
32    267
33    199
34    171
35    158
36    248
Name: weight, dtype: int64

In [15]:
#Weights of the chicks fed 'sunflower'
sunflower = df.loc[df['feed'] == 'sunflower', 'weight']
sunflower

37    423
38    340
39    392
40    339
41    341
42    226
43    320
44    295
45    334
46    322
47    297
48    318
Name: weight, dtype: int64

In [17]:
#One-way ANOVA (F-test) comparing mean weight across the six feed groups

feed_groups = [casein, horsebean, linseed, meatmeal, soybean, sunflower]
stats.f_oneway(*feed_groups)

F_onewayResult(statistic=15.364799774712534, pvalue=5.936419853471331e-10)

In [19]:
#The ANOVA rejected H0, so run Tukey's HSD test to see which pairs of feeds differ

HSD = pairwise_tukeyhsd(endog=df['weight'], groups=df['feed'], alpha=0.05)
HSD.summary()

group1,group2,meandiff,lower,upper,reject
casein,horsebean,-163.3833,-232.3445,-94.4222,True
casein,linseed,-104.8333,-170.5852,-39.0814,True
casein,meatmeal,-46.6742,-113.9039,20.5554,False
casein,soybean,-77.1548,-140.5149,-13.7947,True
casein,sunflower,5.3333,-60.4186,71.0852,False
horsebean,linseed,58.55,-10.4112,127.5112,False
horsebean,meatmeal,116.7091,46.3375,187.0806,True
horsebean,soybean,86.2286,19.544,152.9132,True
horsebean,sunflower,168.7167,99.7555,237.6778,True
linseed,meatmeal,58.1591,-9.0705,125.3887,False


In [20]:
#Folder that holds the input files for this notebook.
#NOTE(review): absolute, machine-specific path - point it at your own data folder before running elsewhere.
path = r"C:\Users\Administrator\Documents"

#Read the file through an explicit path instead of os.chdir(): changing the kernel's
#working directory is hidden global state that silently affects every later cell.
#(The previous os.listdir(path) call was dropped - its result was never used or displayed.)
tyre = pd.read_csv(os.path.join(path, 'tyre.csv'))

# TYRES

In [22]:
#Preview the first rows of the tyre data
tyre.head()

Unnamed: 0,Brands,Mileage
0,Apollo,32.998
1,Apollo,36.435
2,Apollo,32.777
3,Apollo,37.637
4,Apollo,36.304


In [23]:
#Number of observations recorded for each brand
observations_per_brand = tyre.groupby("Brands").size()
observations_per_brand.reset_index()

Unnamed: 0,Brands,0
0,Apollo,15
1,Bridgestone,15
2,CEAT,15
3,Falken,15


In [35]:
#Mean mileage per brand (brand kept as a regular column)
tyre_data2 = tyre.groupby("Brands", as_index=False)["Mileage"].mean()
tyre_data2

Unnamed: 0,Brands,Mileage
0,Apollo,34.799133
1,Bridgestone,31.780133
2,CEAT,34.761207
3,Falken,37.624667


In [None]:
# H0 : Brand does not affect the mileage
# H1 : Brand does affect the mileage
# Alpha : 0.05

In [36]:
#Mileage samples, one Series per tyre brand
Apollo = tyre.loc[tyre["Brands"] == "Apollo", "Mileage"]
Bridgestone = tyre.loc[tyre["Brands"] == "Bridgestone", "Mileage"]
CEAT = tyre.loc[tyre["Brands"] == "CEAT", "Mileage"]
Falken = tyre.loc[tyre["Brands"] == "Falken", "Mileage"]

In [37]:
#One-way ANOVA (F-test) comparing mean mileage across the four brands

brand_groups = [Apollo, Bridgestone, CEAT, Falken]
stats.f_oneway(*brand_groups)

F_onewayResult(statistic=17.941513342446925, pvalue=2.78098919789432e-08)

As the p-value is less than 0.05, we reject the null hypothesis in favour of the alternative hypothesis.

Next, to find out which brands differ from each other (and hence which one performs best), we need the Tukey HSD test.

In [38]:
#Tukey's HSD: pairwise comparison of mean mileage between brands at alpha = 0.05
HSD1 = pairwise_tukeyhsd(endog=tyre["Mileage"], groups=tyre["Brands"], alpha=0.05)
HSD1.summary()

group1,group2,meandiff,lower,upper,reject
Apollo,Bridgestone,-3.019,-5.1289,-0.9091,True
Apollo,CEAT,-0.0379,-2.1478,2.072,False
Apollo,Falken,2.8255,0.7156,4.9354,True
Bridgestone,CEAT,2.9811,0.8712,5.091,True
Bridgestone,Falken,5.8445,3.7346,7.9544,True
CEAT,Falken,2.8635,0.7536,4.9734,True


# Two Way ANOVA

Two way Anova - without replication

H0 - neither state nor exam type has an impact on marks
H1 - state or exam type does have an impact on marks
alpha = 0.05

In [24]:
from statsmodels.stats.anova import AnovaRM

In [31]:
#Load the students' marks workbook.
#NOTE(review): absolute, machine-specific path - this cell only runs where the file exists at this location.
marks = pd.read_excel('C:\\Users\\Administrator\\Documents\\Students Marks.xlsx')

In [32]:
#Preview the first rows of the marks data
marks.head()

Unnamed: 0,State,Exam,Marks
0,student1,IA,96
1,student1,FA,36
2,student2,IA,100
3,student2,FA,48
4,student3,IA,15


In [33]:
#Repeated-measures ANOVA on Marks: 'State' is passed as the subject column
#(it holds the student identifiers) and 'Exam' is the within-subject factor

model = AnovaRM(data=marks, depvar='Marks', subject='State', within=['Exam'])
fit = model.fit()

#In the output below Pr > F for Exam is 0.0830, above alpha = 0.05:
#we fail to reject H0, i.e. no significant effect of exam type on marks
fit.summary()

0,1,2,3,4
,Num DF,Den DF,F Value,Pr > F
Exam,1.0000,9.0000,3.8012,0.0830


# The p-value is greater than the significance level --> we fail to reject the null hypothesis



In [34]:
#Same model with the roles swapped: 'Exam' is passed as the subject column and
#'State' as the within factor, so the summary reports the F-test for State
model = AnovaRM(data=marks, depvar='Marks', subject='Exam', within=['State'])
fit = model.fit()

#In the output below Pr > F for State is 0.3852, above alpha = 0.05: we fail to reject H0 for State
fit.summary()

0,1,2,3,4
,Num DF,Den DF,F Value,Pr > F
State,9.0000,9.0000,1.2216,0.3852
