# Tests of hypothesis

In [1]:
import scipy.stats as stats
import pandas as pd
import numpy as np

## Chi-squared test
Hypothesis: There is a same probability of fatal accident on main road and third level road.

In [2]:
# load data
df = pd.read_pickle('accidents.pkl.gz')
df

Unnamed: 0,p1,p36,p37,p2a,weekday(p2a),p2b,p6,p7,p8,p9,...,l,n,o,p,q,r,s,t,p5a,region
0,002100160001,4,-1,2016-01-01,5,55,1,1,0,2,...,,711403,,Souhlasnýsesměremúseku,Pomalý,554782,451622,GN_V0.1UIR-ADR_410,1,PHA
1,002100160002,4,-1,2016-01-01,5,130,1,3,0,2,...,,,,,,-1,-1,,1,PHA
2,002100160003,5,-1,2016-01-01,5,100,1,2,0,2,...,,,,,,-1,-1,,1,PHA
3,002100160004,6,-1,2016-01-01,5,120,9,0,0,2,...,,,,,,-1,-1,,1,PHA
4,002100160005,6,-1,2016-01-01,5,2560,2,0,0,2,...,,,,,,-1,-1,,1,PHA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572929,190906210457,6,-1,2021-08-30,1,506,2,0,0,2,...,,2611387,,,Pomalý,560359,-1,GN_V0.1UIR-ADR_410,1,KVK
572930,190906210458,6,-1,2021-08-30,1,752,2,0,0,2,...,,2541503,,Opačnýkesměruúseku,Pomalý,560383,550426,GN_V0.1UIR-ADR_410,1,KVK
572931,190906210459,3,2099,2021-08-30,1,1505,1,3,0,2,...,,,,,,-1,-1,,1,KVK
572932,190906210460,3,2099,2021-08-31,2,1245,0,0,0,1,...,2099,199764,,Opačnýkesměruúseku,Pomalý,560286,553654,GN_V0.1UIR-ADR_410,1,KVK


Let's filter out accidents on the roads that we want to test.

In [3]:
# filter out types of road
df2 = df.copy(deep=True)
df2 = df2.loc[(df2['p36'] == 1) | (df2['p36'] == 3)]
df2

Unnamed: 0,p1,p36,p37,p2a,weekday(p2a),p2b,p6,p7,p8,p9,...,l,n,o,p,q,r,s,t,p5a,region
7594,002100167619,1,-1,2016-05-05,4,1135,1,2,0,2,...,R1,1397508,,Souhlasnýsesměremúseku,Pomalý,554782,709450,GN_V0.1UIR-ADR_410,1,PHA
10530,003100160575,1,102,2016-06-19,0,1905,1,2,0,2,...,102,312370,,Souhlasnýsesměremúseku,Pomalý,554782,481548,GN_V0.1UIR-ADR_410,1,PHA
10532,003100160577,1,0,2016-06-19,0,1915,3,0,4,2,...,R1,293620,,Souhlasnýsesměremúseku,Pomalý,554782,709450,GN_V0.1UIR-ADR_410,1,PHA
13362,003100163418,1,0,2016-08-08,1,1355,1,2,0,2,...,,2374326,,Souhlasnýsesměremúseku,Pomalý,554782,506001,GN_V0.1UIR-ADR_410,1,PHA
21478,004100161576,1,-1,2016-12-09,5,1640,2,0,0,2,...,,,,,,-1,-1,GN_V0.1UIR-ADR_410,1,PHA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572914,190906210442,3,21036,2021-08-23,1,1238,3,0,9,1,...,21036,2245808,,Opačnýkesměruúseku,Pomalý,560341,-1,GN_V0.1UIR-ADR_410,2,KVK
572922,190906210450,3,21810,2021-08-26,4,1704,1,1,0,2,...,21810,3343415,,Souhlasnýsesměremúseku,Pomalý,560677,-1,GN_V0.1UIR-ADR_410,1,KVK
572931,190906210459,3,2099,2021-08-30,1,1505,1,3,0,2,...,,,,,,-1,-1,,1,KVK
572932,190906210460,3,2099,2021-08-31,2,1245,0,0,0,1,...,2099,199764,,Opačnýkesměruúseku,Pomalý,560286,553654,GN_V0.1UIR-ADR_410,1,KVK


Since we don't need the number of fatal injuries, let's create new column named 'fatal' that stores boolean value
whether the accident has at least one fatal injury or not.

In [4]:
df2['fatal'] = df2['p13a'] > 0
df2


Unnamed: 0,p1,p36,p37,p2a,weekday(p2a),p2b,p6,p7,p8,p9,...,n,o,p,q,r,s,t,p5a,region,fatal
7594,002100167619,1,-1,2016-05-05,4,1135,1,2,0,2,...,1397508,,Souhlasnýsesměremúseku,Pomalý,554782,709450,GN_V0.1UIR-ADR_410,1,PHA,False
10530,003100160575,1,102,2016-06-19,0,1905,1,2,0,2,...,312370,,Souhlasnýsesměremúseku,Pomalý,554782,481548,GN_V0.1UIR-ADR_410,1,PHA,False
10532,003100160577,1,0,2016-06-19,0,1915,3,0,4,2,...,293620,,Souhlasnýsesměremúseku,Pomalý,554782,709450,GN_V0.1UIR-ADR_410,1,PHA,False
13362,003100163418,1,0,2016-08-08,1,1355,1,2,0,2,...,2374326,,Souhlasnýsesměremúseku,Pomalý,554782,506001,GN_V0.1UIR-ADR_410,1,PHA,False
21478,004100161576,1,-1,2016-12-09,5,1640,2,0,0,2,...,,,,,-1,-1,GN_V0.1UIR-ADR_410,1,PHA,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572914,190906210442,3,21036,2021-08-23,1,1238,3,0,9,1,...,2245808,,Opačnýkesměruúseku,Pomalý,560341,-1,GN_V0.1UIR-ADR_410,2,KVK,False
572922,190906210450,3,21810,2021-08-26,4,1704,1,1,0,2,...,3343415,,Souhlasnýsesměremúseku,Pomalý,560677,-1,GN_V0.1UIR-ADR_410,1,KVK,False
572931,190906210459,3,2099,2021-08-30,1,1505,1,3,0,2,...,,,,,-1,-1,,1,KVK,False
572932,190906210460,3,2099,2021-08-31,2,1245,0,0,0,1,...,199764,,Opačnýkesměruúseku,Pomalý,560286,553654,GN_V0.1UIR-ADR_410,1,KVK,False


After that we pull all the data into contingency table...

In [5]:
ct = pd.crosstab(
    index=df2['p36'],
    columns=df2['fatal']
)
ct

fatal,False,True
p36,Unnamed: 1_level_1,Unnamed: 2_level_1
1,78618,911
3,73352,448


... and make a chi-squared test.

In [6]:
result = stats.chi2_contingency(ct)
result

(125.72070150000258,
 3.5395243450138555e-29,
 1,
 array([[78824.11109444,   704.88890556],
        [73145.88890556,   654.11109444]]))

Last thing to do is to compare expected results with real world data.

In [7]:
ct - result[3]


fatal,False,True
p36,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-206.111094,206.111094
3,206.111094,-206.111094


### Conclusion
As we can see from the results **p-value** of the test is $3.56 \cdot 10^{-29}$ which is less than $0.05$. Meaning we consider null hypothesis to be false and we declare that there exists a correlation between the probability of fatal accidents on main roads and third level roads.

When comparing with expected results we can clearly see that main roads caused more fatal injuries overall.

## Second hypothesis
Hypothesis: Skoda cars are less damaged in the accident than Audi cars.

First of all we filter rows with selected cars.

In [8]:
df2 = df.copy(deep=True)
df2 = df2.loc[(df2['p45a'] == 39) | (df2['p45a'] == 2)]
df2['p45a'] = df2['p45a'].replace({2: 'Audi', 39: 'Skoda'})
df2

Unnamed: 0,p1,p36,p37,p2a,weekday(p2a),p2b,p6,p7,p8,p9,...,l,n,o,p,q,r,s,t,p5a,region
0,002100160001,4,-1,2016-01-01,5,55,1,1,0,2,...,,711403,,Souhlasnýsesměremúseku,Pomalý,554782,451622,GN_V0.1UIR-ADR_410,1,PHA
6,002100160007,6,-1,2016-01-01,5,230,3,0,9,2,...,,2777321,,Souhlasnýsesměremúseku,Pomalý,554782,469963,GN_V0.1UIR-ADR_410,1,PHA
9,002100160010,5,-1,2016-01-01,5,450,1,2,0,2,...,,,,,,-1,-1,,1,PHA
11,002100160012,5,-1,2016-01-01,5,640,4,0,0,1,...,,,,,,-1,-1,,1,PHA
16,002100160017,5,-1,2016-01-01,5,1245,3,0,4,2,...,,,,,,-1,-1,,1,PHA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572911,190906210439,6,-1,2021-08-21,6,2130,2,0,0,2,...,,186327,,Souhlasnýsesměremúseku,Pomalý,560383,550809,GN_V0.1UIR-ADR_410,1,KVK
572916,190906210444,6,-1,2021-08-23,1,1531,3,0,6,2,...,,1669799,,Opačnýkesměruúseku,Pomalý,554961,185761,GN_V0.1UIR-ADR_410,1,KVK
572921,190906210449,6,-1,2021-08-26,4,1610,3,0,9,2,...,,2611909,,,Pomalý,560545,-1,GN_V0.1UIR-ADR_410,2,KVK
572923,190906210451,6,-1,2021-08-20,5,1515,2,0,0,2,...,,184472,,Souhlasnýsesměremúseku,Pomalý,560383,550736,GN_V0.1UIR-ADR_410,1,KVK


Then we create contingency table...

In [9]:
ct = pd.crosstab(columns=df2['p45a'], index=df2['p53'])
ct

p45a,Audi,Skoda
p53,Unnamed: 1_level_1,Unnamed: 2_level_1
0,764,8203
1,12,240
2,1,74
3,1,13
4,0,7
...,...,...
9000,4,2
10000,10,8
12000,2,0
15000,3,1


... and calculate the T-test for two indipendent samples.

Our null hypothesis is that Audi cars involved in an accident have less vehicle damage than Skoda cars.

In [10]:
# The mean of the distribution underlying the first sample (Audi) 
# is less than the mean of distribution underlying seccond sample (Skoda) 
stats.ttest_ind(ct['Audi'], ct['Skoda'], equal_var=False,
                random_state=False, alternative='less')

Ttest_indResult(statistic=-3.417269886834147, pvalue=0.00039289714219923434)

## Conclusion
The final result shows that **p-value** of the test is $3.92 \cdot 10^{-4}$ which is less than $0.05$.
That means that our null hypothesis regarding Audi cars to received less vehicle damage is false and therefore
our original hypothesis than Skoda cars receive less vehicle damage is true.