# Instructions

The assignment consists of three tasks:

- Run the T-test for the means of two independent samples underlying the statement "IRE binding activity was significantly reduced in failing hearts" (originally published by Haddad et al. in https://doi.org/10.1093/eurheartj/ehw333) using the following example data.

| non-failing heart (NF) | failing heart (F) |
| ---------------------- | ----------------- |
| 95 | 50 |
| 103 | 35 |
| 99 | 21 | 
| &nbsp; | 15 | 
| &nbsp; | 7 | 
| &nbsp; | 40 |

- Describe the statistical hypothesis test in machine readable form following the [statistical methods ontology concept for "two sample t-test with unequal variance"](https://www.ebi.ac.uk/ols/ontologies/stato/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FSTATO_0000304) using Semantic Web technologies, namely the Resource Description Framework (RDF)
- Process the resulting machine readable description using Semantic Web technologies, namely the SPARQL Protocol and RDF Query Language.

Please return the assignment with all outputs visible (i.e., do not clear the outputs).

Good luck!

In [132]:
!pip install rdflib pandas scipy numpy



In [133]:
# Import all required libraries (some are missing)
import numpy as np
from scipy import stats
from rdflib import Graph, URIRef
from rdflib.namespace import RDF
from rdflib import BNode, Literal
from rdflib.namespace import XSD

In [134]:
# Run the T-test for the means of two independent samples using the example data

# Independent T-test function
def independent_ttest(data1, data2, alpha):
	# calculate means
	s1mean, s2mean = np.mean(data1), np.mean(data2)
	# calculate standard errors
	Se1, Se2 = stats.sem(data1), stats.sem(data2)
	# standard error on the difference between the samples
# 	print(len(data1))
	sed = np.sqrt((Se1**2.0/len(data1)) + (Se2**2.0)/len(data2))
	#print(sed)
	# calculate the t statistic
	t_stat = (s1mean - s2mean) / sed
	# degrees of freedom
	df = len(data1) + len(data2) - 2
	# calculate the critical value
	cv = stats.t.ppf(1.0 - alpha, df)
	# calculate the p-value
	p = (1.0 - stats.t.cdf(abs(t_stat), df)) * 2.0
	# return everything
	return t_stat, df, cv, p


def Welch_t_test(x,y):
	dof = (np.var(x)/len(x) + np.var(y)/len(y))**2 / ((np.var(x)/len(x))**2 / (len(x)-1) + (np.var(y)/len(y))**2 / (len(y)-1))
# 	print(f"Welch-Satterthwaite Degrees of Freedom= {dof:.4f}")
	t, p = stats.ttest_ind(x, y, equal_var = False)
	print("\n",f"Welch's t-test= {t:.4f}", "\n",f"p-value = {p:.4f}", "\n",f"Welch-Satterthwaite Degrees of Freedom= {dof:.4f}")
	return p;
        



#Sample Size
sample1=[95,103,99]
sample2=[50,35,21,15,7,40]
alpha=0.05
print("-------")
p=Welch_t_test(sample1, sample2)
print("-------")




#calculate Student T-test
# t_stat, df, cv, p =independent_ttest(sample1,sample2,alpha)
# print(t_stat)
#print('T_stat=%.3f, degree of freedom=%d, citical value=%.3f, p_value=%.3f' % (t_stat, df, cv, p))
# print("T_stat={} , degree of freedom={} , citical value= {}, p_value= {}".format(t_stat, df, cv, p ))

if p > alpha:
	print('Accept null hypothesis that the means are equal...')
else:
	print('Reject the null hypothesis that the means are equal...')

# print("IRE binding activity was significantly reduced in failing hearts-----Means are not equal for two independent sample Sample")





-------

 Welch's t-test= 10.0543 
 p-value = 0.0001 
 Welch-Satterthwaite Degrees of Freedom= 5.8693
-------
Reject the null hypothesis that the means are equal...


In [136]:
# Describe the statistical hypothesis test in machine readable form

# First, we initialize an RDF Graph and bind some prefixes
# n=namespace("http://www.w3.org/2001/XMLSchema#")
g = Graph()
g.bind('obo', 'http://purl.obolibrary.org/obo/')
g.bind('ex', 'http://example.org/')

# Next, we define some needed vocabulary
obo = dict()
obo['two sample t-test with unequal variance'] = URIRef('http://purl.obolibrary.org/obo/STATO_0000304')
obo['has specified input'] = URIRef('http://purl.obolibrary.org/obo/OBI_0000293')
obo['has specified output'] = URIRef('http://purl.obolibrary.org/obo/OBI_0000299')
obo['is_specified_output_of']=URIRef('http://purl.obolibrary.org/obo/OBI_0000312')
obo['inferential statistical data analysis']=URIRef('http://purl.obolibrary.org/obo/OBCS_0000121')
obo['p-value'] = URIRef('http://purl.obolibrary.org/obo/OBI_0000175')
obo['has value specification'] = URIRef('http://purl.obolibrary.org/obo/OBI_0001938')
obo['scalar value specification'] = URIRef('http://purl.obolibrary.org/obo/OBI_0001931')
obo['has specified numeric value'] = URIRef('http://purl.obolibrary.org/obo/OBI_0001937')
obo['iron-responsive element binding'] = URIRef('http://purl.obolibrary.org/obo/GO_0030350')
# print('Data=',obo)




# Now, populate the graph with statements
# As an example, the following statement types the resource (blank node) 'n1' as 'two sample t-test with unequal variance'
n1 = BNode()
g.add((n1, RDF.type, obo['two sample t-test with unequal variance']))

# Now, state that the t-test has iron-responsive element binding as a specified input (one statement to complete)
n2 = BNode()

g.add((n1, obo['has specified input'], n2))
g.add((n2,RDF.type,obo['iron-responsive element binding']))

n3 = BNode()
# Next, state that the t-test has the p-value computed above as a specified output
g.add((n1,obo['has specified output'],n3))


g.add((n3,RDF.type,obo['p-value']))
n4 = BNode()
g.add((n3,obo['has value specification'],n4))
# The literal value of the p-value is represented as a scalar value specification with a specified numeric value 

g.add((n4,RDF.type,obo['scalar value specification']))

# g.add(...)
# Type the literal as double
g.add((n4,obo['has specified numeric value'],Literal(p,datatype=XSD.double)))
# Finally, serialize and print the graph in RDF/XML format
s=g.serialize(format='turtle').decode('utf-8')
print(s)

@prefix ex: <http://example.org/> .
@prefix obo: <http://purl.obolibrary.org/obo/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

[] a obo:STATO_0000304 ;
    obo:OBI_0000293 [ a obo:GO_0030350 ] ;
    obo:OBI_0000299 [ a obo:OBI_0000175 ;
            obo:OBI_0001938 [ a obo:OBI_0001931 ;
                    obo:OBI_0001937 5.318725e-05 ] ] .




In [139]:
# Process the machine readable statistical hypothesis test by completing the following SPARQL query that returns the p-value

q = """
PREFIX obo: <http://purl.obolibrary.org/obo/>

SELECT ?pvalue WHERE {
  ?r a obo:STATO_0000304 .
  ?r obo:OBI_0000293 [ a obo:GO_0030350 ] .
  ?r obo:OBI_0000299 [ a obo:OBI_0000175 ;
        obo:OBI_0001938 [ a obo:OBI_0001931 ;
                    obo:OBI_0001937 ?pvalue
  ]].
}
"""
for qs in g.query(q):
    print('p-value=',qs[0])
    
    
    
    

p-value= 5.318725263490542e-05
