# Statistical tests

This notebook contains the code needed to reproduce the statistical tests used to look for relationships between endolichenic fungal isolates and their host lichens.

In [80]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

## Pearson's Chi-Squared tests 

A Pearson's Chi-Squared test of independence is commonly used to assess whether two categorical variables in a large dataset are associated with one another.

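The following code tests the null hypothesis that each taxonomic level of the endolichenic fungal isolates (genus, family, order, class) is independent of each host variable (host taxonomy, location, photobiont, growth form and macroclimate).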

In [81]:
# Load the master results table. Missing values are replaced with the label
# "unidentified" so they are encoded as an ordinary category instead of the
# -1 sentinel that `.cat.codes` assigns to NaN.
data = pd.read_csv("../ELF_master_results.csv").fillna("unidentified")

# Columns to integer-encode, listed in the order they appear in `data_sub`.
encoded_columns = [
    "isolate_class",
    "isolate_order",
    "isolate_family",
    "isolate_genus",
    "Host_ID",
    "Host_location",
    "Host_Order",
    "Host_Family",
    "Host_Genus",
    "Host_Species",
    "Photobiont",
    "Growth_form",
    "Macroclimate",
]

# Replace each label with its integer category code (one code per distinct
# label within that column).
data_sub = pd.DataFrame(
    {name: data[name].astype("category").cat.codes for name in encoded_columns}
)

data_sub.head()

Unnamed: 0,isolate_class,isolate_order,isolate_family,isolate_genus,Host_ID,Host_location,Host_Order,Host_Family,Host_Genus,Host_Species,Photobiont,Growth_form,Macroclimate
0,7,6,14,62,9,2,2,5,14,5,0,1,6
1,4,8,17,31,9,2,2,5,14,5,0,1,6
2,4,8,17,31,9,2,2,5,14,5,0,1,6
3,10,21,45,67,9,2,2,5,14,5,0,1,6
4,4,8,17,31,9,2,2,5,14,5,0,1,6


In [82]:
def pearsons_chi2(df, var1, var2, verbose=True):
    """Run a Pearson's Chi-Squared test of independence between two columns.

    A contingency table of ``df[var1]`` against ``df[var2]`` is built with
    ``pd.crosstab`` and passed to ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both categorical variables.
    var1, var2 : str
        Names of the two columns to cross-tabulate.
    verbose : bool, default True
        When True, print the statistic, p-value, degrees of freedom and the
        full expected-frequency array. Pass False to keep the cell output
        short when many tests are run in a loop.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns "Variables"
        (``"<var1>_vs_<var2>"``), "Chi-Squared Statistic", "p-value" and
        "Degrees of Freedom".

    Notes
    -----
    ``chi2_contingency`` applies Yates' continuity correction by default when
    the table has one degree of freedom. The chi-squared approximation is
    generally considered reliable only when expected frequencies are not too
    small (a commonly quoted guideline is at least 5 per cell); the expected
    array printed in verbose mode can be used to check this.
    """
    crosstab = pd.crosstab(df[var1], df[var2])
    stat, p, dof, expected = chi2_contingency(crosstab)

    if verbose:
        print("Chi-Squared Statistic: ", stat)
        print("p-value: ", p)
        print("Degrees of Freedom: ", dof)
        print("Expected Values: ", expected)

    summary = {
        "Variables": f"{var1}_vs_{var2}",
        "Chi-Squared Statistic": stat,
        "p-value": p,
        "Degrees of Freedom": dof,
    }
    # index=[0] is required because every value in `summary` is a scalar.
    return pd.DataFrame(data=summary, index=[0])

The cell below builds a list of tuples pairing every isolate variable with every host variable, then passes each pair to the `pearsons_chi2` function.

In [83]:
isolate_vars = ["isolate_genus", "isolate_family", "isolate_order", "isolate_class"]
host_vars = ["Host_Species", "Host_Genus", "Host_Family", "Host_Order",
    "Host_location", "Photobiont", "Growth_form", "Macroclimate",]
# Every (isolate variable, host variable) pairing to be tested.
result = [(x, y) for x in isolate_vars for y in host_vars]

# Collect the one-row result frames and concatenate them once at the end,
# rather than re-copying a growing DataFrame on every iteration.
per_test_frames = [
    pearsons_chi2(data_sub, isolate_var, host_var)
    for isolate_var, host_var in result
]
final_dataframe = pd.concat(per_test_frames, ignore_index=True)

Chi-Squared Statistic:  2669.7408788418043
p-value:  1.6103986335199512e-21
Degrees of Freedom:  2010
Expected Values:  [[0.02894737 0.00789474 0.04473684 ... 0.01052632 0.01578947 0.01578947]
 [0.28947368 0.07894737 0.44736842 ... 0.10526316 0.15789474 0.15789474]
 [0.43421053 0.11842105 0.67105263 ... 0.15789474 0.23684211 0.23684211]
 ...
 [0.26052632 0.07105263 0.40263158 ... 0.09473684 0.14210526 0.14210526]
 [0.05789474 0.01578947 0.08947368 ... 0.02105263 0.03157895 0.03157895]
 [2.51842105 0.68684211 3.89210526 ... 0.91578947 1.37368421 1.37368421]]
Chi-Squared Statistic:  2436.705483184971
p-value:  4.14612591917959e-37
Degrees of Freedom:  1608
Expected Values:  [[1.84210526e-02 3.68421053e-02 5.26315789e-03 ... 1.57894737e-02
  7.36842105e-02 1.57894737e-02]
 [1.84210526e-01 3.68421053e-01 5.26315789e-02 ... 1.57894737e-01
  7.36842105e-01 1.57894737e-01]
 [2.76315789e-01 5.52631579e-01 7.89473684e-02 ... 2.36842105e-01
  1.10526316e+00 2.36842105e-01]
 ...
 [1.65789474e-01 

In [84]:
# Save the combined test results to disk, then show them in the notebook.
# The results frame is named `final_dataframe`; `final_df` is never defined.
final_dataframe.to_csv("pearsons_chi_squared_tests.csv")
display(final_dataframe)

Unnamed: 0,Variables,Chi-Squared Statistic,p-value,Degrees of Freedom
0,isolate_genus_vs_Host_Species,2669.740879,1.610399e-21,2010
1,isolate_genus_vs_Host_Genus,2436.705483,4.146126e-37,1608
2,isolate_genus_vs_Host_Family,1345.802843,2.695727e-12,1005
3,isolate_genus_vs_Host_Order,468.031615,0.01269443,402
4,isolate_genus_vs_Host_location,2051.39546,2.822886e-13,1608
5,isolate_genus_vs_Photobiont,344.841687,0.001054765,268
6,isolate_genus_vs_Growth_form,140.46216,0.3339095,134
7,isolate_genus_vs_Macroclimate,692.800731,7.253408e-11,469
8,isolate_family_vs_Host_Species,1878.458525,4.735137e-20,1350
9,isolate_family_vs_Host_Genus,1694.618736,4.601834e-30,1080


## Prepare data for rarefaction curves on iNEXT

The following code prepares abundance counts that can be uploaded to the [iNEXT](https://chao.shinyapps.io/iNEXTOnline/) online viewer to generate rarefaction curves of endolichenic fungal genus diversity for each lichen host order.

http://chao.stat.nthu.edu.tw/wordpress/wp-content/uploads/software/iNEXTOnline_UserGuide.pdf

In [85]:
# Keep only the two label columns needed for the abundance table.
data_sub2 = data[["isolate_genus", "Host_Order"]].copy()

# Rows are isolate genera, columns are host orders, and each cell counts how
# many records share that genus/order combination.
results = pd.crosstab(
    index=data_sub2["isolate_genus"],
    columns=data_sub2["Host_Order"],
)

In [87]:
# Write the genus-by-host-order abundance table for upload to iNEXT, then
# show it in the notebook. The DataFrame method is `to_csv`, not `to_cvs`.
results.to_csv("iNEXT_input.csv")
display(results)

Host_Order,Gyalectales,Lecanorales,Peltigerales,Pertusariales,Teloschistales,Trichotheliales,unidentified
isolate_genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Absidia,0,0,0,0,0,1,0
Amphirosellinia,0,6,4,0,0,0,0
Anthostomelloides,2,6,5,2,0,0,0
Antrelloides,0,1,0,0,0,0,0
Ascochyta,0,2,0,0,0,0,0
...,...,...,...,...,...,...,...
Trichonectria,0,1,0,0,0,0,0
Umbelopsis,0,3,1,0,0,0,0
Xylaria,0,5,3,1,0,0,0
Xylotumulus,0,2,0,0,0,0,0
