In [None]:
# Print the versions of different libraries being used
import scipy as sc
import statsmodels as st
import mlxtend as ml
import numpy as np
import pandas as pd

# Report the installed version of each core library so runs are reproducible
print(f"Version of Pandas: {pd.__version__}")
print(f"Version of scipy: {sc.__version__}")
print(f"Version of statsmodels: {st.__version__}")
print(f"Version of mlxtend: {ml.__version__}")
print(f"Version of numpy: {np.__version__}")


from mlxtend.frequent_patterns import fpgrowth, association_rules
import scipy.stats as stats
from statsmodels.stats import multitest as mt
import gradio as gr


# List of Abbreviations

| Latin Abbreviation | Full Written Latin Species Name        |
|--------------------|---------------------------------------|
| DacGlom            | Dactylis glomerata                   |
| PlaLanc            | Plantago lanceolata                  |
| MedLupu            | Medicago lupulina                    |
| GalAlbu            | Galium album                         |
| TriPrat            | Trifolium pratense                   |
| PoaTriv            | Poa trivialis                        |
| TarRude            | Taraxacum sect. Ruderalia            |
| LeuIrcu            | Leucanthemum ircutianum              |
| RanBulb            | Ranunculus bulbosus                  |
| AntOdor            | Anthoxanthum odoratum                |
| FesRubr            | Festuca rubra                        |
| CerHolo            | Cerastium holosteoides               |
| CerGlom            | Cerastium glomeratum                 |
| ArrElat            | Arrhenatherum elatius                |
| KnaArve            | Knautia arvensis                     |
| LotCorn            | Lotus corniculatus                   |
| RosSpec            | Rosa species                         |
| SanMino            | Sanguisorba minor                    |
| TriDubi            | Trifolium dubium                     |
| PriVeri            | Primula veris                        |
| VicAngu            | Vicia angustifolia                   |
| VioHirt            | Viola hirta                          |
| DauCaro            | Daucus carota                        |

In [None]:
# Columns of the rule table handed back to the caller (also used for empty results)
_RULE_COLUMNS = [
    "antecedents", "consequents", "support", "confidence",
    "pval", "p_corr", "expected_species", "lift",
]


def _empty_rules():
    """Return an empty rule table that has the regular output columns."""
    return pd.DataFrame(columns=_RULE_COLUMNS)


def getrules(file, separator, correction_methods, field, minsupport, min_threshold, significant_pvalue, metric, n_spec, n_rules):
    """Mine association rules that suggest additional species for one field.

    The CSV is read in long format (one row per ``area``/``variable`` pair),
    turned into an area x species presence matrix, mined with FP-growth, and
    the resulting rules are tested with Fisher's exact test followed by a
    multiple-testing correction.

    Args:
        file: Path to the CSV file, or an object whose ``name`` attribute
            holds that path. The file must contain the columns ``area`` and
            ``variable``.
        separator: Column separator passed to ``pandas.read_csv``.
        correction_methods: Method name passed to
            ``statsmodels.stats.multitest.multipletests``; ``"fdr_bh"`` is
            used when the value is empty.
        field: Value of the ``area`` column identifying the field of interest.
        minsupport: Minimum relative frequency for species and itemsets.
        min_threshold: Minimum value of ``metric`` for a rule to be generated.
        significant_pvalue: Significance level; rules are kept when their
            corrected p-value is below it.
        metric: Rule metric name passed to ``association_rules``.
        n_spec: Rules are kept only if ``expected_species`` exceeds this value.
        n_rules: Maximum number of rules to return.

    Returns:
        A DataFrame with the columns ``antecedents``, ``consequents``,
        ``support``, ``confidence``, ``pval``, ``p_corr``,
        ``expected_species`` and ``lift``. It holds at most one rule per
        consequent set and is empty when no rule applies to ``field``.
    """
    # Accept both a plain path and an upload object that exposes the path as `.name`
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        path = file
    else:
        path = file.name
    df = pd.read_csv(path, sep=separator, encoding='utf-8')

    remove_species = ["FesPrat", "LolPere", "PhlPrat", "PoaTriv", "BroErec", "TriFlav", "PlaLanc"]
    all_areas = df.area.dropna().unique()
    n = df.area.nunique()
    cnt = df.variable.value_counts() / n
    df = df[~df.variable.isin(remove_species)]

    # Species recorded on the requested field (after dropping the excluded species)
    species_on_field = set(df.loc[df.area == field, "variable"])
    if not species_on_field:
        # Unknown field or nothing left on it: no antecedent can match
        return _empty_rules()

    # Remove species whose relative frequency over all areas is below minsupport
    rare_species = cnt.index[cnt < minsupport]
    df = df[~df.variable.isin(rare_species)]
    if df.empty:
        return _empty_rules()

    # Boolean presence matrix, one row per area. Areas that lost all their
    # species are re-added as empty rows so that the supports reported by
    # fpgrowth are relative to the same total `n` used for the counts below;
    # `> 0` collapses duplicate area/species records into a single presence.
    basket = pd.crosstab(df.area, df.variable).reindex(all_areas, fill_value=0) > 0

    # Apply frequent pattern growth algorithm to find frequent itemsets
    frequent_itemsets = fpgrowth(basket, min_support=minsupport, use_colnames=True, max_len=20)
    if frequent_itemsets.empty:
        return _empty_rules()

    # Apply association rule mining to generate association rules
    assoc = association_rules(frequent_itemsets, metric=metric, min_threshold=min_threshold, support_only=False)
    if assoc.empty:
        return _empty_rules()

    # Keep only rules whose antecedent species are all present on the field
    assoc["antecedent_in_field"] = assoc["antecedents"].map(species_on_field.issuperset).astype(bool)
    assoc = assoc[assoc["antecedent_in_field"]].copy()

    # Keep only rules whose consequents contain no species already on the field
    assoc["consequent_not_in_field"] = assoc["consequents"].map(species_on_field.isdisjoint).astype(bool)
    assoc = assoc[assoc["consequent_not_in_field"]].copy()
    if assoc.empty:
        return _empty_rules()

    # 2x2 contingency counts per rule. Supports are fractions of n, so the
    # products are rounded before the integer cast: plain truncation would
    # turn e.g. 0.9999999999999999 into 0.
    assoc["size_antecedents"] = assoc["antecedents"].map(len)
    assoc["size_consequents"] = assoc["consequents"].map(len)
    assoc['n1x'] = (assoc['antecedent support'] * n).round().astype(int)
    assoc['nx1'] = (assoc['consequent support'] * n).round().astype(int)
    assoc['n11'] = (assoc['support'] * n).round().astype(int)
    assoc['n0x'] = n - assoc['n1x']
    assoc['nx0'] = n - assoc['nx1']
    assoc['n10'] = assoc["n1x"] - assoc['n11']
    assoc['n01'] = assoc["nx1"] - assoc['n11']
    assoc['n00'] = assoc['n0x'] - assoc['n01']

    # Run Fisher's exact test once per distinct table and merge the result back
    table_cols = ["n11", "n10", "n01", "n00"]
    tables = assoc[table_cols].drop_duplicates().copy()
    tables["pval"] = [
        stats.fisher_exact([[n11, n10], [n01, n00]], alternative="two-sided")[1]
        for n11, n10, n01, n00 in tables.itertuples(index=False, name=None)
    ]
    assoc = assoc.merge(tables, on=table_cols)

    # Multiple-testing correction with the method chosen by the caller
    assoc = assoc.sort_values("pval")
    assoc["p_corr"] = mt.multipletests(
        assoc["pval"], alpha=significant_pvalue, method=correction_methods or "fdr_bh", is_sorted=True
    )[1]

    # Calculate the expected number of species in the consequents based on confidence
    assoc["expected_species"] = assoc["size_consequents"] * assoc["confidence"]

    # Print the final dimensions of the DataFrame after all calculations
    print(f"Datensatzdimension nach allen Berechnungen: {assoc.shape}")

    # Select the significant rules and keep the first rule per consequent set.
    # NOTE(review): p_corr is sorted descending here, so among rules with equal
    # expected_species the one with the larger corrected p-value wins - confirm
    # that this tie-break is intended.
    sort_keys = ["expected_species", "p_corr", "size_antecedents"]
    sort_order = [False, False, True]
    is_significant = (assoc.p_corr < significant_pvalue) & (assoc.expected_species > n_spec)
    significant = assoc[is_significant].sort_values(sort_keys, ascending=sort_order)
    significant = significant.groupby("consequents").first()
    significant = significant.sort_values(sort_keys, ascending=sort_order).reset_index()

    # Print the number of significant rules after correction
    print(f"Anzahl aller signifikanten p-Wert korrigierten Regeln: {significant.shape[0]}")

    rounded_cols = ["support", "confidence", "expected_species", "pval", "p_corr"]
    significant[rounded_cols] = significant[rounded_cols].round(4)

    # Return the first n_rules significant rules with the reporting columns only
    cp = significant[_RULE_COLUMNS][:int(n_rules)].copy()
    return cp


# Gradio front end for getrules.
# The widgets below are listed in the positional order of getrules' parameters.
correction_choices = [
    "bonferroni", "sidak", "holm", "holm-sidak", "simes-hochberg",
    "fdr_bh", "fdr_tsbh", "fdr_by", "fdr_tsbky", "hommel",
]
metric_choices = ["support", "lift", "confidence", "leverage", "conviction", "zhangs_metric"]

rule_inputs = [
    # CSV upload with the vegetation records
    gr.File(type="file", file_types=[".csv"], file_count="single"),
    gr.Text(label="Separator", value=" "),
    gr.Dropdown(choices=correction_choices, label="Choose a correction method", value="fdr_bh"),
    gr.Text(label="Select a field", value="A10_16"),
    gr.Number(label="Minimum Support", value=0.12, step=0.01),
    gr.Number(label="Minimum Threshold of Metric", value=0.20, step=0.01),
    gr.Number(label="Significant P-Value", value=0.05, step=0.01),
    gr.Dropdown(choices=metric_choices, label="Metric", value="confidence"),
    gr.Number(label="Least Expected Number of Species", value=2, step=1),
    gr.Number(label="Number of returned significant rules", value=10, step=1),
]

iface = gr.Interface(
    fn=getrules,
    inputs=rule_inputs,
    outputs="dataframe",
    live=False,
    title="Association Rule Mining of Grassland with p-value correction",
    description="Find association rules with the specified parameters.",
    allow_flagging='never',
)

# Start the web server for the interface on localhost:7862
iface.launch(debug=True, server_name="localhost", server_port=7862)

# This code sets up the Gradio interface around the getrules function defined above: the uploaded CSV file and the chosen parameters are passed to getrules, and the returned rules are displayed as a dataframe in the browser. The interface is served at localhost on port 7862.

