In [3]:
import pandas as pd
import numpy as np

# Analysis of significant terms

In [4]:
path = 'C:\\Users\\jenny\\Documents\\NTNU\\Master\\my_supplementary_data\\analysis\\'
results_COG = pd.read_csv(path + 'fisher\\fisher_COG.csv', sep = ';', index_col = 0)
results_GO = pd.read_csv(path + 'fisher\\fisher_GO.csv', sep = ';', index_col = 0)
results_KO = pd.read_csv(path + 'fisher\\fisher_KO.csv', sep = ';', index_col = 0)

In [5]:
# creating result column which says "negative" or "positive" if meeting conditions of pVal < 0.01, ||odds ratio|| > 1
def outcome(odds, pVal):
    if pVal > 0.01: return np.nan
    elif odds < -1: return 'negative'
    elif odds > 1: return 'positive'
    else: return np.nan

results_COG['res'] = results_COG.apply(lambda x: outcome(x['odds_ratio_log10_corr'], x['fdr_p-value']), axis=1)
results_GO['res'] = results_GO.apply(lambda x: outcome(x['odds_ratio_log10_corr'], x['fdr_p-value']), axis=1)
results_KO['res'] = results_KO.apply(lambda x: outcome(x['odds_ratio_log10_corr'], x['fdr_p-value']), axis=1)

## Creating four categories of terms
- Singificantly/exclusively associated with gram negative/positive
 - Significant: fold change ||fc||>1
 - Exclusive: fold change ||fc||=3
 - Positive: fold change fc > 0
 - Negative: fold change fc < 0

In [6]:
# any significant hits, meaning "positive" or "negative" in created "res" (result) column
all_neg_COG = results_COG.loc[results_COG['res'] == 'negative']
all_pos_COG = results_COG.loc[results_COG['res'] == 'positive']
all_neg_GO = results_GO.loc[results_GO['res'] == 'negative']
all_pos_GO = results_GO.loc[results_GO['res'] == 'positive']
all_neg_KO = results_KO.loc[results_KO['res'] == 'negative']
all_pos_KO = results_KO.loc[results_KO['res'] == 'positive']

# exlusive, meaning odds ratio (i.e. fold change) at the set threshold of -3 (gram negative) or 3 (gram positive)
only_pos_COG = all_pos_COG.loc[all_pos_COG['odds_ratio_log10_corr'] == 3]
only_neg_COG = all_neg_COG.loc[all_neg_COG['odds_ratio_log10_corr'] == -3]
only_pos_GO = all_pos_GO.loc[all_pos_GO['odds_ratio_log10_corr'] == 3]
only_neg_GO = all_neg_GO.loc[all_neg_GO['odds_ratio_log10_corr'] == -3]
only_pos_KO = all_pos_KO.loc[all_pos_KO['odds_ratio_log10_corr'] == 3]
only_neg_KO = all_neg_KO.loc[all_neg_KO['odds_ratio_log10_corr'] == -3]

print('in (5%,95%):\tCOG:',str(len(results_COG))+'\t\tKO:',str(len(results_KO))+'\t\tGO:',str(len(results_GO)))
print('#-------------------------------------------------------------------------\nALL SIGNIFICANT TERMS')
print('total:\t\tCOG:',str(len(all_pos_COG)+len(all_neg_COG))+'\t\tKO:',str(len(all_pos_KO)+len(all_neg_KO))+'\t\tGO:',str(len(all_pos_GO)+len(all_neg_GO)))
print('positive:\tCOG:',str(len(all_pos_COG))+'\t\tKO:',str(len(all_pos_KO))+'\t\t\tGO:',str(len(all_pos_GO)))
print('negative:\tCOG:',str(len(all_neg_COG))+'\t\tKO:',str(len(all_neg_KO))+'\t\tGO:',str(len(all_neg_GO)))
print('#-------------------------------------------------------------------------\nEXCLUSIVE TO ONLY ONE GRAM ATTRIBUTE')
print('total:\t\tCOG:',str(len(only_pos_COG)+len(only_neg_COG))+'\t\tKO:',str(len(all_pos_KO)+len(only_neg_KO))+'\t\t\tGO:',str(len(only_pos_GO)+len(all_neg_GO)))
print('positive:\tCOG:',str(len(only_pos_COG))+'\t\t\tKO:',str(len(only_pos_KO))+'\t\t\tGO:',str(len(only_pos_GO)))
print('negative:\tCOG:',str(len(only_neg_COG))+'\t\tKO:',str(len(only_neg_KO))+'\t\t\tGO:',str(len(only_neg_GO)))

in (5%,95%):	COG: 3653		KO: 4718		GO: 5100
#-------------------------------------------------------------------------
ALL SIGNIFICANT TERMS
total:		COG: 884		KO: 1514		GO: 2046
positive:	COG: 232		KO: 480			GO: 758
negative:	COG: 652		KO: 1034		GO: 1288
#-------------------------------------------------------------------------
EXCLUSIVE TO ONLY ONE GRAM ATTRIBUTE
total:		COG: 273		KO: 920			GO: 1382
positive:	COG: 9			KO: 56			GO: 94
negative:	COG: 264		KO: 440			GO: 858


## GO, exclusives

In [5]:
# GO terms, only associated with positive gram stains (i.e. odds ratio 3)
# prints out list which was submitted to ReViGO
for i in only_pos_GO['GO']:
    print(i)

GO:0000935
GO:0003881
GO:0004512
GO:0004739
GO:0006204
GO:0006360
GO:0006363
GO:0009398
GO:0010127
GO:0010499
GO:0015970
GO:0016161
GO:0016995
GO:0018272
GO:0018352
GO:0018454
GO:0019279
GO:0019343
GO:0019367
GO:0022623
GO:0022624
GO:0030112
GO:0030350
GO:0030638
GO:0030639
GO:0030812
GO:0031073
GO:0031515
GO:0032155
GO:0033847
GO:0033942
GO:0033973
GO:0034036
GO:0034039
GO:0035336
GO:0035496
GO:0035551
GO:0040010
GO:0042025
GO:0042603
GO:0042783
GO:0043457
GO:0043743
GO:0043866
GO:0044044
GO:0044046
GO:0044315
GO:0044457
GO:0045820
GO:0046109
GO:0046444
GO:0047260
GO:0051063
GO:0051198
GO:0051810
GO:0052046
GO:0052048
GO:0052059
GO:0052083
GO:0052155
GO:0052278
GO:0052294
GO:0052385
GO:0052550
GO:0052567
GO:0052573
GO:0052657
GO:0052698
GO:0052699
GO:0052701
GO:0052703
GO:0052704
GO:0052708
GO:0060542
GO:0060543
GO:0061672
GO:0070490
GO:0070628
GO:0070967
GO:0071578
GO:0071769
GO:0075141
GO:0097100
GO:0097691
GO:0140035
GO:1901110
GO:1901112
GO:1902074
GO:1902075
GO:1903561
GO:1903579

In [6]:
# GO terms, only associated with negative gram stains (i.e. odds ratio -3)
# prints out list which was submitted to ReViGO
for i in only_neg_GO['GO']:
    print(i)

GO:0000006
GO:0000099
GO:0000229
GO:0000310
GO:0000715
GO:0000716
GO:0000719
GO:0000738
GO:0000786
GO:0000787
GO:0000789
GO:0000819
GO:0001073
GO:0001123
GO:0001140
GO:0001530
GO:0001680
GO:0001716
GO:0001887
GO:0002048
GO:0002049
GO:0002101
GO:0002130
GO:0002131
GO:0002132
GO:0002134
GO:0002136
GO:0002188
GO:0002190
GO:0002192
GO:0002196
GO:0002935
GO:0002939
GO:0002953
GO:0003681
GO:0003727
GO:0003756
GO:0003841
GO:0003842
GO:0003848
GO:0003857
GO:0003904
GO:0003919
GO:0003957
GO:0003960
GO:0003961
GO:0003985
GO:0003987
GO:0004021
GO:0004044
GO:0004057
GO:0004069
GO:0004071
GO:0004076
GO:0004108
GO:0004109
GO:0004121
GO:0004125
GO:0004149
GO:0004150
GO:0004158
GO:0004165
GO:0004309
GO:0004345
GO:0004355
GO:0004366
GO:0004368
GO:0004397
GO:0004400
GO:0004412
GO:0004455
GO:0004456
GO:0004458
GO:0004471
GO:0004473
GO:0004486
GO:0004505
GO:0004588
GO:0004609
GO:0004632
GO:0004635
GO:0004636
GO:0004637
GO:0004640
GO:0004643
GO:0004644
GO:0004733
GO:0004756
GO:0004779
GO:0004781
GO:0004783

## GO, all significant

In [7]:
# GO terms, all associated with positive gram stains (i.e. odds ratio over 1)
# prints out list which was submitted to ReViGO, set to "higher value better"
# for r in all_pos_GO.index:
#     print(all_pos_GO.at[r,'GO'], all_pos_GO.at[r,'odds_ratio_log10_corr'])

for r in all_pos_GO.index:
    print(all_pos_GO.at[r,'GO'])

GO:0000009
GO:0000018
GO:0000023
GO:0000026
GO:0000030
GO:0000034
GO:0000062
GO:0000121
GO:0000150
GO:0000286
GO:0000309
GO:0000413
GO:0000451
GO:0000453
GO:0000726
GO:0000910
GO:0000921
GO:0000935
GO:0001666
GO:0001678
GO:0001933
GO:0001968
GO:0002135
GO:0002237
GO:0002682
GO:0002683
GO:0002684
GO:0002791
GO:0003678
GO:0003755
GO:0003825
GO:0003844
GO:0003855
GO:0003864
GO:0003877
GO:0003881
GO:0003882
GO:0003905
GO:0003906
GO:0003910
GO:0003938
GO:0003951
GO:0003952
GO:0003955
GO:0003977
GO:0003984
GO:0003993
GO:0003996
GO:0003999
GO:0004001
GO:0004038
GO:0004085
GO:0004106
GO:0004122
GO:0004143
GO:0004190
GO:0004301
GO:0004321
GO:0004326
GO:0004340
GO:0004356
GO:0004376
GO:0004377
GO:0004396
GO:0004413
GO:0004418
GO:0004427
GO:0004512
GO:0004515
GO:0004517
GO:0004582
GO:0004604
GO:0004619
GO:0004633
GO:0004645
GO:0004658
GO:0004672
GO:0004674
GO:0004739
GO:0004743
GO:0004749
GO:0004760
GO:0004764
GO:0004799
GO:0004820
GO:0004834
GO:0004844
GO:0005355
GO:0005388
GO:0005518
GO:0005618

In [8]:
# GO terms, all associated with negtive gram stains (i.e. odds ratio below -1)
# prints out list which was submitted to ReViGO, set to "lower value better"
# for r in all_neg_GO.index:
#     print(all_neg_GO.at[r,'GO'], all_neg_GO.at[r,'odds_ratio_log10_corr'])

for r in all_neg_GO.index:
    print(all_neg_GO.at[r,'GO'])

GO:0000006
GO:0000014
GO:0000015
GO:0000099
GO:0000217
GO:0000229
GO:0000310
GO:0000372
GO:0000375
GO:0000376
GO:0000400
GO:0000469
GO:0000478
GO:0000715
GO:0000716
GO:0000719
GO:0000738
GO:0000786
GO:0000787
GO:0000789
GO:0000819
GO:0000900
GO:0000985
GO:0001046
GO:0001073
GO:0001121
GO:0001123
GO:0001125
GO:0001131
GO:0001140
GO:0001141
GO:0001530
GO:0001680
GO:0001716
GO:0001731
GO:0001822
GO:0001887
GO:0001889
GO:0002048
GO:0002049
GO:0002101
GO:0002130
GO:0002131
GO:0002132
GO:0002134
GO:0002136
GO:0002183
GO:0002188
GO:0002190
GO:0002192
GO:0002196
GO:0002437
GO:0002439
GO:0002544
GO:0002935
GO:0002937
GO:0002939
GO:0002953
GO:0003006
GO:0003681
GO:0003727
GO:0003756
GO:0003841
GO:0003842
GO:0003848
GO:0003857
GO:0003883
GO:0003904
GO:0003919
GO:0003957
GO:0003960
GO:0003961
GO:0003985
GO:0003987
GO:0004021
GO:0004033
GO:0004044
GO:0004047
GO:0004057
GO:0004069
GO:0004071
GO:0004076
GO:0004077
GO:0004108
GO:0004109
GO:0004121
GO:0004125
GO:0004149
GO:0004150
GO:0004158
GO:0004165

## COG

In [11]:
# merging the COG dfs with the overviews of COGs from NCBI
#'cog-20.def' with COG term definitions, category abbreviations, etc.
#'fun-20' with functional category names

COG_overview = pd.read_csv(path + 'cog_term\\cog-20.def.tab', sep = '\t', names = ['COG','category','name','gene','pathway','pubmed','prot db id'])
COG_category = pd.read_csv(path + 'cog_term\\fun-20.tab', sep = '\t', names = ['category','rgb','description'])
COG_overview = pd.merge(COG_overview, COG_category, on = 'category', how = 'outer')

only_neg_COG = pd.merge(only_neg_COG, COG_overview, on = 'COG', how = 'inner')
only_pos_COG = pd.merge(only_pos_COG, COG_overview, on = 'COG', how = 'inner')
all_neg_COG = pd.merge(all_neg_COG, COG_overview, on = 'COG', how = 'inner')
all_pos_COG = pd.merge(all_pos_COG, COG_overview, on = 'COG', how = 'inner')

In [13]:
# saving for visualisation in Tableau
all_neg_COG[['category','description','COG','name','odds_ratio_log10_corr']].value_counts().to_csv(path + 'cog_term\\all_neg_COG.csv', sep = ';')
all_pos_COG[['category','description','COG','name','odds_ratio_log10_corr']].value_counts().to_csv(path + 'cog_term\\all_pos_COG.csv', sep = ';')
only_neg_COG[['category','description','COG','name','odds_ratio_log10_corr']].value_counts().to_csv(path + 'cog_term\\only_neg_COG.csv', sep = ';')
only_pos_COG[['category','description','COG','name','odds_ratio_log10_corr']].value_counts().to_csv(path + 'cog_term\\only_pos_COG.csv', sep = ';')

## KO, all significant

In [14]:
# KO terms, all associated with negtive gram stains (i.e. odds ratio below -1)
# prints out list submitted to KEGG Mapper
for r in all_neg_KO.index:
    print(all_neg_KO.at[r,'KO'])

K00029
K00032
K00035
K00043
K00050
K00088
K00097
K00114
K00117
K00154
K00184
K00185
K00220
K00228
K00253
K00293
K00311
K00329
K00346
K00347
K00348
K00349
K00350
K00351
K00356
K00376
K00404
K00405
K00406
K00407
K00410
K00411
K00412
K00413
K00424
K00428
K00569
K00570
K00643
K00647
K00673
K00677
K00684
K00748
K00769
K00795
K00808
K00809
K00813
K00822
K00840
K00883
K00892
K00906
K00912
K00979
K00992
K00998
K01004
K01058
K01067
K01070
K01120
K01141
K01146
K01147
K01150
K01166
K01169
K01175
K01241
K01283
K01358
K01414
K01444
K01450
K01452
K01458
K01459
K01483
K01484
K01498
K01573
K01578
K01627
K01663
K01682
K01716
K01766
K01800
K01801
K01825
K01851
K01887
K01894
K01920
K01960
K01974
K01991
K01993
K02014
K02024
K02066
K02067
K02164
K02167
K02193
K02194
K02195
K02196
K02197
K02198
K02199
K02225
K02258
K02280
K02281
K02297
K02298
K02300
K02305
K02336
K02339
K02344
K02386
K02391
K02392
K02393
K02394
K02399
K02402
K02403
K02427
K02439
K02441
K02442
K02448
K02452
K02454
K02455
K02457
K02458
K02460

In [15]:
# KO terms, all associated with positive gram stains (i.e. odds ratio above 1)
# prints out list submitted to KEGG Mapper
for r in all_pos_KO.index:
    print(all_pos_KO.at[r,'KO'])

K00103
K00153
K00226
K00231
K00232
K00271
K00301
K00360
K00435
K00491
K00561
K00687
K00728
K00756
K00761
K00791
K00805
K00841
K00872
K00886
K00897
K00929
K00936
K00938
K00999
K01227
K01261
K01269
K01274
K01308
K01389
K01421
K01567
K01598
K01616
K01634
K01844
K01876
K01906
K01926
K01994
K02086
K02103
K02109
K02171
K02236
K02237
K02239
K02240
K02242
K02243
K02244
K02245
K02248
K02356
K02434
K02490
K02491
K02497
K02499
K02530
K02798
K02799
K02802
K02808
K02825
K02827
K02828
K02829
K02859
K02862
K03048
K03090
K03091
K03095
K03290
K03339
K03346
K03367
K03402
K03431
K03432
K03433
K03436
K03480
K03483
K03488
K03489
K03491
K03500
K03565
K03629
K03693
K03696
K03697
K03700
K03705
K03706
K03708
K03713
K03727
K03739
K03740
K03763
K03887
K03888
K03889
K03890
K03891
K03930
K04086
K04488
K04766
K04769
K04780
K05311
K05342
K05346
K05362
K05363
K05364
K05518
K05522
K05576
K05578
K05602
K05808
K05822
K05823
K05896
K05937
K06012
K06024
K06198
K06215
K06283
K06285
K06286
K06287
K06294
K06295
K06297
K06298

## KO, exclusives

In [16]:
# KO terms, all exclusively associated with negtive gram stains (i.e. odds ratio of -3)
# prints out list submitted to KEGG Mapper
for r in only_neg_KO.index:
    print(only_neg_KO.at[r,'KO'])

K00035
K00043
K00050
K00220
K00293
K00346
K00407
K00410
K00570
K00673
K00840
K00906
K00998
K01058
K01067
K01120
K01141
K01146
K01150
K01169
K01283
K01458
K01459
K01484
K01498
K01663
K01716
K01766
K01825
K01851
K01991
K02024
K02196
K02258
K02305
K02344
K02439
K02442
K02452
K02460
K02462
K02463
K02464
K02495
K02504
K02505
K02560
K02571
K02623
K02625
K02657
K02670
K02676
K02679
K02680
K02742
K02856
K03113
K03184
K03192
K03195
K03196
K03197
K03198
K03199
K03200
K03203
K03204
K03214
K03219
K03222
K03226
K03227
K03228
K03229
K03230
K03472
K03473
K03477
K03548
K03573
K03576
K03607
K03632
K03633
K03645
K03670
K03674
K03683
K03757
K03759
K03760
K03764
K03773
K03796
K03804
K03807
K03812
K03815
K03835
K03840
K03863
K03941
K03943
K03974
K04067
K04080
K04338
K04765
K04770
K04775
K05351
K05368
K05526
K05590
K05597
K05603
K05778
K05779
K05785
K05803
K05805
K05809
K05851
K05886
K05952
K05997
K06006
K06073
K06076
K06078
K06125
K06143
K06159
K06169
K06175
K06181
K06214
K06447
K06598
K06601
K06602
K06866

In [17]:
# KO terms, all exclusively associated with positive gram stains (i.e. odds ratio of 3)
# prints out list submitted to KEGG Mapper
for r in only_pos_KO.index:
    print(only_pos_KO.at[r,'KO'])

K00271
K00360
K00687
K00999
K01389
K02829
K03740
K05362
K05518
K05937
K06607
K07008
K08168
K10005
K10006
K10007
K10008
K10240
K11050
K11533
K11608
K11610
K11611
K11622
K11705
K12429
K12553
K13570
K13571
K13678
K15733
K16019
K16237
K16323
K16645
K17329
K17331
K17829
K18230
K18232
K18370
K18568
K18572
K18662
K18958
K19971
K19975
K19976
K20469
K20814
K21148
K21169
K21466
K21473
K21962
K22476


In [11]:
results_KO[results_KO['KO']=='K00647']

Unnamed: 0,KO,keys,"contingency_ph,nh,pl,nl",odds_ratio,odds_ratio_log10,odds_ratio_log10_corr,p-value,reject_null,fdr_p-value,outcome,res
333,K00647,"{2058, 2581, 2080, 3106, 1571, 1572, 3113, 311...","[[1, 132], [221, 221]]",0.007576,-2.120574,-2.120574,5.34337e-32,True,9.26839e-31,negative,negative
