In [1]:
import pandas as pd
import numpy as np
# Show up to 39 columns when pandas renders a DataFrame.
pd.set_option('display.max_columns', 39)

In [2]:
# Read the three input tables from CSV files in the working directory.
asa24 = pd.read_csv('ingredient_fiber_carb_weights_nndc_120721.csv')
matches = pd.read_csv('asa_glycan_foods_subject_filtered_qcd_122221.csv')

# Monosaccharide data are not available for persimmon, so that row is
# excluded from the glycopedia table as soon as it is read.
glycopedia = (
    pd.read_csv('glycopedia_wet_wt_040722.csv')
    .loc[lambda table: table['Simple name'].ne('Fuyu persimmon flesh')]
)

In [3]:
# Drop the '_y' suffix so the column carries the same name that is used to
# filter and merge asa24 in the following cells.
matches = matches.rename(
    {'Ingredient description_y': 'Ingredient description'},
    axis='columns',
)
# Plain Python list of every matched ingredient description (duplicates kept).
food_list = matches['Ingredient description'].tolist()

In [4]:
# Count the distinct non-null values in the 'Ingredient description' column of matches.
matches['Ingredient description'].nunique()

496

In [5]:
# Keep only the asa24 rows whose ingredient description appears in food_list.
in_food_list = asa24['Ingredient description'].isin(food_list)
asa24_match = asa24.loc[in_food_list]

In [6]:
# Attach the columns of matches to each matched asa24 row
# (default inner join on 'Ingredient description').
asa24_200 = asa24_match.merge(matches, on='Ingredient description')

In [7]:
# Rename 'glycan_food' to 'Simple name', the key later used to merge with glycopedia.
asa24_200 = asa24_200.rename(columns={'glycan_food': 'Simple name'})

In [8]:
# Count the distinct non-null values in the 'Ingredient description' column of asa24.
asa24['Ingredient description'].nunique()

1199

In [9]:
# Remove the long descriptive label column from glycopedia
# (the column name ends with a trailing space).
glycopedia = glycopedia.drop(
    columns=['Food, Varietal, Parts, Form (pear, Bartlet, whole without seeds, fresh) ']
)

In [11]:
# Join each consumed ingredient with the glycopedia row that shares its
# 'Simple name' (default inner join).
asa_glycan = asa24_200.merge(glycopedia, on='Simple name')

In [15]:
# Per-subject ratio of 'cal_from_carb' summed over the matched rows (asa24_200)
# to 'cal_from_carb' summed over all asa24 rows. The two sums are aligned on
# UserName; a subject missing from either side yields NaN.
# .sum() replaces .agg(np.sum), which pandas >= 2.1 flags with a FutureWarning.
cal_per_sub = (
    asa24_200.groupby('UserName')['cal_from_carb'].sum()
    / asa24.groupby('UserName')['cal_from_carb'].sum()
)

In [18]:
# Keep the subjects whose ratio in cal_per_sub is at least 0.75, sorted ascending.
# An explicit threshold replaces the earlier positional slice [170:], which
# only matched the 0.75 cut-off for this exact dataset and would also have
# kept NaN ratios (sort_values places them last). NaN compares False here and
# is therefore excluded.
CARB_CAL_COVERAGE_MIN = 0.75
cut75 = cal_per_sub[cal_per_sub >= CARB_CAL_COVERAGE_MIN].sort_values(ascending=True)

In [19]:
# Display the retained per-subject ratios (a Series indexed by UserName).
cut75

UserName
8017    0.750034
6028    0.751897
8006    0.753254
8052    0.753321
9031    0.754190
          ...   
6088    0.988572
5015    0.990869
7091    0.991490
8024    0.997324
5046    0.999388
Name: cal_from_carb, Length: 180, dtype: float64

In [21]:
# From here on cut75 is the plain list of retained UserName values
# (the ratios themselves are discarded).
cut75 = cut75.index.tolist()

In [22]:
# Rows of asa_glycan that belong to the subjects listed in cut75.
carb_75_percent = asa_glycan.loc[asa_glycan['UserName'].isin(cut75)]

In [24]:
# Take the label-based column slice from 'Glucose' through 'Ribose' (inclusive)
# and cast it to float64.
glycan_consumed = asa_glycan.loc[:, 'Glucose':'Ribose'].astype('float64')

In [25]:
# Discard every row that has a missing value in any of the selected columns.
glycan_consumed = glycan_consumed.dropna(axis=0, how='any')

In [28]:
# Multiply each row by that row's 'ingredient_consumed_g' value
# (rows are aligned on the shared index).
glycan_consumed = glycan_consumed.multiply(
    asa_glycan['ingredient_consumed_g'], axis='index'
)

In [30]:
# Remove the original 'Glucose'..'Ribose' columns; the scaled versions are
# concatenated back in the next step.
glycan_columns = asa_glycan.loc[:, 'Glucose':'Ribose'].columns
asa_glycan = asa_glycan.drop(columns=glycan_columns)

In [31]:
# Append the glycan_consumed columns side by side, aligned on the row index;
# rows absent from glycan_consumed get NaN in those columns.
asa_glycan = pd.concat((asa_glycan, glycan_consumed), axis='columns')

In [32]:
# Remove the 'Unnamed: 0' column.
asa_glycan = asa_glycan.drop('Unnamed: 0', axis=1)

In [34]:
# Totals per subject and recall: sum the fiber, carbohydrate, calorie and
# monosaccharide columns over all rows of each (UserName, RecallNo) pair.
recall_total_cols = [
    'fiber_consumed_g', 'carb_consumed_g', 'cal_consumed',
    'Glucose', 'Galactose', 'Fructose', 'Xylose', 'Arabinose', 'Fucose',
    'Rhamnose', 'GlcA', 'GalA', 'GlcNAc', 'GalNAc', 'Mannose', 'Allose',
    'Ribose',
]
# .sum() replaces .agg(np.sum), which pandas >= 2.1 flags with a FutureWarning.
by_subject_recall = asa_glycan.groupby(['UserName', 'RecallNo'])[recall_total_cols].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,fiber_consumed_g,carb_consumed_g,cal_consumed,Glucose,Galactose,Fructose,Xylose,Arabinose,Fucose,Rhamnose,GlcA,GalA,GlcNAc,GalNAc,Mannose,Allose,Ribose
UserName,RecallNo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
5001,2,24.571876,162.182134,1428.047393,83.654474,1.495191,8.440216,1.339542,3.033609,0.111323,0.138670,0.019296,1.963423,0.026375,0.000000,0.383851,0.011356,0.140874
5001,3,5.626388,87.920615,1432.988052,65.380496,1.729390,1.426981,0.718316,1.085795,0.087799,0.043806,0.008809,0.495483,0.011248,0.014456,0.504570,0.000000,0.240711
5001,4,16.840734,186.731659,2028.703168,139.659471,2.433918,3.415244,1.036547,1.796971,0.069246,0.101127,0.026148,1.564876,0.022564,0.014275,0.535401,0.000297,0.093765
5002,2,13.662213,70.222101,1681.594010,25.241327,2.273998,4.656228,0.924506,2.814249,0.192088,0.289766,0.006652,1.458701,0.004698,0.000057,0.396931,0.000000,0.102441
5002,3,6.802463,32.712671,473.092703,11.649118,1.041317,2.767005,0.274468,0.662660,0.070569,0.158718,0.003303,0.963700,0.001544,0.000086,0.451723,0.000000,0.045133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9067,3,19.894000,246.735707,1698.461435,155.875071,17.823698,9.930409,2.468439,2.807767,0.118760,0.114501,0.014168,1.012528,0.006283,0.000100,0.900596,0.026627,0.093524
9067,4,25.322555,251.300861,2287.946167,124.177805,4.414268,13.060763,2.831887,2.423935,0.359975,0.198184,0.023279,2.585555,0.012742,0.001807,1.288912,0.000035,0.073584
9069,2,11.209386,136.614605,2384.800768,60.076187,9.301586,2.576108,0.857100,0.765286,0.125516,0.073791,0.018254,0.863909,0.013018,0.000000,0.449011,0.000000,0.450670
9069,3,22.053427,324.628481,2298.367764,163.299179,3.669706,14.328654,2.102233,1.954997,0.128041,0.211612,0.028202,0.769856,0.101838,0.046322,0.618994,0.000293,0.081916


In [35]:
# Per-subject mean of the per-recall totals: average each column over the
# recalls available for every UserName.
recall_mean_cols = [
    'fiber_consumed_g', 'carb_consumed_g', 'cal_consumed',
    'Glucose', 'Galactose', 'Fructose', 'Xylose', 'Arabinose', 'Fucose',
    'Rhamnose', 'GlcA', 'GalA', 'GlcNAc', 'GalNAc', 'Mannose', 'Allose',
    'Ribose',
]
# .mean() replaces .agg(np.mean), which pandas >= 2.1 flags with a FutureWarning.
average_recall_mean_cal = by_subject_recall.groupby('UserName')[recall_mean_cols].mean()

Unnamed: 0_level_0,fiber_consumed_g,carb_consumed_g,cal_consumed,Glucose,Galactose,Fructose,Xylose,Arabinose,Fucose,Rhamnose,GlcA,GalA,GlcNAc,GalNAc,Mannose,Allose,Ribose
UserName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5001,15.679666,145.611469,1629.912871,96.231480,1.886167,4.427480,1.031468,1.972125,0.089456,0.094534,0.018084,1.341261,0.020062,0.009577,0.474607,0.003884,0.158450
5002,10.209399,60.873800,1428.573842,21.399433,1.659712,3.868750,0.560269,1.385892,0.122152,0.207512,0.008447,1.225127,0.004800,0.000083,0.533331,0.000000,0.139579
5005,5.392515,62.348411,844.644235,44.197600,1.416220,2.387115,0.453311,0.691448,0.044519,0.075211,0.010435,0.737761,0.009533,0.002788,0.691849,0.000056,0.150510
5006,27.081501,161.456345,2102.082336,71.588498,7.750320,9.237649,1.478130,2.404249,0.272456,0.255789,0.018551,3.149565,0.010522,0.001606,0.750376,0.007202,0.268489
5007,10.875780,130.813319,987.526436,61.417807,2.511428,2.688419,0.962227,1.728356,0.117555,0.121788,0.028938,0.497769,0.012215,0.004448,1.963867,0.000726,0.097799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9062,23.972730,125.075589,1251.886975,65.012919,1.735868,7.725265,1.268340,2.186905,0.146231,0.156422,0.011130,1.969182,0.019490,0.013414,0.598636,0.000276,0.201810
9065,26.207626,149.104296,1321.279735,75.131086,3.020815,12.504472,1.365940,3.262148,0.295460,0.240637,0.010969,2.394403,0.041190,0.030815,1.176532,0.006856,0.146350
9066,20.927039,179.285590,1467.641122,101.528012,5.647021,4.352449,2.891499,2.633941,0.158431,0.122006,0.032634,0.911368,0.010486,0.001366,1.226310,0.000431,0.106002
9067,26.514018,244.395739,1787.700201,129.567942,11.637730,10.542463,2.717884,3.675936,0.267449,0.303625,0.014863,3.037962,0.006609,0.000671,1.198996,0.008888,0.102734


In [36]:
# Copy of the per-subject means with UserName moved from the index to a column.
average_recall_mean_cal_75 = average_recall_mean_cal.reset_index(drop=False)

In [37]:
# Restrict the table to the subjects listed in cut75.
keep_subject = average_recall_mean_cal_75['UserName'].isin(cut75)
average_recall_mean_cal_75 = average_recall_mean_cal_75.loc[keep_subject]

In [35]:
# Write the filtered per-subject means to CSV without the row index.
average_recall_mean_cal_75.to_csv('asa_glycan_energy_unadjusted_040722.csv', index=False)

In [39]:
# Energy-scaled means: divide every remaining column by the subject's mean
# 'cal_consumed' and multiply by 1000, then move UserName back to a column.
average_recall_mean = (
    average_recall_mean_cal
    .drop(columns=['cal_consumed'])
    .div(average_recall_mean_cal['cal_consumed'], axis=0)
    .mul(1000)
    .reset_index()
)
average_recall_mean

Unnamed: 0,UserName,fiber_consumed_g,carb_consumed_g,Glucose,Galactose,Fructose,Xylose,Arabinose,Fucose,Rhamnose,GlcA,GalA,GlcNAc,GalNAc,Mannose,Allose,Ribose
0,5001,9.619941,89.336965,59.040874,1.157219,2.716391,0.632837,1.209957,0.054884,0.058000,0.011095,0.822903,0.012309,0.005876,0.291186,0.002383,0.097214
1,5002,7.146567,42.611588,14.979577,1.161797,2.708120,0.392188,0.970123,0.085506,0.145258,0.005913,0.857588,0.003360,0.000058,0.373331,0.000000,0.097705
2,5005,6.384363,73.816180,52.326883,1.676706,2.826178,0.536688,0.818627,0.052707,0.089045,0.012354,0.873458,0.011287,0.003301,0.819101,0.000066,0.178194
3,5006,12.883178,76.807812,34.055992,3.686972,4.394523,0.703174,1.143747,0.129612,0.121684,0.008825,1.498307,0.005005,0.000764,0.356968,0.003426,0.127725
4,5007,11.013153,132.465638,62.193583,2.543150,2.722377,0.974381,1.750187,0.119039,0.123326,0.029303,0.504056,0.012370,0.004504,1.988673,0.000736,0.099034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,9062,19.149277,99.909649,51.931940,1.386602,6.170897,1.013143,1.746887,0.116809,0.124949,0.008890,1.572971,0.015569,0.010715,0.478187,0.000220,0.161205
346,9065,19.835032,112.848394,56.862361,2.286280,9.463910,1.033801,2.468931,0.223617,0.182124,0.008302,1.812185,0.031174,0.023322,0.890449,0.005189,0.110764
347,9066,14.258962,122.159013,69.177683,3.847685,2.965609,1.970168,1.794677,0.107949,0.083131,0.022236,0.620975,0.007145,0.000931,0.835565,0.000294,0.072226
348,9067,14.831356,136.709578,72.477444,6.509889,5.897221,1.520324,2.056237,0.149605,0.169841,0.008314,1.699369,0.003697,0.000375,0.670692,0.004971,0.057467


In [40]:
# Energy-scaled means restricted to the subjects listed in cut75.
average_recall_mean_75 = average_recall_mean.loc[
    average_recall_mean['UserName'].isin(cut75)
]

In [41]:
# Move UserName from the index into a regular column.
# NOTE: re-running this cell would insert an extra 'index' column.
average_recall_mean_cal = average_recall_mean_cal.reset_index(drop=False)

In [42]:
# Per-subject divisor used for the calorie adjustment: mean 'cal_consumed' / 1000.
# average_recall_mean was built as value / cal_consumed * 1000, so the earlier
# Glucose / Glucose ratio reduced to this same quantity but produced NaN
# whenever a subject's Glucose total was 0. Computing it directly avoids that.
cal_correct = average_recall_mean_cal['cal_consumed'] / 1000

In [43]:
# Label each divisor with its subject; the rows of cal_correct and
# average_recall_mean are in the same positional order.
cal_correct.index = pd.Index(average_recall_mean['UserName'])

In [44]:
# Display the per-subject divisor (a Series indexed by UserName).
cal_correct

UserName
5001    1.629913
5002    1.428574
5005    0.844644
5006    2.102082
5007    0.987526
          ...   
9062    1.251887
9065    1.321280
9066    1.467641
9067    1.787700
9069    2.799507
Name: Glucose, Length: 350, dtype: float64

In [45]:
# Store the current row index in an 'idx' column; it is one of the grouping
# keys in the next step.
asa_glycan = asa_glycan.assign(idx=asa_glycan.index)

In [47]:
# Item-level table: group by the descriptive columns (including the row id
# 'idx') and sum the intake columns within each group. dropna=True discards
# rows that have a missing value in any grouping key. UserName is placed first
# so that it becomes level 0 of the resulting MultiIndex.
item_key_cols = [
    'UserName', 'FoodCode', 'idx', 'Food_Description', 'Ingredient code',
    'WWEIA Category number', 'WWEIA Category description',
    'Ingredient description', 'FoodAmt', 'FoodNum', 'RecallNo',
    'ingredient_consumed_g', 'Fiber (g)', 'Carbohydrate (g)', 'Energy (kcal)',
    'cal_consumed', 'cal_from_carb', 'Simple name', 'glycan_food_class',
]
item_value_cols = [
    'fiber_consumed_g', 'carb_consumed_g', 'Glucose', 'Galactose', 'Fructose',
    'Arabinose', 'Xylose', 'Fucose', 'Rhamnose', 'GlcA', 'GalA', 'GlcNAc',
    'GalNAc', 'Mannose', 'Allose', 'Ribose',
]
# .sum() replaces .agg(np.sum), which pandas >= 2.1 flags with a FutureWarning.
ingred_all = asa_glycan.groupby(item_key_cols, dropna=True)[item_value_cols].sum()

In [50]:
# Flatten the grouping keys back into columns and keep only the subjects
# listed in cut75. This is taken before the calorie adjustment is applied.
unadjusted_intake = (
    ingred_all
    .reset_index()
    .loc[lambda items: items['UserName'].isin(cut75)]
)

In [53]:
# Count the distinct non-null 'Simple name' values among the retained rows.
unadjusted_intake['Simple name'].nunique()

241

In [48]:
# Count the distinct non-null 'Ingredient code' values among the retained rows.
unadjusted_intake['Ingredient code'].nunique()

465

In [59]:
# One row per distinct 'Ingredient description' (the first occurrence is kept).
supp_table_1 = unadjusted_intake.drop_duplicates(
    subset=['Ingredient description'], keep='first'
)

In [62]:
# Write the supplementary table to CSV. The row index is only the leftover
# positional index from earlier filtering, so it is omitted, consistent with
# the other CSV exports in this notebook.
supp_table_1.to_csv('supp_table_1.csv', index=False)

In [49]:
# Write the unadjusted item-level table to CSV without the row index.
unadjusted_intake.to_csv('all_items_unadjusted_041122.csv', index=False)

In [50]:
# Calorie adjustment: divide every intake column by the subject's divisor,
# matching cal_correct's index against level 0 (UserName) of the MultiIndex.
ingred_all = ingred_all.divide(cal_correct, axis='index', level=0)

In [51]:
# Flatten the grouping keys back into columns and keep only the subjects
# listed in cut75.
ingred_all = (
    ingred_all
    .reset_index()
    .loc[lambda items: items['UserName'].isin(cut75)]
)

In [53]:
# The previous cell already flattened the index and filtered to cut75, so all
# that remains is to renumber the rows 0..n-1. drop=True discards the old
# positional index directly, replacing the earlier reset / re-filter /
# drop('index') round trip with the same result.
ingred_all = ingred_all.reset_index(drop=True)
#ingred_all = pd.merge(ingred_all, wweia, on = 'WWEIA Category number')

In [54]:
#ingred_all= ingred_all.drop(columns=['level_0'])

In [55]:
# Write the calorie-adjusted item-level table to CSV without the row index.
ingred_all.to_csv('all_items_cal_adjusted_041122.csv', index=False)