## Import Packages

In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import rcParams
from sklearn.preprocessing import MinMaxScaler
import matplotlib
import numpy as np
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt


## Import needed Files

In [2]:
# Quantification table. Columns whose header contains 'Peak area' are renamed
# to the third underscore-separated token of that header.
quant = pd.read_csv("Data/cap_manu/input/40_varieties_final_quant_no_cap_or_di.csv")
peak_area_names = {col: col.split('_')[2] for col in quant.columns if 'Peak area' in col}
quant = quant.rename(columns=peak_area_names)

# Keep the columns whose name starts with 'S' plus the feature identifier.
# NOTE(review): presumably the 'S…' columns are the per-sample peak areas — confirm.
sample_cols = [col for col in quant.columns if col.startswith('S')]
quant_area = quant[sample_cols + ['row ID']]

# CANOPUS summary: only the feature id and its predicted NPC class are read.
sirius = pd.read_csv(
    "Data/cap_manu/input/canopus_compound_summary_40_varieties.tsv",
    sep='\t',
    usecols=['featureId', 'NPC#class'],
)

# Sample-name lookup table.
sample_names = pd.read_excel('Data/cap_manu/input/sample_names.xlsx')

# EC50 table; merged with the intensity tables on 'Sample' further down.
EC50 = pd.read_excel('Data/cap_manu/input/chili_EC50_new.xlsx')

In [None]:
# Merge the SIRIUS and quant tables so every feature carries its NPC class.
# rename() is rebound instead of called with inplace=True: quant_area is a
# column selection of quant, and mutating such a selection in place triggers
# SettingWithCopyWarning on pandas < 3.
quant_area = quant_area.rename(columns={'row ID': 'featureId'})

# Left join keeps every quantified feature; features missing from the SIRIUS
# table get NaN as their class.
table = pd.merge(quant_area, sirius, on='featureId', how='left')
table

## Analysis with only one class

Select the compound class of interest

In [4]:
# Ask for the NPC class of interest, e.g. "Capsaicins and Capsaicinoids".
# Surrounding whitespace is stripped so a stray space does not cause an
# empty selection.
selected_row_ids = input('Enter NPC#Class' ).strip()

# Keep the features whose NPC class contains the entered text.
# - regex=False: the text is matched literally, so characters such as '(' or
#   '+' in a class name are not interpreted as regular-expression syntax.
# - fillna(''): features without a class are compared as an empty string
#   rather than as the text 'nan'.
npc_class = table['NPC#class'].fillna('').astype(str)
reduced_table = table[npc_class.str.contains(selected_row_ids, regex=False)]

In [None]:
# Drop everything except the columns whose name starts with 'S'
# (this removes the NPC class and feature id columns).
sample_cols = [name for name in reduced_table.columns if name.startswith('S')]
final_table = reduced_table[sample_cols]

# Order the columns by the integer that follows the first character.
ordered_cols = sorted(final_table.columns, key=lambda name: int(name[1:]))
final_table = final_table[ordered_cols]

# Uncomment the next line to label the columns with the common names instead.
#final_table.columns = final_table.columns.map(sample_names.set_index('Sample')['Common Name'])

final_table

Create a sum intesity of all feature for a given SIRIUS class

In [6]:
# Summed intensity per sample for the selected class.
# DataFrame.sum() with its default axis adds up each column, i.e. one total
# per column of final_table.
# final_table itself is NOT modified: appending a 'SumIntensity' row in place
# made this cell non-idempotent (a second run summed the previous total as
# well). The drop() guards against a total row left over from earlier runs.
feature_rows = final_table.drop(index='SumIntensity', errors='ignore')
sum_intensity = feature_rows.sum()

# One row per column of final_table: its name, the total, and the natural
# log of the total.
final_table_t = pd.DataFrame({
    'Sample': sum_intensity.index,
    'SumIntensity': sum_intensity.to_numpy(),
})
log_SI = np.log(final_table_t['SumIntensity'])
final_table_t['Log SumIntensity'] = log_SI

Merge MS data with ED50 data

In [None]:
# Join the EC50 table with the per-sample summed intensities.
# The default inner join keeps only samples present in both tables.
EC_sum = EC50.merge(final_table_t, on='Sample')
EC_sum

Correlation and plotting with Subset (can be skipped)

In [None]:
# EC_sum_subset = EC_sum[EC_sum['log EC50 (ug/ml)'] < 1]
# EC_sum_subset['log_si'].corr(EC_sum_subset['log EC50 (ug/ml)'])

Removed outliers (can be skipped)

In [None]:
# Samples excluded as outliers.
outlier_samples = ['S7', 'S25']
is_outlier = EC_sum['Sample'].isin(outlier_samples)
filt_EC_sum = EC_sum[~is_outlier]

# Pearson correlation on the filtered data.
# NOTE(review): this uses the '-logIC50' column while the plot in the next
# cell uses '-EC5_old' — confirm which column is intended.
log_intensity = filt_EC_sum['Log SumIntensity']
pearson = log_intensity.corr(filt_EC_sum['-logIC50'])
print(pearson)

In [None]:
# Regression plot of the outlier-filtered data, annotated with Pearson's r.
x_vals = filt_EC_sum['Log SumIntensity']
y_vals = filt_EC_sum['-EC5_old']
pearson = x_vals.corr(y_vals)

ax = sns.regplot(x=x_vals, y=y_vals)
ax.set_title('Pearson Correlation of Crude Extract ED50 Values and Capsaicinoid Content')
ax.set_ylabel('-Log ED50')
ax.set_xlabel('Log Sum Intensity of all Capsaicinoid features')

# NOTE(review): the text position is given in data coordinates — confirm that
# (0.01, -0.01) lies inside the plotted range.
ax.text(
    0.01,
    -0.01,
    f'Pearson Correlation {pearson:.2f}',
    bbox={'facecolor': 'red', 'alpha': 0.5},
)

plt.savefig('Data/cap_manu/ED50VSCapsaicinoids_noCap.svg')
plt.show()

plotting data without removal of outlier

In [None]:
# Spearman rank correlation between the log summed intensity and logIC50.
correlation, p_value = spearmanr(EC_sum['Log SumIntensity'], EC_sum['logIC50'])

# Pearson correlation on the same two columns. It used to be computed as a
# bare mid-cell expression, so the value was neither stored nor displayed.
pearson_all = EC_sum['Log SumIntensity'].corr(EC_sum['logIC50'])

# Report both coefficients.
print("Spearman's correlation between SumIntensity and ec50:", correlation)
print("Pearson's correlation between SumIntensity and ec50:", pearson_all)

In [None]:
# Regression plot of all samples (no outlier removal), annotated with Pearson's r.
x_vals = EC_sum['Log SumIntensity']
y_vals = EC_sum['-EC5_old']
pearson = x_vals.corr(y_vals)

ax = sns.regplot(x=x_vals, y=y_vals)
ax.set_title('Pearson Correlation of Crude Extract EC50 Values and Capsaicinoid Content')
ax.set_ylabel('-Log ED50')
ax.set_xlabel('Log Sum Intensity of all Capsaicinoid features')

# NOTE(review): the text position is given in data coordinates — confirm that
# (0.01, -0.01) lies inside the plotted range.
ax.text(
    0.01,
    -0.01,
    f'Pearson Correlation {pearson:.2f}',
    bbox={'facecolor': 'red', 'alpha': 0.5},
)

plt.savefig('Data/cap_manu/test.svg')
plt.show()

## All compounds but the selected class

In [56]:
# Complement of the single-class analysis: every feature whose NPC class does
# NOT contain the class name entered earlier (`selected_row_ids`).
# - regex=False: the name is matched literally, so characters such as '(' or
#   '+' are not interpreted as regular-expression syntax.
# - fillna(''): features without a class are compared as an empty string
#   rather than as the text 'nan'; they stay in this complement set.
npc_class = table['NPC#class'].fillna('').astype(str)
minus_table = table[~npc_class.str.contains(selected_row_ids, regex=False)]

# Keep only the columns whose name starts with 'S' (drops the NPC class and
# feature id), ordered by the integer after the first character.
# .copy() makes the result an independent frame so that later assignments to
# it do not raise SettingWithCopyWarning.
sample_cols = [c for c in minus_table.columns if c.startswith('S')]
minus_table_r = minus_table[sorted(sample_cols, key=lambda c: int(c[1:]))].copy()



In [None]:

# Summed intensity per column of minus_table_r.
# minus_table_r is NOT modified: appending a 'SumIntensity' row in place made
# this cell non-idempotent (a second run summed the previous total as well).
# The drop() guards against a total row left over from earlier runs.
feature_rows = minus_table_r.drop(index='SumIntensity', errors='ignore')
si = feature_rows.sum()

# Transposed layout, same as before: one row per column of minus_table_r, with
# every feature as a column plus 'SumIntensity'; the former column names
# become the 'Sample' column.
minus_table_t = (
    feature_rows.T
    .assign(SumIntensity=si)
    .rename_axis('Sample')
    .reset_index()
)

# Natural log of the summed intensity, taken on the Series (single brackets)
# rather than on a one-column DataFrame.
minus_table_t['log_si'] = np.log(minus_table_t['SumIntensity'])

In [None]:
# Join the EC50 table with the intensities of everything outside the selected
# class; the default inner join keeps samples present in both tables.
EC_sum_minus = EC50.merge(minus_table_t, on='Sample')
EC_sum_minus

In [None]:
# Pearson correlation between log summed intensity and the '-logIC50' column.
log_si_minus = EC_sum_minus['log_si']
log_si_minus.corr(EC_sum_minus['-logIC50'])

In [None]:
# Regression plot for all features outside the selected class, annotated with
# Pearson's r.
x_vals = EC_sum_minus['log_si']
y_vals = EC_sum_minus['-EC50']
pearson = x_vals.corr(y_vals)

ax = sns.regplot(x=x_vals, y=y_vals)
ax.set_title('Pearson Correlation: Crude Extract EC50 and non-Capsaicinoid Features')
ax.set_ylabel('-Log EC50')
ax.set_xlabel('Log Sum Intensity of all non-Capsaicinoid features')

# Annotation position is given in data coordinates.
ax.text(
    20,
    -0.5,
    f'Pearson Correlation {pearson:.2f}',
    bbox={'facecolor': 'red', 'alpha': 0.5},
)

plt.savefig('Data/cap_manu/minus_caps_ec50.svg')
plt.show()

## All compounds

In [None]:
# All features, regardless of class: keep only the columns whose name starts
# with 'S'.
quant_r = quant[[c for c in quant.columns if c.startswith('S')]]

# Summed intensity per column of quant_r.
# quant_r is NOT modified: appending a 'SumIntensity' row in place made this
# cell non-idempotent and wrote into a column selection of quant.
si = quant_r.sum()

# Transposed layout, same as before: one row per column of quant_r, with every
# feature as a column plus 'SumIntensity'; the former column names become the
# 'Sample' column.
quant_t = (
    quant_r.T
    .assign(SumIntensity=si)
    .rename_axis('Sample')
    .reset_index()
)

# Natural log of the summed intensity, taken on the Series (single brackets)
# rather than on a one-column DataFrame.
quant_t['log_si'] = np.log(quant_t['SumIntensity'])

In [None]:
# Join the EC50 table with the all-feature intensities; the default inner join
# keeps samples present in both tables.
EC_sum_all = EC50.merge(quant_t, on='Sample')
EC_sum_all

In [None]:
# Spearman rank correlation between the 'log_si' and 'EC50' columns.
log_si_all = EC_sum_all['log_si']
ec50_all = EC_sum_all['EC50']
correlation, p_value = spearmanr(log_si_all, ec50_all)

# Report the coefficient.
print("Spearman's correlation between SumIntensity and ec50:", correlation)

In [None]:
# Pearson correlation between the 'log_si' and 'EC50' columns.
log_si_series = EC_sum_all['log_si']
log_si_series.corr(EC_sum_all['EC50'])