In [None]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import seaborn as sns


## Importing Scoville data and feature data

Sample S10 (chips) was removed from the analysis because it lacks Scoville information.

In [None]:
# Load the spreadsheet holding the Scoville information for the chili samples.
scoville_path = 'Data/All_Chili_samples/input/chilis_and_scoville.xlsx'
scoville_df = pd.read_excel(scoville_path)

# Show the loaded table as the cell output.
scoville_df

In [None]:

def _is_wanted_column(col):
    """Return True for the 'row ID' column and for every 'Peak area' column."""
    return 'row ID' in col or 'Peak area' in col


# Load only the feature ID and the per-sample peak-area columns.
quant_path = 'Data/cap_manu/input/40_varieties_final_quant.csv'
df = pd.read_csv(quant_path, usecols=_is_wanted_column)

# Shorten each peak-area header to its third underscore-separated token
# (presumably the sample name, e.g. 'S10' - verify against the CSV headers).
short_names = {col: col.split('_')[2] for col in df.columns if 'Peak area' in col}

# Rename the columns and leave out sample S10, which is excluded from the analysis.
df = df.rename(columns=short_names).drop(columns=['S10'])
df


## Selecting features identified as capsaicinoids

122;134;180;202;237;243;246;249;253;254;256;259;260;265;270;272;274;275;277;285;288;301;302;304;311;316;317;321;323;327;337;344;346;349;355;358;373;377;405;407;421;424;432;433;437;438;441;448;453;461;469;482;495;530;545;551;558;576;579;588;590;594;603;607;615;616;619;621;622;626;645;651;665;672;673;686;691;692;703;705;706;715;791;818;913;923;930;948;1117

In [None]:
# Prompt the user for the row IDs of the features to keep.
# The ID list given in this notebook is semicolon-separated, so ';' stays the
# primary delimiter; ',' is accepted as well because the old prompt asked for
# comma-separated input. Whitespace around IDs and empty tokens are discarded
# so that input such as "122; 134;" still matches.
raw_ids = input("Enter specific row IDs (semicolon- or comma-separated): ")
selected_row_ids = [
    token.strip()
    for token in raw_ids.replace(',', ';').split(';')
    if token.strip()
]

# Compare as strings because input() always returns text.
row_id_text = df['row ID'].astype(str)

# Report requested IDs that do not occur in the table instead of dropping them silently.
missing_ids = sorted(set(selected_row_ids) - set(row_id_text))
if missing_ids:
    print("Row IDs not found in the feature table:", missing_ids)

# Reduce the DataFrame to the selected row IDs. The explicit copy makes
# reduced_df independent of df, so later assignments to it do not raise
# SettingWithCopyWarning.
reduced_df = df[row_id_text.isin(selected_row_ids)].copy()

reduced_df

## Summing the intensities of all selected features for each sample

In [None]:
# Keep only the feature rows. A 'SumIntensity' row left over from an earlier
# run of this cell is dropped first, so re-running the cell does not add the
# previous total into the new one.
feature_rows = reduced_df.drop(index='SumIntensity', errors='ignore')

# Per-sample sum over the selected features; 'row ID' is excluded by name
# rather than by column position.
sum_intensity = feature_rows.drop(columns='row ID').sum(axis=0)

# Append the totals as a row labelled 'SumIntensity'. Working on an explicit
# copy avoids SettingWithCopyWarning, because reduced_df was created by
# filtering df. The 'row ID' entry of the new row is NaN, as there is no
# feature ID for a total.
reduced_df = feature_rows.copy()
reduced_df.loc['SumIntensity'] = sum_intensity

reduced_df

In [None]:
# reduced_df.sum(axis=0)

In [None]:
# column_sums = np.sum(reduced_df, axis=0)
# column_sums

In [None]:
# Flip the table so every original column becomes a row, then move the old
# column labels out of the index into a regular column.
samples_as_rows = reduced_df.T.reset_index()

# Call that first column 'Sample'; all other column labels stay as they are.
samples_as_rows.columns = ['Sample', *samples_as_rows.columns[1:]]

# Discard the row that came from the 'row ID' column and keep only the
# sample name together with its 'SumIntensity' value.
is_sample_row = samples_as_rows['Sample'] != 'row ID'
transposed_df = samples_as_rows.loc[is_sample_row, ['Sample', 'SumIntensity']]


transposed_df

## Combining the Scoville units with the summed feature intensity table and calculating the Spearman correlation and p-value

In [None]:
# Combine the two dataframes based on the 'Sample' column
combined_df = pd.merge(transposed_df, scoville_df, on='Sample')

# Convert both columns to numeric type; values that cannot be parsed become NaN
for column in ('SumIntensity', 'Scoville Units'):
    combined_df[column] = pd.to_numeric(combined_df[column], errors='coerce')

# Calculate Spearman's correlation between 'SumIntensity' and 'Scoville Units'.
# nan_policy='omit' skips pairs containing NaN; with the default ('propagate')
# a single coerced value would turn the whole result into nan.
correlation, p_value = spearmanr(
    combined_df['SumIntensity'],
    combined_df['Scoville Units'],
    nan_policy='omit',
)

# Print the correlation coefficient and its p-value
print("Spearman's correlation between SumIntensity and Scoville Units:", correlation)
print("Spearman's p-value:", p_value)

In [None]:
# Display the current contents of combined_df for visual inspection.
combined_df

In [None]:
# Check for missing or non-numeric values in the 'SumIntensity' and
# 'Scoville Units' columns. The intensity column is named 'SumIntensity';
# the previous 'LogSumIntensity' lookup raised a KeyError.
checked_columns = ['SumIntensity', 'Scoville Units']

# Rows where either column is already NaN
missing_values = combined_df[checked_columns].isnull().any(axis=1)

# Rows where either column cannot be interpreted as a number
non_numeric_values = (
    combined_df[checked_columns]
    .apply(pd.to_numeric, errors='coerce')
    .isnull()
    .any(axis=1)
)

# Print the rows with missing or non-numeric values
print("Rows with missing or non-numeric values:")
print(combined_df[missing_values | non_numeric_values])

## Plotting the data

In [None]:

# Combine the two dataframes based on the 'Sample' column
combined_df = pd.merge(transposed_df, scoville_df, on='Sample')

# Convert both columns to numeric type; values that cannot be parsed become NaN
for column in ('SumIntensity', 'Scoville Units'):
    combined_df[column] = pd.to_numeric(combined_df[column], errors='coerce')

# Calculate Spearman's correlation between 'SumIntensity' and 'Scoville Units'.
# nan_policy='omit' skips pairs containing NaN instead of returning nan.
correlation, p_value = spearmanr(
    combined_df['SumIntensity'],
    combined_df['Scoville Units'],
    nan_policy='omit',
)
print("Spearman's correlation between SumIntensity and Scoville Units:", correlation)

# Create a scatter plot. The former plt.plot(correlation, p_value) call was
# removed: it drew the two statistics as if they were a data point.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(combined_df['SumIntensity'], combined_df['Scoville Units'], s=30, alpha=0.5)
ax.set_xlabel('SumIntensity')
ax.set_ylabel('Scoville Units')

# Annotate with the correlation coefficient and the p-value, each under its
# correct label. Axes-fraction coordinates keep the box in the top-left corner
# regardless of the data range.
ax.text(
    0.05,
    0.95,
    f"Spearman's r = {correlation:.2f} (p = {p_value:.2g})",
    transform=ax.transAxes,
    va='top',
    bbox=dict(facecolor='red', alpha=0.5),
)

# Set logarithmic scale for the axes
ax.set_xscale('log')
ax.set_xlim(left=100000)  # lower display limit; points with SumIntensity below 1e5 are not shown
ax.set_yscale('log')
ax.set_ylim(bottom=1)  # set to 1 to avoid log(0) error
ax.set_title('Scatter Plot: Capsaicinoid Sum Intensity vs. Scoville Units')
ax.grid(True)

# Save before any plt.show() call: showing first can leave an empty figure to save.
fig.savefig("Data/All_Chili_samples/output/scatter_plot.svg", format='svg')