![](Logo.png)

# <font color='red'>Sensory Quality and Composition of NCSU Peanut Germplasm </font>

> ## Correlation Among Attributes
> ## Clustering Based on Attributes
> ## Cluster Mean Separation
> ## Germplasm Selection

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the 'Data' worksheet of the flavor workbook into a DataFrame
data = pd.read_excel(
    'Flavor Data - NCSU.xlsx',
    sheet_name='Data',
)

In [None]:
# Preview the first five rows of the raw data
data.head(n=5)

In [None]:
# Print the sample variance of every column from position 6 onward
# (missing values are dropped first) to help decide which columns to keep
from statistics import variance

variance_columns = data.columns[6:]
for col in variance_columns:
    observed = data[col].dropna()
    print("Variance of %s = %s" % (col, variance(observed)))

In [None]:
# Descriptive statistics for the columns at positions 6 through 17 (zero-based),
# computed only on rows with no missing value in that range
data.iloc[:, 6:18].dropna().describe()

In [None]:
# Pairplot of the columns at positions 6 through 17 (zero-based) - disabled; uncomment the line below to draw it
#sns.pairplot(data[list(data.columns[6:18])].dropna())

In [None]:
# List every column name of the raw data as a reference for building the subset.
# Original note: the subset uses columns 1 through 17, dropping Astringent
# NOTE(review): confirm that range against the column list shown here
data.columns

In [None]:
# Preview the first five rows of the raw data again
data.head(n=5)

In [None]:
# Subset of Data: identifier columns plus the traits of interest,
# keeping only rows that have a value in every selected column
subset_columns = [
    'GIN', 'NC_Accession', 'Seed Source', 'Rep',
    'mean_oil', 'raw_mc_ww', 'roast_color', 'paste_color',
    'dark_roast', 'raw_bean', 'roast_peanut', 'sweet_aromatic',
    'sweet', 'bitter', 'wood_hulls_skins', 'cardboard',
]
data_sub = data.loc[:, subset_columns].dropna()

In [None]:
# Pairplot for subset of Data
#sns.pairplot(data_sub[list(data_sub.columns[5:])])

In [None]:
# Check cultivars: accessions whose NC_Accession appears in this list are
# flagged 'Yes' in the 'Check' column further down
Cultivars = [
    'Bailey',
    'Bailey II',
    'Emery',
    'Sullivan',
    'Wynne',
    'Georgia 06-G',
    'Bailey-UPPT',
    'Georgia 06-G-UPPT',
]

In [None]:
# Average every trait over the rows of each (GIN, NC_Accession) pair.
# The columns are passed as a list: indexing a GroupBy with a bare tuple of
# names was deprecated in pandas 1.0 and no longer works in pandas 2.x.
mean_columns = [
    'mean_oil', 'raw_mc_ww', 'roast_color', 'paste_color', 'dark_roast',
    'raw_bean', 'roast_peanut', 'sweet_aromatic', 'sweet', 'bitter',
    'cardboard',
]
data_test = data_sub.groupby(['GIN', 'NC_Accession'], as_index=False)[mean_columns].mean()

In [None]:
# Display the per-accession trait means computed in the previous cell
data_test

In [None]:
# Flag each accession: 'Yes' when its NC_Accession is one of the Cultivars, otherwise 'No'
is_check = data_test['NC_Accession'].isin(Cultivars)
data_test['Check'] = is_check.map({True: 'Yes', False: 'No'})

In [None]:
# Produce Correlation Plots - Dimension Reduction Justification

# Trait column -> display label. The keys also decide which columns enter
# the correlation matrix, so the matrix and its labels cannot drift apart.
trait_labels = {
    'mean_oil': 'Mean Oil Content',
    'raw_mc_ww': 'Raw Moisture Content',
    'roast_color': 'Roast Color',
    'paste_color': 'Paste Color',
    'dark_roast': 'Dark Roast',
    'raw_bean': 'Raw Bean',
    'roast_peanut': 'Roast Peanut',
    'sweet_aromatic': 'Sweet Aromatic',
    'sweet': 'Sweetness',
    'bitter': 'Bitterness',
    'cardboard': 'Cardboard',
}

# Select the traits by name rather than calling corr() on the whole frame and
# slicing by position: the frame also holds string columns (NC_Accession,
# Check), which corr() rejects in pandas 2.x, and a numeric GIN column would
# shift a positional [0:11] slice so that 'cardboard' is lost.
corr = data_test[list(trait_labels)].corr()
corr = corr.rename(columns=trait_labels, index=trait_labels)

#corr.head()

# Generate a Mask for the Upper Triangle (diagonal included).
# Uses the builtin bool: the np.bool alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set Up the Matplotlib Figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a Custom Diverging Colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the Heatmap with the Mask and Correct Aspect Ratio
# NOTE(review): vmax=.3 caps the color scale at 0.3, so stronger positive
# correlations all share the same color - confirm this is intended
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Creates New Columns for 2 - 10 Clusters to View in Interactive Plots
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Control Panel
clus_total = 10    # largest number of clusters to fit (fits 2..clus_total)
pca_comp = 5       # number of principal components to keep
cluster_seed = 42  # fixed seed so the cluster ids are the same on every run

# Name the clustering features explicitly instead of slicing by position, so
# columns added to data_test later can never slip into the feature matrix.
feature_columns = [
    'mean_oil', 'raw_mc_ww', 'roast_color', 'paste_color', 'dark_roast',
    'raw_bean', 'roast_peanut', 'sweet_aromatic', 'sweet', 'bitter',
    'cardboard',
]
features = data_test[feature_columns]

# One KMeans fit per cluster count; labels are stored as 'Clusters_<k>'.
# random_state makes the labels reproducible (the later 'Cluster 1/2/3'
# segments are derived from them); n_init is set explicitly because its
# default differs between scikit-learn versions.
# NOTE(review): KMeans is fitted on the unscaled traits while the PCA below
# uses standardised values - confirm that difference is intended
for i in range(2, clus_total + 1):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=cluster_seed)
    kmeans.fit(features)

    data_test['Clusters_%s' % (i)] = kmeans.labels_

# Standardise the traits before PCA
scaler = StandardScaler()
scaled_data = scaler.fit_transform(features)

pca = PCA(n_components=pca_comp)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

# Store each component score as its own column: 'PCA1' .. 'PCA<pca_comp>'
for i in range(0, pca_comp):
    data_test['PCA%s' % (i + 1)] = x_pca[:, i]

In [None]:
# Interactive Plots to Visualize Clusters - Use the Control Panel to Set the Number of Clusters and the PCA_init / PCA_post Components to Plot

import plotly.express as px
# Control Panel
# Clusters: which clustering result to show (2-10)
Clusters = 3
# PCA_init / PCA_post: principal components (PCA1-PCA5) plotted on the x and y axes
PCA_init, PCA_post = 'PCA1', 'PCA2'

# Create a New Data Column: 'Cultivar' for Check Rows, Otherwise 'Cluster k' (One-Based Cluster Number)
def new_char(cols):
    """Return the display label for one row: 'Cultivar' or 'Cluster <k>'.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in order: the 'Check' flag ('Yes' / 'No') and the
        zero-based integer cluster id of the row.

    Returns
    -------
    str
        'Cultivar' when the flag is 'Yes'; otherwise 'Cluster <id + 1>'.
    """
    # Read by position through .iloc when it exists: plain cols[0] on a
    # label-indexed Series is deprecated positional access in pandas.
    values = cols.iloc if hasattr(cols, 'iloc') else cols
    check_flag, cluster_id = values[0], values[1]

    if check_flag == 'Yes':
        return 'Cultivar'

    # Build the label straight from the id, so the function no longer needs
    # the global `Clusters` and never silently returns None.
    return 'Cluster %s' % (int(cluster_id) + 1)

# Build the per-row color labels ('Cultivar' or 'Cluster k') for the chosen clustering
cluster_column = 'Clusters_%s' % (Clusters)
Color = data_test[['Check', cluster_column]].apply(new_char, axis=1)

# Scatter the two selected principal components with plotly.express
fig = px.scatter(
    data_test,
    x="%s" % (PCA_init),
    y="%s" % (PCA_post),
    opacity=0.7,
    color=Color,
    hover_data=['NC_Accession'],
    template='plotly_white',
)

# Enlarge the markers and outline them
marker_style = dict(size=15, line=dict(width=2, color='DarkSlateGrey'))
fig.update_traces(marker=marker_style, selector=dict(mode='markers'))

# Display the Figure
fig.show()

In [None]:
# Output PCA Contributions - Excel or Print
# Absolute component weights: one row per fitted component, one column per trait.
# Row labels are generated from the number of fitted components, so changing
# pca_comp no longer causes a shape mismatch against a fixed five-label index.
component_labels = ['PCA%s' % (k + 1) for k in range(pca.components_.shape[0])]
pca_contributions = pd.DataFrame(
    np.abs(pca.components_),
    index=component_labels,
    # Must list the traits in the same order as the columns the PCA was fitted on
    columns=['mean_oil', 'raw_mc_ww',
             'roast_color', 'paste_color', 'dark_roast',
             'raw_bean', 'roast_peanut', 'sweet_aromatic',
             'sweet', 'bitter', 'cardboard'],
)
#pca_contributions.to_excel('PCA Variable Contributions.xlsx', index=True)
pca_contributions

In [None]:
# Report the share of variance explained by each component, then their total
variance_ratios = pca.explained_variance_ratio_
print("Explained Variance Per PCA =", variance_ratios)
print("Total Explained Variance for PCAs =", variance_ratios.sum())

In [None]:
# Import Libraries
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

In [None]:
# Add the 'Label' column: 'Cultivar' for check rows, otherwise the cluster
# label taken from the clustering selected by `Clusters`
label_inputs = data_test[['Check', 'Clusters_%s' % (Clusters)]]
data_test['Label'] = label_inputs.apply(new_char, axis=1)

# Mean of every trait within each flavor segment (Label)
summary_columns = [
    'mean_oil', 'raw_mc_ww', 'roast_color', 'paste_color', 'dark_roast',
    'raw_bean', 'roast_peanut', 'sweet_aromatic', 'sweet', 'bitter',
    'cardboard',
]
data_test.groupby('Label')[summary_columns].mean()

In [None]:
# Control Variable - Change to Print Output of Other Sensory Attributes
test = 'cardboard'

# One-Way ANOVA of the chosen attribute across the Label groups
model = ols('%s ~ C(Label)'%(test), data=data_test).fit()
print(sm.stats.anova_lm(model, typ=2))
print('--------------------------------------------------------------------')

# Normality of Residuals
print('Shapiro-Wilks Normality',stats.shapiro(model.resid))
print('--------------------------------------------------------------------')

# Homoscedasticity of Variances across every Label group.
# The groups are taken from the data instead of being hard-coded to
# 'Cluster 1'..'Cluster 3' and 'Cultivar', so the test still works when the
# `Clusters` setting is not 3 (a hard-coded group would then be empty or missing).
label_groups = [values for _, values in data_test.groupby('Label')[test]]
print(stats.levene(*label_groups))
print('--------------------------------------------------------------------')

# Post-Hoc Comparisons (Tukey HSD between all pairs of Label groups)
mc = MultiComparison(data_test[test],
                     data_test['Label'])
mc_results = mc.tukeyhsd()
print(mc_results)

In [None]:
# Pull the identifier and color-group columns for the rows marked 'x' in
# 'hi-rep' and, separately, for the rows marked 'x' in 'best_color'
color_columns = ['GIN', 'NC_Accession', 'color_groups']
c_group_hi = data.loc[data['hi-rep'] == 'x', color_columns]
c_group_pref = data.loc[data['best_color'] == 'x', color_columns]

# Left-join the best_color rows onto the hi-rep rows; the shared
# 'color_groups' column is split into '_hi_rep' and '_best_color' versions
c_group_merge = pd.merge(
    c_group_hi,
    c_group_pref,
    how='left',
    on=['GIN', 'NC_Accession'],
    suffixes=['_hi_rep', '_best_color'],
)
c_group_merge.head(15)

In [None]:
# Pull Data Columns for Merging Color Groups to Flavor Segments
c_group_hi = data[['GIN','NC_Accession','color_groups']][data['hi-rep'] == 'x']
c_group_pref = data[['GIN','NC_Accession','color_groups']][data['best_color'] == 'x']
# Merge Dataframes Together to Capture Percentages of Each Flavor Segment
c_group_merge = pd.merge(c_group_hi, c_group_pref, how='left', on=['GIN','NC_Accession'], suffixes=['_hi_rep','_best_color'])

# Attach the per-accession means, cluster ids and Label to every color-group row.
# Both frames carry NC_Accession, so the result holds NC_Accession_x / NC_Accession_y.
clus_color = pd.merge(c_group_merge, data_test, how='left', on='GIN')

# Print Percentages of Each Flavor Segment.
# The left merge leaves Label missing for any GIN absent from data_test;
# those are skipped so no meaningless "nan" section with an empty result is printed.
for i in clus_color['Label'].dropna().unique():
    print("Color Group Percentages for",i)
    print(clus_color.loc[clus_color['Label'] == i, 'color_groups_best_color'].value_counts(normalize=True))

In [None]:
# Drop the duplicate accession column left by the merge and give the
# remaining columns their final names.
# Written as drop(errors='ignore') + rename on a new frame so the cell can be
# re-run safely: the original `del` raised KeyError on a second execution.
clus_color = (
    clus_color
    .drop(columns=['NC_Accession_y'], errors='ignore')
    .rename(columns={
        'NC_Accession_x': 'NC_Accession',
        'color_groups_hi_rep': 'hi_rep',
        'color_groups_best_color': 'best_color',
    })
)

In [None]:
# Export every row whose Label is not 'Cultivar', with the identifier,
# color-group, trait and Label columns, to a CSV file
selection_columns = [
    'GIN', 'NC_Accession', 'hi_rep', 'best_color',
    'mean_oil', 'raw_mc_ww', 'roast_color', 'paste_color',
    'dark_roast', 'raw_bean', 'roast_peanut', 'sweet_aromatic',
    'sweet', 'bitter', 'cardboard', 'Label',
]
not_cultivar = clus_color['Label'] != 'Cultivar'
Flavor_Selections = clus_color.loc[not_cultivar, selection_columns]
Flavor_Selections.to_csv('Flavor Selection Segment.csv', index=False)