In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression

In [2]:
# Load the three single-channel experiment files and the combined RGB file.
r_df, g_df, b_df, rgb_df = (
    pd.read_csv(csv_name)
    for csv_name in ('R.csv', 'G.csv', 'B.csv', 'RGB.csv')
)

### Question 1: Which color has the highest correlation value for each measure? 

In [3]:

# One entry per channel: (frame, base column, color-limited column, full-LED column).
# The base column ('R', 'G', 'B') is presumably the nominal color value that the
# sensor readings are compared against -- confirm against the CSV files.
channels = (
    (r_df, 'R', 'R(Red)', 'R(LED on)'),
    (g_df, 'G', 'G(Green)', 'G(LED on)'),
    (b_df, 'B', 'B(Blue)', 'B(LED on)'),
)

# Pearson, color-limited reading (red, green, blue)
pearson_co_lim_red, pearson_co_lim_green, pearson_co_lim_blue = [
    stats.pearsonr(frame[base], frame[limited])
    for frame, base, limited, _led in channels
]

print(f'Pearson Red color limited {pearson_co_lim_red}')
print(f'Pearson Green color limited {pearson_co_lim_green}')
print(f'Pearson Blue color limited {pearson_co_lim_blue}\n')

# Spearman, color-limited reading (red, green, blue)
spearman_co_lim_red, spearman_co_lim_green, spearman_co_lim_blue = [
    stats.spearmanr(frame[base], frame[limited])
    for frame, base, limited, _led in channels
]

print(f'Spearman Red color limited {spearman_co_lim_red}')
print(f'Spearman Green color limited {spearman_co_lim_green}')
print(f'Spearman Blue color limited {spearman_co_lim_blue}\n')

# Kendall, color-limited reading (red, green, blue)
kendall_co_lim_red, kendall_co_lim_green, kendall_co_lim_blue = [
    stats.kendalltau(frame[base], frame[limited])
    for frame, base, limited, _led in channels
]

print(f'Kendall Red color limited {kendall_co_lim_red}')
print(f'Kendall Green color limited {kendall_co_lim_green}')
print(f'Kendall Blue color limited {kendall_co_lim_blue}\n')

# Spearman, full-LED reading (red, green, blue)
spearman_full_led_red, spearman_full_led_green, spearman_full_led_blue = [
    stats.spearmanr(frame[base], frame[led])
    for frame, base, _limited, led in channels
]

print(f'Spearman Red full LED on {spearman_full_led_red}')
print(f'Spearman Green full LED on {spearman_full_led_green}')
print(f'Spearman Blue full LED on {spearman_full_led_blue}\n')

# Pearson, full-LED reading (red, green, blue)
pearson_full_led_red, pearson_full_led_green, pearson_full_led_blue = [
    stats.pearsonr(frame[base], frame[led])
    for frame, base, _limited, led in channels
]

print(f'Pearson Red full LED on {pearson_full_led_red}')
print(f'Pearson Green full LED on {pearson_full_led_green}')
print(f'Pearson Blue  full LED on {pearson_full_led_blue}\n')

# Kendall, full-LED reading (red, green, blue)
kendall_full_led_red, kendall_full_led_green, kendall_full_led_blue = [
    stats.kendalltau(frame[base], frame[led])
    for frame, base, _limited, led in channels
]

print(f'Kendall Red full LED on {kendall_full_led_red}')
print(f'Kendall Green full LED on {kendall_full_led_green}')
print(f'Kendall Blue  full LED on {kendall_full_led_blue}\n')


Pearson Red color limited PearsonRResult(statistic=0.7536896362253166, pvalue=8.359127317823591e-09)
Pearson Green color limited PearsonRResult(statistic=0.8340125073361087, pvalue=6.9527671212699345e-12)
Pearson Blue color limited PearsonRResult(statistic=0.8865670469538409, pvalue=5.700864539786063e-15)

Spearman Red color limited SignificanceResult(statistic=0.8258845573870119, pvalue=1.6702819985090264e-11)
Spearman Green color limited SignificanceResult(statistic=0.8353829467028415, pvalue=5.969917045398561e-12)
Spearman Blue color limited SignificanceResult(statistic=0.7975186611532058, pvalue=2.5815831009369814e-10)

Kendall Red color limited SignificanceResult(statistic=0.6397373572173887, pvalue=3.1232644286017545e-09)
Kendall Green color limited SignificanceResult(statistic=0.6646964925231432, pvalue=1.031167358408612e-09)
Kendall Blue color limited SignificanceResult(statistic=0.6209734253156467, pvalue=8.948772561392046e-09)

Spearman Red full LED on SignificanceResult(stat

### Questions 2-4: Below is a function used to answer questions 2-4 to avoid redundancy

In [4]:
def _discordant_after_counts(values):
    """Count, for every position, the later entries whose rank is <= that entry's rank.

    ``values`` is a pandas Series. Ranks are taken with ``Series.rank()`` and
    compared strictly by position, so the Series' index labels are irrelevant.
    Tied ranks count as discordant; comparisons involving NaN are False and so
    never count.
    """
    ranks = values.rank().to_numpy()
    return np.array(
        [np.count_nonzero(ranks[pos + 1:] <= ranks[pos]) for pos in range(len(ranks))],
        dtype=int,
    )


def concordant(df, col_lim_column, led_column):
    """Return the 'Color' of the first row with no discordant followers in either column.

    Rows are taken in DataFrame order. For each row, every later row is compared
    by rank, independently within ``col_lim_column`` and within ``led_column``.
    A later row whose rank is less than or equal to the current row's rank is
    discordant. The result is the ``'Color'`` value of the first row whose
    discordant count is zero for both columns. The last row has no followers,
    so a non-empty frame always yields a result.

    Rows are addressed by position, so the function also works on frames whose
    index is not the default 0..n-1 range (e.g. after filtering or sorting).
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col_lim_column``, ``led_column`` and a ``'Color'`` column.
    col_lim_column : str
        Name of the color-limited measurement column.
    led_column : str
        Name of the full-LED measurement column.

    Returns
    -------
    The ``'Color'`` entry of the first qualifying row.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    IndexError
        If ``df`` has no rows.
    """
    discordant_lim = _discordant_after_counts(df[col_lim_column])
    discordant_led = _discordant_after_counts(df[led_column])

    fully_concordant = (discordant_lim == 0) & (discordant_led == 0)
    if fully_concordant.size == 0:
        # Same exception type the old `.values[0]` lookup produced, but with a reason.
        raise IndexError("concordant() needs at least one row in df")

    # argmax on a boolean array returns the position of the first True.
    first_position = int(np.argmax(fully_concordant))
    return df['Color'].iloc[first_position]

### Question 2: With the red experiment, the first color where both versions of the measurement are fully concordant is

In [5]:
# Red experiment: color-limited reading vs. full-LED reading.
r_result = concordant(df=r_df, col_lim_column='R(Red)', led_column='R(LED on)')
print("Result for r_df:", r_result)

Result for r_df: Rookwood Amber


### Question 3: With the Green experiment, the first color where both versions of the measurement are fully concordant is

In [6]:
# Green experiment: color-limited reading vs. full-LED reading.
g_result = concordant(df=g_df, col_lim_column='G(Green)', led_column='G(LED on)')
print("Result for g_df:", g_result)

Result for g_df: Poolhouse


### Question 4: With the Blue experiment, the first color where both versions of the measurement are fully concordant is

In [7]:
# Blue experiment: color-limited reading vs. full-LED reading.
b_result = concordant(df=b_df, col_lim_column='B(Blue)', led_column='B(LED on)')
print("Result for b_df:", b_result)

Result for b_df: Naval


### Question 5: Which color benefits the most by considering a Spearman correlation coefficient as opposed to a Pearson correlation coefficient?

A lower p-value indicates stronger evidence that a correlation exists (it does not by itself measure how strong the correlation is). Comparing the Spearman and Pearson p-values of the color-limited measurements for each color:

Pearson P-values:

Red: 8.359127317823591e-09
Green: 6.9527671212699345e-12
Blue: 5.700864539786063e-15
Spearman P-values:

Red: 1.6702819985090264e-11
Green: 5.969917045398561e-12
Blue: 2.5815831009369814e-10

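It is clear that the "Red" color exhibits the most significant difference between the Spearman and Pearson correlations. The p-value for "Red" under Spearman (1.67e-11) is smaller than under Pearson (8.36e-09), and its coefficient rises from 0.754 (Pearson) to 0.826 (Spearman), indicating a stronger Spearman correlation. Green is essentially unchanged (0.834 vs. 0.835) and Blue gets worse (0.887 vs. 0.798), so Red benefits the most.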

### Question 6: Which correlation coefficient has the least range across the experiment?

In [8]:
# Collect the three per-channel coefficients (red, green, blue) for every
# combination of correlation type and measurement configuration.
correlation_values = {
    "Pearson Full": [res[0] for res in (pearson_full_led_red, pearson_full_led_green, pearson_full_led_blue)],
    "Pearson Color Limited": [res[0] for res in (pearson_co_lim_red, pearson_co_lim_green, pearson_co_lim_blue)],
    "Kendall Full": [res[0] for res in (kendall_full_led_red, kendall_full_led_green, kendall_full_led_blue)],
    "Kendall Color Limited": [res[0] for res in (kendall_co_lim_red, kendall_co_lim_green, kendall_co_lim_blue)],
    "Spearman Full": [res[0] for res in (spearman_full_led_red, spearman_full_led_green, spearman_full_led_blue)],
    "Spearman Color Limited": [res[0] for res in (spearman_co_lim_red, spearman_co_lim_green, spearman_co_lim_blue)],
}

# Range = largest minus smallest coefficient within each group.
correlation_ranges = {
    label: max(coefficients_for_label) - min(coefficients_for_label)
    for label, coefficients_for_label in correlation_values.items()
}

# Pick the group whose range is smallest (first one wins on a tie).
correlation_with_least_range = min(correlation_ranges.items(), key=lambda item: item[1])[0]

print(f"The correlation with the least range across the experiment is: {correlation_with_least_range}")


The correlation with the least range across the experiment is: Spearman Color Limited


### Question 7: The range of the identified correlation (to three decimal places) is from 

In [9]:
# Spearman color-limited coefficients (the group identified in Question 6),
# in red, green, blue order.
spearman_color_limited = [
    res[0] for res in (spearman_co_lim_red, spearman_co_lim_green, spearman_co_lim_blue)
]

# Extremes of that group.
min_spearman_color_limited = min(spearman_color_limited)
max_spearman_color_limited = max(spearman_color_limited)

# Report both bounds to three decimal places.
print(f"Min: {round(min_spearman_color_limited, 3)}", f"Max: {round(max_spearman_color_limited, 3)}")


Min: 0.798 Max: 0.835


### Question 8: Which color's experiment results appear to have the least effect (i.e., the change in correlation coefficients is the least)? For this and future questions, this compares the LED correlation to the color-limited correlation.

In [10]:
# Signed change in the Pearson coefficient per color: full LED minus color limited.
pearson_correlations = {
    color_name: full_led[0] - color_limited[0]
    for color_name, full_led, color_limited in (
        ('Red', pearson_full_led_red, pearson_co_lim_red),
        ('Green', pearson_full_led_green, pearson_co_lim_green),
        ('Blue', pearson_full_led_blue, pearson_co_lim_blue),
    )
}

# Only the size of the change matters, not its direction.
absolute_differences = {
    color_name: abs(change) for color_name, change in pearson_correlations.items()
}

# The color with the smallest change is the one least affected.
least_effect_color = min(absolute_differences.items(), key=lambda item: item[1])[0]

print(f"The color with the least effect is {least_effect_color}")


The color with the least effect is Blue


### Question 9: Order the experimental results by the magnitude of their difference (smallest first). This question again refers to the difference in the correlations between Full LED and Color-Limited

In [11]:
# Per color: the three color-limited coefficients followed by the three
# full-LED coefficients, each in Pearson, Spearman, Kendall order.
correlations = {
    'Red': (
        pearson_co_lim_red[0], spearman_co_lim_red[0], kendall_co_lim_red[0],
        pearson_full_led_red[0], spearman_full_led_red[0], kendall_full_led_red[0],
    ),
    'Green': (
        pearson_co_lim_green[0], spearman_co_lim_green[0], kendall_co_lim_green[0],
        pearson_full_led_green[0], spearman_full_led_green[0], kendall_full_led_green[0],
    ),
    'Blue': (
        pearson_co_lim_blue[0], spearman_co_lim_blue[0], kendall_co_lim_blue[0],
        pearson_full_led_blue[0], spearman_full_led_blue[0], kendall_full_led_blue[0],
    ),
}

# Build (color, label, |color limited - full LED|) tuples, three per color.
correlation_differences = []
for color in correlations:
    (pearson_co_lim, spearman_co_lim, kendall_co_lim,
     pearson_full_led, spearman_full_led, kendall_full_led) = correlations[color]

    correlation_differences.extend([
        (color, 'Pearson Color Limited', abs(pearson_co_lim - pearson_full_led)),
        (color, 'Spearman Color Limited', abs(spearman_co_lim - spearman_full_led)),
        (color, 'Kendall Color Limited', abs(kendall_co_lim - kendall_full_led)),
    ])

# Smallest absolute difference first.
sorted_correlation_differences = sorted(correlation_differences, key=lambda entry: entry[2])

# One line per (correlation type, color) pair.
for result in sorted_correlation_differences:
    print(f"{result[1]} {result[0]}: Absolute Difference = {result[2]}")


Spearman Color Limited Blue: Absolute Difference = 0.00933967367862587
Pearson Color Limited Blue: Absolute Difference = 0.012997750316645407
Kendall Color Limited Blue: Absolute Difference = 0.04920511815258144
Spearman Color Limited Red: Absolute Difference = 0.11879238986672747
Pearson Color Limited Green: Absolute Difference = 0.1273677886380704
Spearman Color Limited Green: Absolute Difference = 0.1341250562375128
Pearson Color Limited Red: Absolute Difference = 0.17007871021847132
Kendall Color Limited Red: Absolute Difference = 0.17505419487907137
Kendall Color Limited Green: Absolute Difference = 0.21640697382754592


### Question 10: Of the 18 calculated correlation coefficients between matched pairs, how many are above 0.75?

In [12]:
# All 18 coefficients: for each correlation type and color, the full-LED value
# followed by the color-limited value.
correlation_coefficients = [
    res[0]
    for res in (
        pearson_full_led_red, pearson_co_lim_red,
        pearson_full_led_green, pearson_co_lim_green,
        pearson_full_led_blue, pearson_co_lim_blue,
        spearman_full_led_red, spearman_co_lim_red,
        spearman_full_led_green, spearman_co_lim_green,
        spearman_full_led_blue, spearman_co_lim_blue,
        kendall_full_led_red, kendall_co_lim_red,
        kendall_full_led_green, kendall_co_lim_green,
        kendall_full_led_blue, kendall_co_lim_blue,
    )
]

# How many of them are strictly greater than 0.75?
count_above_075 = len([value for value in correlation_coefficients if value > 0.75])

print(f"Number of correlation coefficients above 0.75: {count_above_075}")


Number of correlation coefficients above 0.75: 14


### Question 11

In [13]:
# Pairwise correlation matrix (DataFrame.corr with its default method) of the
# three base columns and the three LED columns.
correlation_matrix = rgb_df.loc[:, ['R', 'G', 'B', 'RL', 'GL', 'BL']].corr()
print(correlation_matrix)

           R         G         B        RL        GL        BL
R   1.000000  0.502334  0.099424  0.923768  0.580896  0.362886
G   0.502334  1.000000  0.581961  0.645451  0.961380  0.795038
B   0.099424  0.581961  1.000000  0.228403  0.554801  0.899565
RL  0.923768  0.645451  0.228403  1.000000  0.736055  0.519279
GL  0.580896  0.961380  0.554801  0.736055  1.000000  0.826091
BL  0.362886  0.795038  0.899565  0.519279  0.826091  1.000000


### Question 12

In [14]:
# Line of best fit for Red-from-LED (RL) as a function of Blue (B), built from
# summary statistics rather than a fitting routine.
blue_values = rgb_df['B']
red_led_values = rgb_df['RL']

# Correlation between the two columns.
correlation_blue_red = blue_values.corr(red_led_values)

# Mean and standard deviation of each column.
average_blue, std_dev_blue = blue_values.mean(), blue_values.std()
average_red_led, std_dev_red_led = red_led_values.mean(), red_led_values.std()

# slope = r * (sd_y / sd_x); the line passes through the point of means.
slope = correlation_blue_red * (std_dev_red_led / std_dev_blue)
intercept = average_red_led - slope * average_blue

# Round to three decimals for reporting. From here on these names hold the
# rounded values, not the full-precision ones.
(correlation_blue_red,
 average_blue,
 std_dev_blue,
 average_red_led,
 std_dev_red_led,
 slope,
 intercept) = (
    round(value, 3)
    for value in (
        correlation_blue_red,
        average_blue,
        std_dev_blue,
        average_red_led,
        std_dev_red_led,
        slope,
        intercept,
    )
)

print(f"Average Blue: {average_blue}")
print(f"Standard Deviation Blue: {std_dev_blue}")
print(f"Average Red from LED: {average_red_led}")
print(f"Standard Deviation Red from LED: {std_dev_red_led}")
print(f"Correlation Coefficient: {correlation_blue_red}")
print(f"Slope: {slope}")
print(f"Intercept: {intercept}")

Average Blue: 107.524
Standard Deviation Blue: 58.614
Average Red from LED: 603.714
Standard Deviation Red from LED: 352.472
Correlation Coefficient: 0.228
Slope: 1.373
Intercept: 456.031


### Question 13

In [15]:
# Red LED readings (RL) for every row, and their natural logarithm.
rl_values = rgb_df['RL'].to_numpy()
ln_rl_values = np.log(rl_values)

# Degree-1 fit of ln(RL) against R; polyfit returns the highest-degree
# coefficient first, i.e. (slope, intercept).
coefficients = np.polyfit(rgb_df['R'], ln_rl_values, 1)
slope, intercept = coefficients

# Undo the log on the intercept to get the initial value P of the exponential model.
e_to_the_power_of_intercept = np.exp(intercept)

# Correlation of R with RL before the transform and with ln(RL) after it.
original_correlation = np.corrcoef(rgb_df['R'], rl_values)[0, 1]
transformed_correlation = np.corrcoef(rgb_df['R'], ln_rl_values)[0, 1]

# Gain in correlation obtained from the log transform.
net_increase_correlation = transformed_correlation - original_correlation

# Pair each color name with its ln(RL) value so a single color can be looked up.
colors = rgb_df['Color'].values
ln_rl_df = pd.DataFrame({'Color': colors, 'ln(RL)': ln_rl_values})

# NOTE(review): row position 13 is assumed to be the "Julep" row -- confirm
# against RGB.csv; a lookup by color name would be more robust.
print(f"For the color Julep, the new appropriate reading is {round(ln_rl_df.iloc[13, 1], 3)}")
print(f"The slope for the new linear model is {round(slope, 3)}")
print(f"The intercept for the new linear model is {round(intercept, 3)}")
print(f"Corresponding to an initial value (P) of {round(e_to_the_power_of_intercept, 3)}")
print(f"And a net increase in the correlation coefficient of {round(net_increase_correlation, 3)}")

For the color Julep, the new appropriate reading is 6.026
The slope for the new linear model is 0.008
The intercept for the new linear model is 5.142
Corresponding to an initial value (P) of 171.123
And a net increase in the correlation coefficient of 0.026


### Question 14

The correct answer is False
The Spearman correlation coefficient is a better measure of correlation for this experiment because the data are not linear.

### Question 15

The TCS34725 sensor should be calibrated with a 255,255,255 light.