<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Case-study-description" data-toc-modified-id="Case-study-description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Case study description</a></span></li><li><span><a href="#Try-some-Python-commands-yourself" data-toc-modified-id="Try-some-Python-commands-yourself-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Try some Python commands yourself</a></span></li></ul></div>

Details about the notebook

* Load data
* Explore data: table, plots
* Load another dataset and merge
* Choose a column and predict another column, based on a least-squares model
* Quantify how good the predictions are
* Try different prediction models
* User interface to test our predictions
* Export the results
* Try Python commands yourself
* Next steps

## Case study description

*TODO: the case study description goes here.*

In [None]:
import plotly.io as pio
# Select the default Plotly renderer used to show figures in this notebook.
# NOTE(review): the trailing comment suggests "jupyterlab" is the alternative value
# to use when running under JupyterLab instead of the classic notebook — confirm.
pio.renderers.default = "notebook" # jupyterlab

In [None]:
import plotly.graph_objects as go

In [None]:
import numpy as np
import pandas as pd
import plotly
# Route pandas `.plot` / `.hist` calls to Plotly, so they return Plotly figures
# (later cells call `fig.update_layout(...)` on the result).
pd.options.plotting.backend = "plotly"

In [None]:
# Load the spectra; index_col=0 uses the first CSV column as the row index.
spectra = pd.read_csv('calX.csv', index_col=0)
# (number of rows, number of columns)
spectra.shape

In [None]:
# Fix up the column names: spectral wavelengths in nm, starting at 600 in steps of 2.
# np.arange excludes the stop value, so the labels run 600, 602, ..., 1898 (650 labels);
# the assignment fails if `spectra` does not have exactly that many columns.
spectra.columns = np.arange(600, 1900, 2)   # (start, stop [exclusive], stepsize)

In [None]:
spectra.head()

In [None]:
#df = spectra.T
#df.shape

In [None]:
# Show the top of the data set
#df.head()

In [None]:
# Show the end of the data set
#df.tail()

In [None]:
# Show 10 randomly selected rows
#df.sample(10)

In [None]:
# Pick one row (one spectrum) at random and plot it.
# sample(1) returns a one-row DataFrame; .iloc[0] turns that row into a Series.
# No random seed is set, so a different spectrum is shown on every run.
spectra.sample(1).iloc[0].plot(title="Plot of a randomly selected spectrum")


In [None]:
# Same random-spectrum plot as before, now with descriptive axis titles.
fig = (
    spectra
    .sample(1)
    .iloc[0]
    .plot(title="Plot of a randomly selected spectrum")
)
# update_layout returns the figure, so the cell still displays it.
fig.update_layout(
    xaxis_title_text="Wavelength [nm]",
    yaxis_title_text="Absorbance",
)

In [None]:
# Load the second data set; index_col=0 uses the first CSV column as the row index.
outputs = pd.read_csv('calY.csv', index_col=0)
# (number of rows, number of columns)
outputs.shape

In [None]:
outputs.head()

In [None]:
# Give the three output columns readable names, then display the renamed table.
# NOTE(review): 'Something' looks like a placeholder name for the third output —
# confirm what this column actually measures.
outputs.columns = ['Hardness', 'Activity', 'Something']
outputs

In [None]:
# Explore the 'Something' output three ways: as a line plot in row order,
# as a 30-bin histogram, and as a box plot. A single display() call renders
# the figures one after another, in the order given.
display(
    outputs['Something'].plot.line(),
    outputs['Something'].plot.hist(nbins=30),
    outputs['Something'].plot.box(),
)

In [None]:
# Summary statistics for each output column, shown in this order:
# mean, median, standard deviation, minimum.
display(
    outputs.mean(),
    outputs.median(),
    outputs.std(),
    outputs.min(),
)


In [None]:
spectra.index


In [None]:
outputs.index

In [None]:
# Combine spectra and outputs into one table, matching rows on their index.
# Inner join: only row labels present in both tables are kept.
# Spectral columns come first, followed by the output columns.
joined = pd.merge(
    spectra,
    outputs,
    how="inner",
    left_index=True,
    right_index=True,
)


In [None]:
joined.columns


In [None]:
# Plot the absorbance of every row (tablet) at one chosen wavelength.
wavelength = 1800
fig = joined[wavelength].plot(
    title=f"Plot of absorbances for all tablets at wavelength {wavelength} nm"
)
# update_layout returns the figure, so the cell displays it.
fig.update_layout(
    xaxis_title_text="Tablet number",
    yaxis_title_text=f"Absorbance at {wavelength} nm",
)

In [None]:
# Scatter plot of the 'Something' output against the absorbance at one wavelength,
# to judge visually how strongly the two are related.
wavelength = 1666
two_columns = joined[[wavelength, 'Something']]
fig = two_columns.plot.scatter(x=wavelength, y='Something')
fig.update_layout(
    xaxis_title_text=f"Absorbance at {wavelength} nm",
    yaxis_title_text="Something",
)

In [None]:
# TODO: slider to find the strongest correlation

In [None]:
# Correlation of every column in `joined` with the 'Something' column.
# corr() builds the full column-by-column correlation matrix; selecting ['Something']
# keeps one column of it. The result is indexed by column label and also contains the
# output columns themselves, including 'Something' against itself.
all_correlations = joined.corr()['Something']

In [None]:
all_correlations.plot()

In [None]:
# Largest correlation overall. This includes 'Something' correlated with itself,
# so it is not informative for choosing a wavelength.
display(  all_correlations.max()          )

# Restrict the search to the spectral columns. Dropping only the last entry would
# leave the other output columns ('Hardness', 'Activity') among the candidates,
# so one of them could be reported as the "best wavelength".
wavelength_correlations = all_correlations.drop(labels=outputs.columns)
display(  wavelength_correlations.max()    )

# Zero-based position of the maximum. The spectral columns come first in `joined`,
# so this position can be used directly with `joined.columns[...]`.
display(  wavelength_correlations.argmax() )

In [None]:
# The largest correlation was found at (zero-based) position 533;
# look up which column label (wavelength in nm) sits at that position.
joined.columns[533]

In [None]:
# Wavelength [nm] used as the single predictor in the model below.
# The value is hard-coded; it corresponds to column position 533 (600 + 2*533 = 1666),
# so update it by hand if the correlation search gives a different position.
best_wavelength = 1666
# Absorbances of all rows at that wavelength.
joined.loc[:, best_wavelength]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

mymodel = LinearRegression()

In [None]:
# Selecting with a list ([best_wavelength]) yields a one-column DataFrame (2-D)
# rather than a Series (1-D); scikit-learn expects a 2-D feature matrix.
X = joined.loc[:, [best_wavelength]]
# Least-squares fit of 'Something' against the absorbance at the chosen wavelength.
mymodel.fit(X, y=joined["Something"])

In [None]:
# The fitted coefficients of the straight-line model
print(f'Intercept = {mymodel.intercept_} and slope = {mymodel.coef_}')

# Prediction errors (residuals): actual value minus the model's prediction
actual_y_values = joined["Something"]
predicted_y_values = mymodel.predict(X)
prediction_error = actual_y_values - predicted_y_values

# Histogram of the prediction errors. An assignment on the last line of a cell shows
# nothing, so finish with update_layout(): it labels the axes and returns the figure,
# which the notebook then displays.
fig = prediction_error.hist(nbins=40)
fig.update_layout(
    xaxis_title_text="Prediction error (actual - predicted)",
    yaxis_title_text="Number of samples",
)

In [None]:
# Root mean squared error (RMSE): square root of the mean squared error, expressed in
# the same units as the predicted column. Taking the square root explicitly avoids the
# `squared=False` keyword, which newer scikit-learn releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(actual_y_values, predicted_y_values))
print(f'Root mean squared error: {rmse}')

# The coefficient of determination: (R^2)
print(f'Coefficient of determination = R^2 = {r2_score(actual_y_values, predicted_y_values):.3f}')

## Try some Python commands yourself

In [None]:
print('Hi, my name is ____.')

In [None]:
# Creating variables:
# NOTE(review): `...` (Ellipsis) is a placeholder; presumably the exercise is to replace
# it with an expression that converts temperature_in_F to degrees Celsius — confirm.

temperature_in_F = 212.0
temperature_in_C = ...
