<a href="https://colab.research.google.com/github/profmcnich/example_notebook/blob/main/a3_sample_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

\(\^Be sure to update this button to point to your notebook instead of the sample notebook\)

In [51]:
# Imports section
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import sklearn
from sklearn import linear_model

## Part 1. Loading the dataset

In [52]:
# Using pandas load the dataset (load remotely, not locally)
df = pd.read_csv("https://raw.githubusercontent.com/profmcnich/example_notebook/main/science_data_large.csv")

In [53]:
# Output the first 15 rows of the data
df.head(15)

Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
0,469,647,624474.3
1,403,694,577961.0
2,302,975,619684.7
3,779,916,1460449.0
4,901,18,43257.26
5,545,637,712463.4
6,660,519,700696.0
7,143,869,271826.0
8,89,461,89198.03
9,294,776,477021.0


In [54]:
# Display a summary of the table information (number of datapoints, etc.)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Temperature °C  1000 non-null   int64  
 1   Mols KCL        1000 non-null   int64  
 2   Size nm^3       1000 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 23.6 KB


In [55]:
df.describe()

Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
count,1000.0,1000.0,1000.0
mean,500.5,471.53,508611.1
std,288.819436,288.482872,447483.8
min,1.0,1.0,16.11429
25%,250.75,226.75,129826.7
50%,500.5,459.5,382718.2
75%,750.25,710.25,760321.1
max,1000.0,1000.0,1972127.0


In [56]:
df.size

3000

In [57]:
df.dtypes

Temperature °C      int64
Mols KCL            int64
Size nm^3         float64
dtype: object

## Part 2. Splitting the dataset

In [58]:
# Take the pandas dataset and split it into our features (X) and label (y)
features = df [["Temperature °C","Mols KCL"]]
label = df["Size nm^3"]
X, y = features, label
# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train, X_test, y_train, y_test

(     Temperature °C  Mols KCL
 18              905        76
 336             555       836
 971             286       535
 588             956       235
 434              83       431
 ..              ...       ...
 31              950       993
 582             862       491
 21              709       332
 702             850       233
 146             610       678
 
 [900 rows x 2 columns],
      Temperature °C  Mols KCL
 660             966       871
 111             202       787
 511             524       255
 27              230       665
 774             382       250
 ..              ...       ...
 833               2         3
 817             380       957
 583             671       343
 928             855       547
 408             936       986
 
 [100 rows x 2 columns],
 18     1.485850e+05
 336    9.545885e+05
 971    3.176299e+05
 588    4.623699e+05
 434    7.784946e+04
            ...     
 31     1.926273e+06
 582    8.637160e+05
 21     4.824333e+05
 702    4.078

## Part 3. Perform a Linear Regression

In [59]:
# Use sklearn to train a model on the training set
sample = np.array([[430, 230], [100, 234], [560, 40], [43, 3]])

# Create a sample datapoint and predict the output of that sample with the trained model
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)

prediction = lin_reg.predict(np.array([[0,1]]))

print(f"Prediction on the entire test data",prediction)

# Report on the score for that model, in your own words (markdown, not code) explain what the score means

# Extract the coefficents and intercept from the model and write an equation for your h(x) using LaTeX
# y = mx_1 + mx_2 + b

Prediction on the entire test data [-417687.04999136]


Sample equation: $E = mc^2$

## Part 4. Use Cross Validation

In [60]:
# Use the cross_val_score function to repeat your experiment across many shuffles of the data
my_cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
scores = cross_val_score(model, features, label, cv = my_cv)

print(scores)
print(scores.mean(), scores.std)
# Report on their finding and their significanc

[0.87616468 0.86951566 0.83708494 0.86963943 0.84945355 0.86236913
 0.82467112 0.85236386 0.8648058  0.76555589]
0.8471624047034034 <built-in method std of numpy.ndarray object at 0x7fb003aed1c0>


## Part 5. Using Polynomial Regression

In [61]:
from sklearn.preprocessing import PolynomialFeatures
# Using the PolynomialFeatures library perform another regression on an augmented dataset of degree 2
poly = PolynomialFeatures(2)
X_train = poly.fit_transform(X_train)
X_test = poly.fit_transform(X_test)
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
print(f"Score:{model.score(X_train,y_train)}")
print(f"coefficent:{model.coef_}")
print(f"intercept:{model.intercept_}")
# Report on the metrics and output the resultant equation as you did in Part 3.
print(f"Predict:{model.predict(X_test)}")

Score:1.0
coefficent:[ 0.00000000e+00  1.20000000e+01 -1.16574196e-07 -3.58154474e-11
  2.00000000e+00  2.85714287e-02]
intercept:1.2060743756592274e-05
Predict:[1.71603946e+06 3.38068257e+05 2.75385857e+05 3.21295000e+05
 1.97369714e+05 6.19684714e+05 9.38430829e+05 3.51865257e+05
 7.23400314e+05 1.09042026e+06 1.33772103e+06 5.78913857e+05
 3.37428429e+05 4.13732829e+05 3.46296114e+05 1.65311429e+03
 4.95720000e+05 1.05541457e+05 6.73711143e+04 2.59291143e+04
 5.37425257e+05 1.10952031e+06 9.77468829e+05 1.29173829e+05
 1.62132686e+06 1.48395457e+05 3.31453714e+05 1.09212031e+06
 6.67142973e+01 3.28200257e+05 8.14090286e+04 6.00674857e+05
 1.50487143e+04 4.56054429e+05 2.44549600e+05 1.61217257e+05
 7.19365600e+05 2.51269829e+05 3.92907457e+05 1.07180386e+06
 6.61005000e+05 4.01612429e+05 5.73634286e+04 7.91715314e+05
 5.76091143e+04 4.52268000e+05 1.77567400e+05 1.24615340e+06
 1.81699114e+05 1.78553203e+06 2.17334314e+05 4.81235314e+05
 3.91831429e+03 3.09613114e+05 6.16457029e+05 