<a href="https://colab.research.google.com/github/profmcnich/example_notebook/blob/main/a3_sample_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
# Imports section
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

## Part 1. Loading the dataset

In [44]:
# Using pandas load the dataset (load remotely, not locally)
slime = pd.read_csv("https://raw.githubusercontent.com/profmcnich/example_notebook/main/science_data_large.csv")

# Output the first 15 rows of the data
print(slime.head(15), end="\n\n\n\n")

# Display a summary of the table information (number of datapoints, etc.)
print(slime.info())

    Temperature °C  Mols KCL     Size nm^3
0              469       647  6.244743e+05
1              403       694  5.779610e+05
2              302       975  6.196847e+05
3              779       916  1.460449e+06
4              901        18  4.325726e+04
5              545       637  7.124634e+05
6              660       519  7.006960e+05
7              143       869  2.718260e+05
8               89       461  8.919803e+04
9              294       776  4.770210e+05
10             991       117  2.441771e+05
11             307       781  5.006455e+05
12             206        70  3.145200e+04
13             437       599  5.390215e+05
14             566        75  9.185271e+04



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Temperature °C  1000 non-null   int64  
 1   Mols KCL        1000 non-null   int64  
 2   Size nm^3       100

## Part 2. Splitting the dataset

In [45]:
# Take the pandas dataset and split it into our features (X) and label (y)
features_x = slime.iloc[0:, 0:2]
label_y = slime.iloc[0:, 2]


# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)
x_train, x_test, y_train, y_test = train_test_split(features_x, label_y, train_size = 0.9, test_size = 0.1)

## Part 3. Perform a Linear Regression

In [46]:
# Use sklearn to train a model on the training set
reg = LinearRegression()
model = reg.fit(x_train, y_train)

# Create a sample datapoint and predict the output of that sample with the trained model
predictions = reg.predict(x_test)
print("Predictions: ", predictions)

Predictions:  [ 5.34930333e+05  7.51519290e+05  9.09896431e+05 -7.52177430e+04
  1.20991155e+05  1.07579946e+05  1.37872683e+05  1.10612883e+06
 -3.25850143e+05  5.07193250e+05  4.58242351e+05  8.72064674e+05
  5.87855573e+05  8.70565404e+05 -4.08534222e+05  3.37124454e+05
  1.53491969e+04  4.02081158e+05  3.14131990e+05  3.46744326e+05
  7.83588661e+05  4.50526373e+05  1.56042617e+05  9.88814657e+04
  2.94637729e+05  2.26672057e+05 -2.81874551e+05  4.01689634e+05
  4.39223220e+05  1.15973202e+05  1.15891573e+06  4.87438754e+05
  8.01301930e+05  4.97067013e+05 -4.00615768e+05  8.64860098e+05
  8.87421415e+05  2.01728695e+05  4.25506787e+05  4.73950313e+05
 -1.22410144e+05  9.88025268e+05 -1.60478962e+05  5.28332856e+05
  1.18885757e+06  7.23172086e+05  7.98150262e+05  5.01568198e+05
  3.33053410e+05  1.92104793e+05  1.00336286e+06 -2.73956097e+05
  3.76157310e+05  4.22917555e+05  8.48947974e+05  5.97856893e+05
  1.05072985e+05  7.25506119e+05  1.14654176e+06 -5.29777722e+04
  6.5301442

In [47]:
# Report on the score for that model, in your own words (markdown, not code) explain what the score means
print("R^2 value: ",reg.score(features_x, label_y))

R^2 value:  0.8606652835638592


#### What does the score mean? 
- R-squared is a goodness-of-fit measure for linear regression models. Basically, it means that this model has certain percentage (for this model, it is 86%) accuracy predicting the size of slime based on temperature and mol variables.

In [48]:
# Extract the coefficents and intercept from the model and write an equation for your h(x) using LaTeX
print("coefficents: ", reg.coef_)
print("intercept: ", reg.intercept_)

coefficents:  [ 883.78301691 1025.1479957 ]
intercept:  -416311.3108023002
coefficents:  883.7830169093772
coefficents:  1025.1479956990834


Sample equation: $E = mc^2$

## Part 4. Use Cross Validation

In [49]:
# Use the cross_val_score function to repeat your experiment across many shuffles of the data
scores = cross_val_score(LinearRegression(), features_x, label_y)

# Report on their finding and their significance
scores

array([0.83918826, 0.87051239, 0.85871066, 0.87202623, 0.84364641])

## Part 5. Using Polynomial Regression

In [50]:
# Using the PolynomialFeatures library perform another regression on an augmented dataset of degree 2

# logistic_regression = LogisticRegression()
# logistic_regression.fit(x_train, y_train)
# logistic_regression.score(x_test, y_test)

# Report on the metrics and output the resultant equation as you did in Part 3.

lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = lab_enc.fit_transform(y_train)
print(training_scores_encoded)
print(utils.multiclass.type_of_target(y_train))
print(utils.multiclass.type_of_target(y_train.astype('int')))
print(utils.multiclass.type_of_target(training_scores_encoded))

logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, training_scores_encoded)
logistic_regression.score(x_test, y_test)

[139 726 467 782 235 780 303 461 272  51 615 419 395 729 246 621 535 829
 253 255 827 855 734 610 113 281 815 491 326 268 153 659 418 549  49 301
 635 205 317 846 302 889 464  74 868 543  68 771 254 332 774 587 708 217
 533 186 360 348 732 309  64 745 243 859 805 754 432   7 484 536  18 789
  65 459 107  66   3 839 229  67 356 714 232 689 522 155  86 251 489 444
 684 741  52 170 405  56 537 256 645 711 258 690 541 446 646 355 649 834
 407   4 600 843 480 144 803 560 331 860 273 441 682 125 437 231 497 620
 826 147 445 219 168 266 540 313 490 694 263 790  38 160 809 492 733 278
 642   9 455 233 676 297 722 575  93 765 248  76 882  75 470 758 193 452
 823 704 828 213 763 323 717 201 698 388 850 894 523 657 752 551 479 772
 381 270 568 794  71 239 230 700 545 408 604 861 555  46 358 468 596 788
 330 881 857 148 299 259 528 202   2 154 117  98 664 718 220 324 366 319
 801 149 798 118 496 538 542 163 180 411 312 181 176 389 482 890 475 612
  12 166 792 314 507 637 667 385 577  96  41 179 83

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: Classification metrics can't handle a mix of continuous and multiclass targets

In [None]:
import numpy as np
from sklearn                        import metrics, svm
from sklearn.linear_model           import LogisticRegression
from sklearn import preprocessing
from sklearn import utils

training_data_X    = np.array([ [1.2, 6.7, 2.7],  [2.3, 4.6, 2.2],  [0.3, 3.9, 0.8],  [2.1, 1.3, 4.3]  ])
training_scores_Y  = np.array( [1.4, 9.2, 2.5, 2.2] )
prediction_data_test  = np.array([ [1.5, 3.4, 2.2],  [7.6, 7.2, 0.2] ])

clf = LogisticRegression()
clf.fit(training_data_X, training_scores_Y)



In [None]:
lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = lab_enc.fit_transform(training_scores_Y)
print(training_scores_encoded)
print(utils.multiclass.type_of_target(training_scores_Y))
print(utils.multiclass.type_of_target(training_scores_Y.astype('int')))
print(utils.multiclass.type_of_target(training_scores_encoded))

clf = LogisticRegression()
clf.fit(training_data_X, training_scores_encoded)
print("LogisticRegression")
print(clf.predict(prediction_data_test))