# EE0005 Lab Quiz

In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [None]:
# Load the quiz dataset from the CSV file and preview its first five rows
abaData = pd.read_csv(filepath_or_buffer = 'quizData_SetA.csv')
abaData.head(n = 5)

# Problem 1

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of abaData
abaData.describe()

In [None]:
# Draw the distributions of all variables: one row per column of abaData,
# showing a box plot, a histogram with density estimate, and a violin plot.
variables = list(abaData.columns)
colors = ["r", "g", "b", "m"]

# Size the grid from the data instead of hard-coding 4 rows (4 variables still
# give the original 18 x 16 figure); squeeze = False keeps `axes` two-dimensional
# even when there is only a single variable.
f, axes = plt.subplots(len(variables), 3, figsize=(18, 4 * len(variables)), squeeze = False)

for count, var in enumerate(variables):
    # Cycle through the palette so more than 4 variables cannot raise IndexError
    color = colors[count % len(colors)]

    # Pass the values as `x` explicitly: seaborn versions differ in how they
    # interpret a positional Series (x vs. data), which changes the orientation.
    sb.boxplot(x = abaData[var], orient = "h", color = color, ax = axes[count,0])

    # NOTE(review): distplot has been deprecated since seaborn 0.11; the documented
    # replacement is histplot(..., kde = True, stat = "density"). Kept here so the
    # notebook still runs on older course environments.
    sb.distplot(abaData[var], color = color, ax = axes[count,1])

    sb.violinplot(x = abaData[var], color = color, ax = axes[count,2])

Comment: Height and Weight have the most "normal"-looking distributions, while Diameter and Length have the most outliers.

In [None]:
# Correlation Matrix: compute it once and reuse it for the printout and the heatmap
corrMatrix = abaData.corr()
print(corrMatrix)

# Heatmap of the Correlation Matrix, drawn on the Axes created just above.
# vmin/vmax pin the colour scale to the full [-1, 1] correlation range.
f, axes = plt.subplots(1, 1, figsize=(10, 10))
sb.heatmap(corrMatrix, vmin = -1, vmax = 1,  linewidths = 1,
           annot = True, fmt = ".2f", annot_kws = {"size": 14}, cmap = "RdBu", ax = axes)

Comment: Diameter has the highest absolute correlation with Length, so it is helpful for predicting Length.

In [None]:
# Pull each variable out of abaData as its own single-column DataFrame
height = abaData['Height'].to_frame()
weight = abaData['Weight'].to_frame()
diameter = abaData['Diameter'].to_frame()
length = abaData['Length'].to_frame()

In [None]:
# Joint distribution of Height and Length. The columns are referenced by name
# through `data` so seaborn receives one-dimensional vectors (rather than
# single-column DataFrames) and labels both axes automatically.
sb.jointplot(data = abaData, x = 'Height', y = 'Length', height = 8)

In [None]:
# Joint distribution of Weight and Length. The columns are referenced by name
# through `data` so seaborn receives one-dimensional vectors (rather than
# single-column DataFrames) and labels both axes automatically.
sb.jointplot(data = abaData, x = 'Weight', y = 'Length', height = 8)

In [None]:
# Joint distribution of Diameter and Length. The columns are referenced by name
# through `data` so seaborn receives one-dimensional vectors (rather than
# single-column DataFrames) and labels both axes automatically.
sb.jointplot(data = abaData, x = 'Diameter', y = 'Length', height = 8)

In [None]:
# Grid of pairwise plots: every variable in abaData against every other one
sb.pairplot(abaData)

Comment: Diameter has the highest absolute correlation with Length, so it is helpful for predicting Length.

# Problem 2

In [None]:
# MSE
def mean_sq_err(actual, predicted):
    """Return the mean squared error between `actual` and `predicted`.

    Parameters
    ----------
    actual : array-like
        Observed values; anything ``np.array`` accepts (list, ndarray,
        Series, DataFrame, ...).
    predicted : array-like
        Predicted values, normally with the same number of elements as
        `actual`.

    Returns
    -------
    float
        Mean of the squared element-wise differences.
    """
    actual = np.array(actual)
    predicted = np.array(predicted)

    # Same number of elements but a different layout, e.g. (n, 1) vs (n,):
    # a plain subtraction would broadcast to an (n, n) matrix and silently
    # return the wrong error, so line the predictions up with the actuals.
    # Inputs whose sizes differ (e.g. a scalar prediction) broadcast as before.
    if actual.shape != predicted.shape and actual.size == predicted.size:
        predicted = predicted.reshape(actual.shape)

    return np.mean(np.square(actual - predicted))


In [None]:
# Import LinearRegression model from Scikit-Learn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Length against height
# Split the Dataset into Train (70%) and Test (30%).
# A fixed random_state makes the split, and every number derived from it,
# reproducible when the notebook is re-run on a fresh kernel.
height_train, height_test, length_train, length_test = train_test_split(
    height, length, test_size = 0.3, random_state = 42)

# Create a Linear Regression object
linreg = LinearRegression()

# Train the Linear Regression model
linreg.fit(height_train, length_train)

In [None]:
# Report the fitted parameters of the line  length = a * height + b
print('Intercept \t: b = ', linreg.intercept_)
print('Coefficients \t: a = ', linreg.coef_)

# Formula for the Regression line
regline_x = height_train
regline_y = linreg.intercept_ + linreg.coef_ * height_train

# Plot the Linear Regression line over the training points. Everything is drawn
# on the Axes returned by plt.subplots and labelled so the figure stands alone.
f, axes = plt.subplots(1, 1, figsize=(16, 8))
axes.scatter(height_train, length_train)
axes.plot(regline_x, regline_y, 'r-', linewidth = 3)
axes.set_xlabel('Height')
axes.set_ylabel('Length')
axes.set_title('Linear regression of Length on Height (train set)')
plt.show()

In [None]:
# Predict length values corresponding to height
length_train_pred = linreg.predict(height_train)
length_test_pred = linreg.predict(height_test)

# Plot the Predictions (red) against the actual test values (green); the legend
# and axis labels make the two point clouds distinguishable without the code.
f, axes = plt.subplots(1, 1, figsize=(16, 8))
axes.scatter(height_test, length_test, color = "green", label = "Actual")
axes.scatter(height_test, length_test_pred, color = "red", label = "Predicted")
axes.set_xlabel('Height')
axes.set_ylabel('Length')
axes.set_title('Actual vs. predicted Length (test set)')
axes.legend()
plt.show()

# Goodness of fit: MSE from the helper, R^2 from the model's own score()
print("Mean Squared Error (MSE) on Train Set \t:", mean_sq_err(length_train, length_train_pred))
print("Mean Squared Error (MSE) on Test Set \t:", mean_sq_err(length_test, length_test_pred))
print("Explained Variance (R^2) on Train Set \t:", linreg.score(height_train, length_train))
print("Explained Variance (R^2) on Test Set \t:", linreg.score(height_test, length_test))

In [None]:
# Length against weight
# Split the Dataset into Train (70%) and Test (30%).
# A fixed random_state makes the split, and every number derived from it,
# reproducible when the notebook is re-run on a fresh kernel.
weight_train, weight_test, length_train, length_test = train_test_split(
    weight, length, test_size = 0.3, random_state = 42)

# Create a Linear Regression object
linreg = LinearRegression()

# Train the Linear Regression model
linreg.fit(weight_train, length_train)

# Report the fitted parameters of the line  length = a * weight + b
print('Intercept \t: b = ', linreg.intercept_)
print('Coefficients \t: a = ', linreg.coef_)

# Formula for the Regression line
regline_x = weight_train
regline_y = linreg.intercept_ + linreg.coef_ * weight_train

# Plot the Linear Regression line over the training points. Everything is drawn
# on the Axes returned by plt.subplots and labelled so the figure stands alone.
f, axes = plt.subplots(1, 1, figsize=(16, 8))
axes.scatter(weight_train, length_train)
axes.plot(regline_x, regline_y, 'r-', linewidth = 3)
axes.set_xlabel('Weight')
axes.set_ylabel('Length')
axes.set_title('Linear regression of Length on Weight (train set)')
plt.show()

In [None]:
# Predict length values corresponding to weight
length_train_pred = linreg.predict(weight_train)
length_test_pred = linreg.predict(weight_test)

# Plot the Predictions (red) against the actual test values (green); the legend
# and axis labels make the two point clouds distinguishable without the code.
f, axes = plt.subplots(1, 1, figsize=(16, 8))
axes.scatter(weight_test, length_test, color = "green", label = "Actual")
axes.scatter(weight_test, length_test_pred, color = "red", label = "Predicted")
axes.set_xlabel('Weight')
axes.set_ylabel('Length')
axes.set_title('Actual vs. predicted Length (test set)')
axes.legend()
plt.show()

# Goodness of fit: MSE from the helper, R^2 from the model's own score()
print("Mean Squared Error (MSE) on Train Set \t:", mean_sq_err(length_train, length_train_pred))
print("Mean Squared Error (MSE) on Test Set \t:", mean_sq_err(length_test, length_test_pred))
print("Explained Variance (R^2) on Train Set \t:", linreg.score(weight_train, length_train))
print("Explained Variance (R^2) on Test Set \t:", linreg.score(weight_test, length_test))

In [None]:
# Length against diameter
# Split the Dataset into Train (70%) and Test (30%).
# A fixed random_state makes the split, and every number derived from it,
# reproducible when the notebook is re-run on a fresh kernel.
diameter_train, diameter_test, length_train, length_test = train_test_split(
    diameter, length, test_size = 0.3, random_state = 42)

# Create a Linear Regression object
linreg = LinearRegression()

# Train the Linear Regression model
linreg.fit(diameter_train, length_train)

# Report the fitted parameters of the line  length = a * diameter + b
print('Intercept \t: b = ', linreg.intercept_)
print('Coefficients \t: a = ', linreg.coef_)

# Formula for the Regression line
regline_x = diameter_train
regline_y = linreg.intercept_ + linreg.coef_ * diameter_train

# Plot the Linear Regression line over the training points. Everything is drawn
# on the Axes returned by plt.subplots and labelled so the figure stands alone.
f, axes = plt.subplots(1, 1, figsize=(16, 8))
axes.scatter(diameter_train, length_train)
axes.plot(regline_x, regline_y, 'r-', linewidth = 3)
axes.set_xlabel('Diameter')
axes.set_ylabel('Length')
axes.set_title('Linear regression of Length on Diameter (train set)')
plt.show()

In [None]:
# Predict length values corresponding to diameter
length_train_pred = linreg.predict(diameter_train)
length_test_pred = linreg.predict(diameter_test)

# Plot the Predictions (red) against the actual test values (green); the legend
# and axis labels make the two point clouds distinguishable without the code.
f, axes = plt.subplots(1, 1, figsize=(16, 8))
axes.scatter(diameter_test, length_test, color = "green", label = "Actual")
axes.scatter(diameter_test, length_test_pred, color = "red", label = "Predicted")
axes.set_xlabel('Diameter')
axes.set_ylabel('Length')
axes.set_title('Actual vs. predicted Length (test set)')
axes.legend()
plt.show()

# Goodness of fit: MSE from the helper, R^2 from the model's own score()
print("Mean Squared Error (MSE) on Train Set \t:", mean_sq_err(length_train, length_train_pred))
print("Mean Squared Error (MSE) on Test Set \t:", mean_sq_err(length_test, length_test_pred))
print("Explained Variance (R^2) on Train Set \t:", linreg.score(diameter_train, length_train))
print("Explained Variance (R^2) on Test Set \t:", linreg.score(diameter_test, length_test))

Comment: The Diameter vs. Length linear model is the best at predicting Length. A higher explained variance (R^2 closer to 1) and a lower MSE on both the train and test sets mean better predictions, and the Diameter vs. Length model has the highest explained variance and the lowest MSE values.

# Problem 3

In [None]:
# Predictor matrix for the multivariate model: the three variables other than Length
lengthPredictor = abaData.loc[:, ['Height', 'Weight', 'Diameter']]

In [None]:
# Split the Dataset into Train (70%) and Test (30%).
# A fixed random_state makes the split, and every number derived from it,
# reproducible when the notebook is re-run on a fresh kernel.
lengthPredictor_train, lengthPredictor_test, length_train, length_test = train_test_split(
    lengthPredictor, length, test_size = 0.3, random_state = 42)

# Create a Linear Regression object
linreg = LinearRegression()

# Train the Linear Regression model on all three predictors at once
linreg.fit(lengthPredictor_train, length_train)

In [None]:
# Run the fitted multivariate model on both partitions of the data
length_train_pred = linreg.predict(X = lengthPredictor_train)
length_test_pred = linreg.predict(X = lengthPredictor_test)

# Goodness of fit: MSE via the notebook's helper, R^2 via the model's score()
print("Mean Squared Error (MSE) on Train Set \t:",
      mean_sq_err(actual = length_train, predicted = length_train_pred))
print("Mean Squared Error (MSE) on Test Set \t:",
      mean_sq_err(actual = length_test, predicted = length_test_pred))
print("Explained Variance (R^2) on Train Set \t:",
      linreg.score(X = lengthPredictor_train, y = length_train))
print("Explained Variance (R^2) on Test Set \t:",
      linreg.score(X = lengthPredictor_test, y = length_test))