# Notebook ICD - 10

## From scratch to scikit-learn

Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.axes as ax

Load the dataset and separate the input and target variables

In [None]:
# Read the raw (x, y) pairs used for the from-scratch regression
data = pd.read_csv('data_for_lr.csv')

# Drop the rows with missing values
data = data.dropna()

# Training dataset and labels: the first 500 remaining rows.
# reshape(-1, 1) lets NumPy infer the row count, so the column-vector
# conversion no longer depends on exactly how many rows dropna() removed.
train_input = np.array(data.x[0:500]).reshape(-1, 1)
train_output = np.array(data.y[0:500]).reshape(-1, 1)

# Test dataset and labels: whatever remains in positions 500..699
test_input = np.array(data.x[500:700]).reshape(-1, 1)
test_output = np.array(data.y[500:700]).reshape(-1, 1)

Build the Linear Regression Model

In [None]:
class LinearRegression:
    """Single-feature linear model ``y = m * x + c`` fitted by gradient descent.

    The slope ``m`` and intercept ``c`` live in the ``parameters`` dict.
    ``train`` also stores the cost recorded at every iteration in ``loss``.
    """

    def __init__(self):
        # Receives the keys 'm' and 'c' once train() has been called.
        self.parameters = {}

    def forward_propagation(self, train_input):
        """Return ``m * train_input + c`` using the current parameters."""
        slope, intercept = self.parameters['m'], self.parameters['c']
        return np.multiply(slope, train_input) + intercept

    def cost_function(self, predictions, train_output):
        """Return the mean squared error between targets and predictions."""
        residuals = train_output - predictions
        return np.mean(residuals ** 2)

    def backward_propagation(self, train_input, train_output, predictions):
        """Return the gradient of the cost as ``{'dm': ..., 'dc': ...}``.

        The constant factor 2 of the squared-error derivative is left out;
        it only rescales the step and is absorbed by the learning rate.
        """
        # Prediction error with the sign flipped: predictions minus targets.
        error = -1 * (train_output - predictions)
        return {
            'dm': np.mean(np.multiply(train_input, error)),
            'dc': np.mean(error),
        }

    def update_parameters(self, derivatives, learning_rate):
        """Take one gradient-descent step on both parameters."""
        for name in ('m', 'c'):
            step = learning_rate * derivatives['d' + name]
            self.parameters[name] = self.parameters[name] - step

    def train(self, train_input, train_output, learning_rate, iters):
        """Run ``iters`` gradient-descent iterations and print the cost of each.

        Returns the tuple ``(self.parameters, self.loss)``, where ``loss`` is
        the list of costs measured before each parameter update.
        """
        # Random starting point: each parameter is drawn from [0, 1) and negated.
        self.parameters['m'] = -1 * np.random.uniform(0, 1)
        self.parameters['c'] = -1 * np.random.uniform(0, 1)

        # Cost history, one entry per iteration.
        self.loss = []

        for step_number in range(1, iters + 1):
            # Forward pass and cost for the current parameters.
            predictions = self.forward_propagation(train_input)
            cost = self.cost_function(predictions, train_output)

            # Record and report the cost.
            self.loss.append(cost)
            print("Iteration = {}, Loss = {}".format(step_number, cost))

            # Gradient computation followed by the parameter update.
            gradients = self.backward_propagation(train_input, train_output, predictions)
            self.update_parameters(gradients, learning_rate)

        return self.parameters, self.loss

Train the model

In [None]:
# Example usage: fit the from-scratch model with 20 gradient-descent iterations
linear_reg = LinearRegression()
parameters, loss = linear_reg.train(
    train_input,
    train_output,
    learning_rate=0.0001,
    iters=20,
)

Final Prediction and Plot the regression line

In [None]:
# Prediction on test data: show the learned slope and intercept (one per
# line), then apply the fitted line to the held-out inputs
print(parameters['m'], parameters['c'], sep='\n')
y_pred = parameters['m'] * test_input + parameters['c']

In [None]:
# Plot the regression line together with the actual data points
plt.xlabel('Test input')
plt.ylabel('Test Output or Predicted output')
plt.plot(test_input, test_output, '+', label='Actual values')  # raw observations
plt.plot(test_input, y_pred, label='Predicted values')  # fitted line
plt.legend()
plt.show()

### Using Sklearn

In [None]:
# NOTE: this import rebinds the name LinearRegression, so from here on it
# refers to scikit-learn's estimator rather than the class defined earlier.
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(train_input, train_output)

# score() is evaluated on the same data the model was fitted on
print(reg.score(train_input, train_output))

In [None]:
# Prediction on test data: show the fitted coefficient and intercept (one per
# line), then predict the held-out inputs
print(reg.coef_, reg.intercept_, sep='\n')
y_pred = reg.predict(test_input)

In [None]:
# Plot the regression line together with the actual data points
plt.xlabel('Test input')
plt.ylabel('Test Output or Predicted output')
plt.plot(test_input, test_output, '+', label='Actual values')  # raw observations
plt.plot(test_input, y_pred, label='Predicted values')  # fitted line
plt.legend()
plt.show()

## Comparing algorithms for regression

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes

# Suppress warnings
# NOTE(review): the 'ignore' action with no category or module filter hides
# every warning raised after this point, not only cosmetic ones; consider
# narrowing it if a model behaves unexpectedly.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the diabetes dataset bundled with scikit-learn
diabetes = load_diabetes()

# Build a DataFrame with one named column per feature, then append the
# regression target as an extra 'target' column
data = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
data['target'] = diabetes.target

# Preview the first rows (shown as the cell output)
data.head()

In [None]:
# General dataset information (info() prints its report itself)
data.info()

# Descriptive statistics of the dataset.
# Printed explicitly: a bare expression that is not the last statement of the
# cell is evaluated and then discarded, so this table was never displayed.
print(data.describe())

# Heatmap showing the correlation between features and the target variable
plt.figure(figsize=(10, 8))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between features and diabetes progression')
plt.show()

# Pairplot to visualize relationships between target and some features
sns.pairplot(data[['target', 'bmi', 's5', 'bp']])
plt.show()

In [None]:
# Separate the target column from the feature columns
y = data['target']
X = data.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Quick look at the training features and targets
print(X_train.head())
print(y_train.head())

In [None]:
# Function to evaluate models and plot predictions
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model``, print its train/test scores and plot test predictions.

    ``model`` must provide ``fit(X, y)`` and ``predict(X)``. ``model_name``
    labels the printed report and the plot title. The report lists the
    train R^2, test R^2 and test RMSE; the figure is a scatter of actual vs.
    predicted test values with a dashed red identity line. Returns ``None``.
    """
    model.fit(X_train, y_train)
    fitted_train = model.predict(X_train)
    fitted_test = model.predict(X_test)

    # Text report
    print(f'Model: {model_name}')
    train_r2 = r2_score(y_train, fitted_train)
    print(f'Train R^2: {train_r2:.4f}')
    test_r2 = r2_score(y_test, fitted_test)
    print(f'Test R^2: {test_r2:.4f}')
    test_rmse = np.sqrt(mean_squared_error(y_test, fitted_test))
    print(f'Test RMSE: {test_rmse:.4f}')
    print('-' * 40)

    # Predictions vs. real values; points on the dashed line are exact predictions
    lowest, highest = y_test.min(), y_test.max()
    plt.figure(figsize=(4, 4))
    plt.scatter(y_test, fitted_test, label='Predictions', color='blue')
    plt.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'Actual vs Predicted - {model_name}')
    plt.legend()
    plt.show()

In [None]:
# Instantiate the four regressors being compared
linear_reg = LinearRegression()                                     # 1. Linear Regression
ridge_reg = Ridge(alpha=1.0)                                        # 2. Ridge Regression
lasso_reg = Lasso(alpha=0.1)                                        # 3. Lasso Regression
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)   # 4. Random Forest Regression

# Fit and evaluate each one, in the order listed above
for estimator, label in (
    (linear_reg, 'Linear Regression'),
    (ridge_reg, 'Ridge Regression'),
    (lasso_reg, 'Lasso Regression'),
    (rf_reg, 'Random Forest Regression'),
):
    evaluate_model(estimator, X_train, X_test, y_train, y_test, label)

In [None]:
# Model comparison based on R^2 score for training and testing sets
models = ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'Random Forest']
fitted_models = (linear_reg, ridge_reg, lasso_reg, rf_reg)  # same order as `models`
train_scores = [r2_score(y_train, est.predict(X_train)) for est in fitted_models]
test_scores = [r2_score(y_test, est.predict(X_test)) for est in fitted_models]

# Grouped bar plot: one train bar and one test bar per model.
# Note that `ax` is rebound here to the Axes object returned by subplots().
fig, ax = plt.subplots(figsize=(10, 6))
index = np.arange(len(models))
bar_width = 0.35

bar1 = ax.bar(index, train_scores, bar_width, label='Train R^2')
bar2 = ax.bar(index + bar_width, test_scores, bar_width, label='Test R^2')

ax.set(xlabel='Model', ylabel='R^2 Score', title='Model performance comparison')
ax.set_xticks(index + bar_width / 2)  # centre each tick between its two bars
ax.set_xticklabels(models)
ax.legend()

plt.tight_layout()
plt.show()

## Another example: what happens with a discrete target?

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
# NOTE(review): the 'ignore' action with no category or module filter hides
# every warning raised after this point, not only cosmetic ones; consider
# narrowing it if a model behaves unexpectedly.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the red-wine quality CSV in the UCI repository
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "wine-quality/winequality-red.csv"
)

# The file is parsed with ';' as the field separator
data = pd.read_csv(url, sep=';')

# Preview the first rows (shown as the cell output)
data.head()

In [None]:
# Check for missing values (count of nulls per column).
# Printed explicitly: a bare expression that is not the last statement of the
# cell is evaluated and then discarded, so these tables were never displayed.
print(data.isnull().sum())

# Descriptive statistics
print(data.describe())

# Correlation heatmap to explore relationships between features and the target
plt.figure(figsize=(10, 8))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between features and wine quality')
plt.show()

# Pairplot to visualize relationships between important features and quality
sns.pairplot(data[['quality', 'alcohol', 'sulphates', 'pH', 'residual sugar']])
plt.show()

In [None]:
# Target (y) is the quality column; features (X) are all remaining columns
y = data['quality']
X = data.drop(columns='quality')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Function to evaluate models and visualize results
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model``, print its train/test scores and plot test predictions.

    ``model`` must provide ``fit(X, y)`` and ``predict(X)``. ``model_name``
    labels the printed report and the plot title. The report lists the
    train R^2, test R^2 and test RMSE; the figure is a scatter of actual vs.
    predicted test values with a dashed red identity line. Returns ``None``.
    """
    model.fit(X_train, y_train)
    fitted_train = model.predict(X_train)
    fitted_test = model.predict(X_test)

    # Text report
    print(f'Model: {model_name}')
    train_r2 = r2_score(y_train, fitted_train)
    print(f'Train R^2: {train_r2:.4f}')
    test_r2 = r2_score(y_test, fitted_test)
    print(f'Test R^2: {test_r2:.4f}')
    test_rmse = np.sqrt(mean_squared_error(y_test, fitted_test))
    print(f'Test RMSE: {test_rmse:.4f}')
    print('-' * 40)

    # Actual vs. predicted values; points on the dashed line are exact predictions
    lowest, highest = y_test.min(), y_test.max()
    plt.figure(figsize=(6, 6))
    plt.scatter(y_test, fitted_test, label='Predictions', color='blue')
    plt.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'Actual vs Predicted - {model_name}')
    plt.legend()
    plt.show()

In [None]:
# Instantiate the four regressors being compared
linear_reg = LinearRegression()                                     # 1. Linear Regression
ridge_reg = Ridge(alpha=1.0)                                        # 2. Ridge Regression
lasso_reg = Lasso(alpha=0.1)                                        # 3. Lasso Regression
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)   # 4. Random Forest Regression

# Fit and evaluate each one, in the order listed above
for estimator, label in (
    (linear_reg, 'Linear Regression'),
    (ridge_reg, 'Ridge Regression'),
    (lasso_reg, 'Lasso Regression'),
    (rf_reg, 'Random Forest Regression'),
):
    evaluate_model(estimator, X_train, X_test, y_train, y_test, label)

In [None]:
# Compare model performance based on R^2 scores for training and testing sets
models = ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'Random Forest']
fitted_models = (linear_reg, ridge_reg, lasso_reg, rf_reg)  # same order as `models`
train_scores = [r2_score(y_train, est.predict(X_train)) for est in fitted_models]
test_scores = [r2_score(y_test, est.predict(X_test)) for est in fitted_models]

# Grouped bar plot: one train bar and one test bar per model
fig, ax = plt.subplots(figsize=(10, 6))
index = np.arange(len(models))
bar_width = 0.35

bar1 = ax.bar(index, train_scores, bar_width, label='Train R^2')
bar2 = ax.bar(index + bar_width, test_scores, bar_width, label='Test R^2')

ax.set(xlabel='Model', ylabel='R^2 Score', title='Model performance comparison')
ax.set_xticks(index + bar_width / 2)  # centre each tick between its two bars
ax.set_xticklabels(models)
ax.legend()

plt.tight_layout()
plt.show()

## Linear Regression vs. Multivariate Regression

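This section illustrates the difference between **Linear Regression** and **Multivariate Regression** using the **Wine Quality** dataset. Both types of regression are widely used for predictive modeling but differ in the number of predictors (features) used. (Terminology note: in the statistics literature the one-feature case is usually called *simple* linear regression and the many-feature case *multiple* linear regression; "multivariate" is used here informally to mean "several input features".)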

### Key Differences:
- **Linear Regression**: Models the relationship between a single independent variable and the dependent variable.
- **Multivariate Regression**: Models the relationship between multiple independent variables and the dependent variable.

### Linear Regression (Univariate)

Linear Regression is a simple model where only one feature (independent variable) is used to predict a single target (dependent variable). The relationship is represented by a straight line and assumes a linear relationship between the feature and the target.

For example, we can use **alcohol content** to predict the wine quality. The formula for Linear Regression is:

$$
\text{quality} = \beta_0 + \beta_1 \times \text{alcohol}
$$

Where:
- $\beta_0$ is the intercept (where the line crosses the y-axis).
- $\beta_1$ is the coefficient (or weight) for the alcohol feature.

In [None]:
# Example code: Linear Regression with one feature (alcohol)
y = data['quality']
X_uni = data[['alcohol']]  # double brackets keep a one-column DataFrame
X_train, X_test, y_train, y_test = train_test_split(
    X_uni, y, test_size=0.2, random_state=42
)

# Fit on the training split, then score the held-out predictions
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)

print(f'R^2: {r2_score(y_test, y_pred)}')


### Multivariate Regression

In Multivariate Regression, we predict the target variable using multiple features instead of just one. This helps capture the complex relationships between multiple factors that influence the outcome. The formula for Multivariate Regression is:

$$
\text{quality} = \beta_0 + \beta_1 \times \text{alcohol} + \beta_2 \times \text{sulphates} + \dots + \beta_n \times \text{feature}_n
$$

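Each feature $\text{feature}_n$ contributes to the prediction through its coefficient $\beta_n$, which gives the change in predicted quality for a one-unit increase in that feature while the other features are held fixed. Coefficients can only be compared as measures of feature importance when the features are on a common scale.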


In [None]:
# Example code: Multivariate Regression with multiple features
y = data['quality']
X_multi = data.drop(columns='quality')  # every column except the target
X_train, X_test, y_train, y_test = train_test_split(
    X_multi, y, test_size=0.2, random_state=42
)

# Fit on the training split, then score the held-out predictions
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)

print(f'R^2: {r2_score(y_test, y_pred)}')


### Visual Representation of Linear vs. Multivariate Regression

- **Linear Regression**: The relationship between the predictor and the target is represented as a straight line in 2D space.
- **Multivariate Regression**: The relationship is modeled in multi-dimensional space, so it cannot be drawn as a single line in 2D. Instead, we can think of it as a hyperplane in n-dimensional space (where $n$ is the number of features).

In practice, Multivariate Regression often leads to better predictions when multiple factors contribute to the outcome.


## Conclusion

The key difference between **Linear Regression** and **Multivariate Regression** lies in the number of features used:
- **Linear Regression** involves just one feature to predict the target.
- **Multivariate Regression** uses multiple features to predict the target, which generally improves the prediction accuracy.

Multivariate Regression is especially useful in real-world datasets where several factors influence the outcome, as in the case of wine quality, where many physicochemical properties affect the final quality rating.