<a href="https://colab.research.google.com/github/harikanemala/Machine-Learning/blob/main/Bias5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Sample data (synthetic)
data = {
    'Feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Feature2': [5, 4, 3, 2, 1, 10, 9, 8, 7, 6],
    'Target': [1, 2, 1, 2, 1, 2, 1, 2, 1, 2]
}

df = pd.DataFrame(data)

# 1. Removing Duplicates
df_no_duplicates = df.drop_duplicates()
print("Dataset after removing duplicates:")
print(df_no_duplicates)

# 2. Cross-Validation

# Features and target
X = df_no_duplicates.drop('Target', axis=1)
y = df_no_duplicates['Target']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models for comparison (Linear Regression and Decision Tree)
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor()
}

# Perform 5-fold Cross Validation for each model
for model_name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    print(f"{model_name} - Cross-validation MSE scores: {scores}")
    print(f"{model_name} - Mean Cross-validation MSE: {scores.mean()}")
    print(f"{model_name} - Standard Deviation of Cross-validation MSE: {scores.std()}")

# 3. Bias and Variance Calculation

# Bias is measured using the error of the model on the training set
# Variance is measured by the difference between training error and test error

# Initialize models
linear_model = LinearRegression()
tree_model = DecisionTreeRegressor()

# Train models
linear_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)

# Predict on test set
linear_predictions = linear_model.predict(X_test)
tree_predictions = tree_model.predict(X_test)

# Calculate MSE (Mean Squared Error) for both models on test data
linear_test_error = mean_squared_error(y_test, linear_predictions)
tree_test_error = mean_squared_error(y_test, tree_predictions)

# Training error for linear model
linear_train_predictions = linear_model.predict(X_train)
linear_train_error = mean_squared_error(y_train, linear_train_predictions)

# Training error for decision tree
tree_train_predictions = tree_model.predict(X_train)
tree_train_error = mean_squared_error(y_train, tree_train_predictions)

# Bias and Variance Calculation
def calculate_bias_variance(train_error, test_error):
    bias = train_error - test_error
    variance = test_error - bias
    return bias, variance

# Bias and Variance for both models
linear_bias, linear_variance = calculate_bias_variance(linear_train_error, linear_test_error)
tree_bias, tree_variance = calculate_bias_variance(tree_train_error, tree_test_error)

# Display Bias and Variance
print("\nBias and Variance for Linear Regression:")
print(f"Bias: {linear_bias}")
print(f"Variance: {linear_variance}")

print("\nBias and Variance for Decision Tree:")
print(f"Bias: {tree_bias}")
print(f"Variance: {tree_variance}")



Dataset after removing duplicates:
   Feature1  Feature2  Target
0         1         5       1
1         2         4       2
2         3         3       1
3         4         2       2
4         5         1       1
5         6        10       2
6         7         9       1
7         8         8       2
8         9         7       1
9        10         6       2
Linear Regression - Cross-validation MSE scores: [-0.27777778 -0.3067602  -1.         -0.3067602  -0.27777778]
Linear Regression - Mean Cross-validation MSE: -0.43381519274376423
Linear Regression - Standard Deviation of Cross-validation MSE: 0.2833889644972734
Decision Tree - Cross-validation MSE scores: [-0.5 -1.  -1.  -1.  -0.5]
Decision Tree - Mean Cross-validation MSE: -0.8
Decision Tree - Standard Deviation of Cross-validation MSE: 0.2449489742783178

Bias and Variance for Linear Regression:
Bias: -0.5632653061224491
Variance: 1.2979591836734696

Bias and Variance for Decision Tree:
Bias: -1.0
Variance: 2.0
