In [13]:
import warnings
# Suppress specific warning related to KMeans
#warnings.filterwarnings("ignore", message="KMeans is known to have a memory leak on Windows with MKL")
warnings.filterwarnings("ignore")

# Regression (Linear Regression)

In [14]:
# Import necessary libraries
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the California Housing dataset
data = fetch_california_housing()
X = data.data  # features
y = data.target  # target variable (housing prices)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.5558915986952421


In [15]:
# Import necessary libraries
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pandas as pd

# Load the California Housing dataset
data = fetch_california_housing(as_frame=True)
X = data.data  # features as a DataFrame
y = data.target  # target variable (housing prices)

# Assume there's a categorical feature for demonstration
# Adding a synthetic categorical feature for illustration
X['ocean_proximity'] = pd.cut(X['MedInc'], bins=3, labels=["near", "far", "very_far"])

# Identify categorical and numerical features
categorical_features = ['ocean_proximity']
numerical_features = X.drop(columns=categorical_features).columns.tolist()

# Define transformers for different types of features

# Categorical Feature Engineering: One-Hot Encoding
categorical_transformer = OneHotEncoder()

# Numerical Feature Engineering: Polynomial Features and Standardization
numerical_transformer = Pipeline([
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a pipeline with preprocessor and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Ridge())
])

# Define a grid of hyperparameters to tune
param_grid = {
    'preprocessor__num__poly_features__degree': [1, 2, 3],  # Tuning polynomial degree
    'model__alpha': [0.01, 0.1, 1, 10, 100]  # Tuning regularization strength
}

# Initialize GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)

# Train the model with grid search
grid_search.fit(X_train, y_train)

# Best model after tuning
best_model = grid_search.best_estimator_

# Make predictions with the best model
predictions = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
print(f'Best Parameters: {grid_search.best_params_}')

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] END model__alpha=0.01, preprocessor__num__poly_features__degree=1; total time=   0.0s
[CV] END model__alpha=0.01, preprocessor__num__poly_features__degree=1; total time=   0.0s
[CV] END model__alpha=0.01, preprocessor__num__poly_features__degree=1; total time=   0.0s
[CV] END model__alpha=0.01, preprocessor__num__poly_features__degree=1; total time=   0.0s
[CV] END model__alpha=0.01, preprocessor__num__poly_features__degree=1; total time=   0.0s
[CV] END model__alpha=0.01, preprocessor__num__poly_features__degree=2; total time=   0.0s
[CV] END model__alpha=0.01, preprocessor__num__poly_features__degree=2; total time=   0.0s
[CV] END model__alpha=0.01, preprocessor__num__poly_features__degree=2; total time=   0.0s
[CV] END model__alpha=0.01, preprocessor__num__poly_features__degree=2; total time=   0.0s
[CV] END model__alpha=0.01, preprocessor__num__poly_features__degree=2; total time=   0.0s
[CV] END model__alpha=0.01, p

# Classification  (e.g., Logistic Regression)

In [16]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the Iris dataset
data = load_iris()
X = data.data  # features
y = data.target  # target variable (species)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = LogisticRegression(max_iter=200)  # max_iter is set to ensure convergence

# Train the model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

conf_matrix = confusion_matrix(y_test, predictions)
print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 1.0
Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [17]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

# Load the Iris dataset
data = load_iris(as_frame=True)
X = data.data  # features as a DataFrame
y = data.target  # target variable (species)

# Assume there's a categorical feature for demonstration
# Adding a synthetic categorical feature for illustration
X['sepal_category'] = pd.cut(X['sepal length (cm)'], bins=3, labels=["short", "medium", "long"])

# Identify categorical and numerical features
categorical_features = ['sepal_category']
numerical_features = X.drop(columns=categorical_features).columns.tolist()

# Define transformers for different types of features

# Categorical Feature Engineering: One-Hot Encoding
categorical_transformer = OneHotEncoder()

# Numerical Feature Engineering: Polynomial Features and Standardization
numerical_transformer = Pipeline([
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a pipeline with preprocessor and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=200))
])

# Define a grid of hyperparameters to tune
param_grid = {
    'preprocessor__num__poly_features__degree': [1, 2],  # Tuning polynomial degree
    'model__C': [0.01, 0.1, 1, 10, 100],  # Tuning regularization strength
    'model__penalty': ['l1', 'l2'],  # Tuning the type of regularization (L1 or L2)
    'model__solver': ['liblinear', 'saga']  # Different solvers for Logistic Regression
}

# Initialize GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=2)

# Train the model with grid search
grid_search.fit(X_train, y_train)

# Best model after tuning
best_model = grid_search.best_estimator_

# Make predictions with the best model
predictions = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print(f'Best Parameters: {grid_search.best_params_}')

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END model__C=0.01, model__penalty=l1, model__solver=liblinear, preprocessor__num__poly_features__degree=1; total time=   0.0s
[CV] END model__C=0.01, model__penalty=l1, model__solver=liblinear, preprocessor__num__poly_features__degree=1; total time=   0.0s
[CV] END model__C=0.01, model__penalty=l1, model__solver=liblinear, preprocessor__num__poly_features__degree=1; total time=   0.0s
[CV] END model__C=0.01, model__penalty=l1, model__solver=liblinear, preprocessor__num__poly_features__degree=1; total time=   0.0s
[CV] END model__C=0.01, model__penalty=l1, model__solver=liblinear, preprocessor__num__poly_features__degree=1; total time=   0.0s
[CV] END model__C=0.01, model__penalty=l1, model__solver=liblinear, preprocessor__num__poly_features__degree=2; total time=   0.0s
[CV] END model__C=0.01, model__penalty=l1, model__solver=liblinear, preprocessor__num__poly_features__degree=2; total time=   0.0s
[CV] END model__C=0.0

# Clustering (e.g., K-Means Clustering)

In [18]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import pandas as pd

# Load the Iris dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the KMeans model
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=42)

# Train the KMeans model
kmeans.fit(X_scaled)

# Predict the cluster labels
cluster_labels = kmeans.predict(X_scaled)

# Evaluate the clustering using silhouette score
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print(f'Silhouette Score: {silhouette_avg}')

# Print cluster centers
print('Cluster Centers:')
print(kmeans.cluster_centers_)

Silhouette Score: 0.45994823920518635
Cluster Centers:
[[-0.05021989 -0.88337647  0.34773781  0.2815273 ]
 [-1.01457897  0.85326268 -1.30498732 -1.25489349]
 [ 1.13597027  0.08842168  0.99615451  1.01752612]]


In [19]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, make_scorer
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Load the Iris dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)

# Assume there's a categorical feature for demonstration
# Adding a synthetic categorical feature for illustration
X['sepal_category'] = pd.cut(X['sepal length (cm)'], bins=3, labels=["short", "medium", "long"])

# Identify categorical and numerical features
categorical_features = ['sepal_category']
numerical_features = X.drop(columns=categorical_features).columns.tolist()

# Define transformers for different types of features

# Categorical Feature Engineering: One-Hot Encoding
categorical_transformer = OneHotEncoder()

# Numerical Feature Engineering: Standardization
numerical_transformer = StandardScaler()

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

# Initialize a pipeline with preprocessor and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('kmeans', KMeans(n_init='auto'))
])

# Define a grid of hyperparameters to tune
param_grid = {
    'kmeans__n_clusters': [2, 3, 4, 5],  # Tuning number of clusters
    'kmeans__init': ['k-means++', 'random'],  # Tuning initialization method
    'kmeans__max_iter': [300, 400, 500]  # Tuning the maximum number of iterations
}

# Create a custom scorer based on silhouette score
def silhouette_scorer(estimator, X):
    X_transformed = estimator.named_steps['preprocessor'].transform(X)
    labels = estimator.named_steps['kmeans'].predict(X_transformed)
    return silhouette_score(X_transformed, labels)

custom_scorer = make_scorer(silhouette_scorer, greater_is_better=True)

# Initialize GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=custom_scorer, verbose=2)

# Fit the model with grid search
grid_search.fit(X)

# Best model after tuning
best_model = grid_search.best_estimator_

# Predict the cluster labels for the data
X_transformed = best_model.named_steps['preprocessor'].transform(X)
cluster_labels = best_model.named_steps['kmeans'].predict(X_transformed)

# Evaluate the model
silhouette_avg = silhouette_score(X_transformed, cluster_labels)
print(f'Silhouette Score: {silhouette_avg}')
print(f'Best Parameters: {grid_search.best_params_}')

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END kmeans__init=k-means++, kmeans__max_iter=300, kmeans__n_clusters=2; total time=   0.0s
[CV] END kmeans__init=k-means++, kmeans__max_iter=300, kmeans__n_clusters=2; total time=   0.0s
[CV] END kmeans__init=k-means++, kmeans__max_iter=300, kmeans__n_clusters=2; total time=   0.0s
[CV] END kmeans__init=k-means++, kmeans__max_iter=300, kmeans__n_clusters=2; total time=   0.0s
[CV] END kmeans__init=k-means++, kmeans__max_iter=300, kmeans__n_clusters=2; total time=   0.0s
[CV] END kmeans__init=k-means++, kmeans__max_iter=300, kmeans__n_clusters=3; total time=   0.0s
[CV] END kmeans__init=k-means++, kmeans__max_iter=300, kmeans__n_clusters=3; total time=   0.0s
[CV] END kmeans__init=k-means++, kmeans__max_iter=300, kmeans__n_clusters=3; total time=   0.0s
[CV] END kmeans__init=k-means++, kmeans__max_iter=300, kmeans__n_clusters=3; total time=   0.0s
[CV] END kmeans__init=k-means++, kmeans__max_iter=300, kmeans__n_clusters=