Classify if Demented, Nondemented

In [1]:
#set up a ColumnTransformer with StandardScaler for numerical features and OneHotEncoder for categorical features.
#set up and training a LinearRegression model using scikit-learn, including data preprocessing steps within a Pipeline.
#implement polynomial regression
#perform hyperparameter tuning for a polynomial regression model
#evaluate the performance of a regression model on test data
#use OneHotEncoder with handle_unknown='ignore' within a preprocessing pipeline to handle unseen categories during model training and evaluation
#set up and execute cross_val_score or GridSearchCV to perform cross-validation

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


data = fetch_california_housing(as_frame=True)
df = data.frame


df['Region'] = pd.cut(df['Latitude'], bins=3, labels=['North', 'Central', 'South'])

# 2. Feature separation
X = df.drop('MedHouseVal', axis=1)
y = df['MedHouseVal']

# 3. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Define columns
numerical_cols = X.select_dtypes(include=['float64', 'int']).columns.tolist()
categorical_cols = ['Region']

# 5. Preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# 6. Pipeline: Linear Regression
linear_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# 7. Fit Linear Regression
linear_pipeline.fit(X_train, y_train)
y_pred = linear_pipeline.predict(X_test)

print("Linear Regression Results:")
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))

# 8. Polynomial Regression with pipeline
poly_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('regressor', LinearRegression())
])

# 9. Grid Search for best polynomial degree
param_grid = {
    'poly__degree': [1, 2, 3]
}

grid_search = GridSearchCV(poly_pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("\nBest polynomial degree:", grid_search.best_params_['poly__degree'])

# 10. Evaluation on test data using best model
best_poly_model = grid_search.best_estimator_
y_poly_pred = best_poly_model.predict(X_test)

print("Polynomial Regression Results:")
print("MSE:", mean_squared_error(y_test, y_poly_pred))
print("R²:", r2_score(y_test, y_poly_pred))

# 11. Cross-validation scores
cv_scores = cross_val_score(best_poly_model, X, y, cv=5, scoring='r2')
print("\nCross-validation R² scores:", cv_scores)
print("Mean CV R²:", np.mean(cv_scores))


Linear Regression Results:
MSE: 0.551304620677184
R²: 0.579288123149722

Best polynomial degree: 1
Polynomial Regression Results:
MSE: 0.551304620677184
R²: 0.579288123149722

Cross-validation R² scores: [0.54889665 0.47208401 0.55769937 0.52896402 0.65483872]
Mean CV R²: 0.5524965533254478


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv(r'C:\Users\shubham\OneDrive\Desktop\j\dementia.csv')

# Drop rows with missing target values
df = df.dropna(subset=['MMSE'])

# Separate features and target
X = df.drop('MMSE', axis=1)
y = df['MMSE']

# Select only numeric columns
numeric_cols = X.select_dtypes(include=[float, int]).columns

# Fill missing values in numeric columns with the mean
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].mean())

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

# Create preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # Impute missing numerical values
        ('scaler', StandardScaler())  # Scale the numerical features
    ]), numerical_features),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing categorical values
        ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One hot encode categorical features
    ]), categorical_features)
])


pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('model', LinearRegression())
])

param_grid = {
    'poly__degree': [1, 2, 3] 
}
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='r2', n_jobs=-1)  
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)

print("Best polynomial degree:", grid_search.best_params_['poly__degree'])
print("Best score from CV:", grid_search.best_score_)
print("Test R^2 Score:", r2_score(y_test, y_pred))
print("Test MSE:", mean_squared_error(y_test, y_pred))


cv_scores = grid_search.cv_results_['mean_test_score']
degrees = param_grid['poly__degree']
print("\nScores for each degree:")
for degree, score in zip(degrees, cv_scores):
    print(f"Degree {degree}: R² = {score:.4f}")


Best polynomial degree: 1
Best score from CV: 0.5754706814791134
Test R^2 Score: 0.6203531973214178
Test MSE: 5.821250974404927

Scores for each degree:
Degree 1: R² = 0.5755
Degree 2: R² = 0.5068
Degree 3: R² = 0.3723
