In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  #data visualization library
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix  # evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures    # function to generate polynomial and interaction features
from sklearn.linear_model import LinearRegression, HuberRegressor    # classes providing Linear Regression with ordinary squared error loss and Huber loss, respectively
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


In [11]:
# Cleaning and modifying data

df = pd.read_csv('dementia_dataset.csv')
df.drop(columns = ['Subject ID', 'MRI ID', 'Hand', 'MR Delay'],inplace=True)
df.columns
df_one = pd.get_dummies(df["Group"])
df_two = pd.concat((df_one, df), axis=1)
df_two = df_two.drop(["Group"], axis=1)
df_two = df_two.drop(["Nondemented"], axis=1)
df_two = df_two.drop(['Converted'], axis=1)

df_three = pd.get_dummies(df_two["M/F"])
df_four = pd.concat((df_three, df_two), axis=1)
df_four = df_four.drop(['F'], axis=1)
df_four = df_four.drop(['M/F'], axis=1)
data = df_four.rename(columns={"M": "Gender"})
data = data[['Gender', 'Visit', 'Age', 'EDUC', 'SES', 'MMSE',
       'CDR', 'eTIV', 'nWBV', 'ASF', 'Demented']]
data['SES'] = data['SES'].fillna(np.mean(data['SES']))
data['MMSE'] = data['MMSE'].fillna(np.mean(data['MMSE']))

In [12]:
from sklearn.model_selection import train_test_split
lr = LogisticRegression()
scaler = StandardScaler()
X = data.iloc[:,:10].to_numpy()
y = data.iloc[:,-1].to_numpy().reshape(-1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=0)


In [14]:
lr.fit(scaler.fit_transform(X_train), y_train)
y_pred_train = lr.predict(scaler.fit_transform(X_train))
y_pred_val = lr.predict(scaler.fit_transform(X_val))
err_train = accuracy_score(y_train, y_pred_train)
err_val = accuracy_score(y_val, y_pred_val)
print(f"Training error: {err_train}")
print(f"Validation error: {err_val}")


Training error: 0.9697986577181208
Validation error: 0.96


In [15]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())])
pipe.fit(X_train, y_train)
err_train_pip = pipe.score(X_train, y_train)
err_val_pip = pipe.score(X_val, y_val)
# Print training and validation errors
print(f"Training error with pipeline: {err_train_pip}")
print(f"Validation error pipeline: {err_val_pip}")

Training error with pipeline: 0.9697986577181208
Validation error pipeline: 0.9733333333333334


27.342318059299195