<a href="https://colab.research.google.com/github/hucarlos08/GEO-ML/blob/main/NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the Titanic dataset
titanic = sns.load_dataset('titanic')

# Select a mix of numeric and categorical predictors; 'survived' is the target
features = ['age', 'fare', 'sex', 'class', 'embarked']
X = titanic[features]
y = titanic['survived']

# Split the data into a training set (80%) and a test set (20%);
# the fixed random_state keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing for numeric columns (replace NaNs with median and scale values)
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Define preprocessing for categorical features (replace NaNs with 'missing' and one-hot encode).
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# instead of raising at predict time.
categorical_features = ['sex', 'class', 'embarked']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps.
# GaussianNB only accepts dense arrays, while OneHotEncoder produces a sparse
# matrix by default. Left at its default, ColumnTransformer decides between
# sparse and dense output from the overall density of the stacked result, so
# whether the pipeline runs would depend on the data and the feature mix.
# sparse_threshold=0 forces a dense result unconditionally.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    sparse_threshold=0)

# Create a preprocessing and classifier pipeline, so the imputers, scaler and
# encoder are fitted on the training split only
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', GaussianNB())])

# Fit the pipeline to the training data
clf.fit(X_train, y_train)

# Use the trained pipeline to make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate the accuracy of the classifier (fraction of correct test predictions)
accuracy = accuracy_score(y_test, y_pred)

print(f'Naive Bayes model accuracy: {accuracy:.2f}')


Naive Bayes model accuracy: 0.66


In [6]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Fetch scikit-learn's bundled digits dataset and pull out the feature
# matrix and the label vector.
digits = load_digits()
X, y = digits.data, digits.target

# Hold out 20% of the samples for evaluation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Two stages: LDA projects the features onto 5 discriminant components,
# then Gaussian Naive Bayes classifies in that reduced space.
lda = LDA(n_components=5)
gnb = GaussianNB()
clf = Pipeline([('lda', lda), ('gnb', gnb)])

# Train on the training split, then label the held-out split
# (fit returns the pipeline itself, so the calls can be chained).
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Share of held-out samples that received the correct label.
accuracy = accuracy_score(y_test, y_pred)
print(f'Combined LDA and Naive Bayes model accuracy: {accuracy:.2f}')


Combined LDA and Naive Bayes model accuracy: 0.92


## What about missing values?

In [12]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Read the Pima Indians diabetes CSV; the file is read with explicitly
# supplied column names, the last of which ('class') is the target.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(url, names=names)

# Knock out feature values at random. Each feature cell gets an independent
# 0/1 draw: 1 (True) keeps the value, 0 (False) turns it into NaN, so roughly
# half of the feature entries end up missing. The target column is untouched.
feature_cols = names[:-1]
np.random.seed(0)
mask = np.random.randint(
    0, 2, size=(len(dataframe), len(feature_cols))
).astype(bool)
dataframe[feature_cols] = dataframe[feature_cols].where(mask)

# Separate predictors from the label.
X = dataframe[feature_cols]
y = dataframe['class']

# 80/20 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Mean-impute the NaNs, then classify with Gaussian Naive Bayes. Keeping the
# imputer inside the pipeline means the column means are learned from the
# training split only.
model = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('classifier', GaussianNB()),
])

# Train, then label the held-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Share of held-out samples that received the correct label.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of model with missing data: {accuracy:.2f}')


Accuracy of model with missing data: 0.69
