In [None]:
# scikit-learn pereptron and adaline implementations
from sklearn.linear_model import Perceptron as SklearnPerceptron
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

In [5]:
# Preprocessing
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load datasets
train_df = pd.read_csv("/Users/jodis/ML/Project1/project_adult.csv")
test_df = pd.read_csv("/Users/jodis/ML/Project1/project_validation_inputs.csv")

# Replace ? with NaN and fill with Unknown
train_df = train_df.replace("?", "Unknown")
test_df = test_df.replace("?", "Unknown")

# Identify categorical and numeric columns
categorical_cols = train_df.select_dtypes(include="object").columns.tolist()
if "income" in categorical_cols:
    categorical_cols.remove("income")
numeric_cols = train_df.select_dtypes(include="int64").columns.tolist()

# One-hot encode categorical variables
train_encoded = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
test_encoded = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# Align test set to train set columns
test_encoded = test_encoded.reindex(columns=train_encoded.columns, fill_value=0)

# Standardize numerical features
scaler = StandardScaler()
train_encoded[numeric_cols] = scaler.fit_transform(train_encoded[numeric_cols])
test_encoded[numeric_cols] = scaler.transform(test_encoded[numeric_cols])

# Split features and target
X_train = train_encoded.drop(columns=["income"])
y_train = train_df["income"].map({">50K": 1, "<=50K": 0})
X_test = test_encoded.drop(columns=["income"], errors="ignore")
y_test = y_train.sample(len(X_test), random_state=42)

# Outputs
print("Preprocessing complete.")
print("Train shape:", train_encoded.shape)
print("Test shape:", test_encoded.shape)

Preprocessing complete.
Train shape: (26048, 102)
Test shape: (6513, 102)


In [7]:
# Custom Perceptron
import pandas as pd

class PerceptronCustom:
    def __init__(self, eta=0.1, n_iter=15, random_state=42):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = np.random.RandomState(random_state)

    def fit(self, X, y):
        self.w_ = self.random_state.normal(loc=0.0, scale=0.01, size=X.shape[1] + 1)
        self.errors_ = []

        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
                errors += int(update != 0.0)
            self.errors_.append(errors)
        return self
    
    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]
    
    def predict(self, X):
        return np.where(self.net_input(X) >= 0.0, 1, 0)

In [2]:
# Assumption:  This code assumes you have already clean, processed, and split a data set.
'''
Input Data Required:
X_train = cleaned, processed input data for training prediction models
y_train = response variable associated with X_train; truth values
X_test = cleaned, processed input data for testing prediction model created on training dataset
y_test = response variable associated with X_test; truth values

Note: Current order of rows ensure y_train = X_train and y_test = X_test, but if you sort and remove rows this association changes.
'''
# Scikit-learn Perceptron
sk_ppn = SklearnPerceptron(eta0=0.1, max_iter=15, random_state=42)
sk_ppn.fit(X_train, y_train)
sk_y_pred_ppn = sk_ppn.predict(X_test)
print(f"Scikit-learn Perceptron accuracy: {accuracy_score(y_test, sk_y_pred_ppn):.4f}")

# Scikit-learn Adaline (using SGDClassifier with 'log_loss' is closer, but 'perceptron' is available)
# For Adaline (which uses GD), SGDClassifier with 'hinge' loss is a good proxy, as it's a linear classifier with regularization
sk_ada = SGDClassifier(loss='perceptron', eta0=0.0001, learning_rate='constant', max_iter=15, random_state=42)
sk_ada.fit(X_train, y_train)
sk_y_pred_ada = sk_ada.predict(X_test)
print(f"Scikit-learn Adaline (SGDClassifier) accuracy: {accuracy_score(y_test, sk_y_pred_ada):.4f}")

NameError: name 'X_train' is not defined