In [3]:
import numpy as np
import csv
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier


### Load data

def read_csv(file):
    """Read an integer-valued CSV file whose first row is a header.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A ``(titles, rows_int)`` tuple. ``titles`` is the header row as a
        list of strings; ``rows_int`` is a 2-D integer NumPy array with one
        row per non-blank data line.

    Raises:
        ValueError: If the file contains no header row, or if a data cell
            cannot be converted with ``int``.
    """
    with open(file, newline='') as f:
        reader = csv.reader(f)
        # Take the header first so an empty file is reported clearly instead
        # of failing later because ``titles`` was never assigned.
        titles = next(reader, None)
        if titles is None:
            raise ValueError(f'{file!r} is empty: expected a header row')
        # csv.reader yields [] for blank lines (e.g. a trailing newline);
        # keeping them would make the resulting array ragged.
        rows = [[int(cell) for cell in row] for row in reader if row]
    if not rows:
        # Header-only file: keep the result 2-D so column slicing still works.
        return titles, np.empty((0, len(titles)), dtype=int)
    return titles, np.array(rows)
        

# Labelled training file: the header must have nine columns, the last of
# which is the 'Digit' label.
titles, rows_train = read_csv(
    os.path.join(os.getcwd(), 'studentsdigits-train.csv')
)
assert titles[-1] == 'Digit' and len(titles) == 9, 'Not train set'
# Every column except the last is a feature; the last column is the label.
X_train, Y_train = rows_train[:, 0:len(titles) - 1], rows_train[:, -1]
print(X_train.shape, Y_train.shape, sep='\n')

# Unlabelled test file: the header must have exactly eight columns.
titles, rows_test = read_csv(
    os.path.join(os.getcwd(), "studentsdigits-test.csv")
)
assert len(titles) == 8, 'Not test set'
X_test = rows_test
print(X_test.shape)

# Feature Engineering: Create additional features based on the pen positions
def create_features(X):
    """Append segment lengths and directions to pen-position features.

    Even-indexed columns of ``X`` are treated as x coordinates and
    odd-indexed columns as the matching y coordinates, i.e. the layout
    ``x0, y0, x1, y1, ...``.

    Args:
        X: 2-D array-like with an even number of columns.

    Returns:
        A 2-D array made of the original columns, followed by the Euclidean
        distance between each pair of consecutive points, followed by the
        ``arctan2(dy, dx)`` angle in radians of each of those segments.

    Raises:
        ValueError: If ``X`` is not 2-D or has an odd number of columns.
    """
    # Accept nested lists as well as arrays; the slicing below needs ndarray.
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(f'X must be 2-D, got {X.ndim} dimension(s)')
    if X.shape[1] % 2:
        # With an odd column count the x and y slices differ in length and
        # the differences could broadcast into meaningless features.
        raise ValueError(
            f'X must have an even number of columns, got {X.shape[1]}'
        )
    # Per-segment coordinate deltas, computed once and reused for both
    # feature families.
    dx = np.diff(X[:, ::2], axis=1)
    dy = np.diff(X[:, 1::2], axis=1)
    distances = np.sqrt(dx**2 + dy**2)
    angles = np.arctan2(dy, dx)
    return np.hstack((X, distances, angles))

# Augment both raw coordinate matrices with the engineered features.
X_train_fe, X_test_fe = (create_features(data) for data in (X_train, X_test))

# Hold out 10% of the labelled samples for validation; the fixed seed keeps
# the split reproducible. X_train_fe and Y_train are rebound to the
# remaining 90%.
X_train_fe, X_val_fe, Y_train, Y_val = train_test_split(
    X_train_fe,
    Y_train,
    test_size=0.1,
    random_state=42,
)

# Hyper-parameter candidates that RandomizedSearchCV samples from
param_dist = {
    'n_estimators': [300, 400, 500],
    'max_depth': [None, 30, 40],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2]
}

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Randomized search with 3-fold cross-validation over 10 sampled candidates.
# The search is seeded like every other randomized step in this script:
# without random_state the 10 candidates drawn from the 36 combinations
# (and therefore the reported "best parameters") change from run to run.
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Fit the random search to the training portion of the data
random_search.fit(X_train_fe, Y_train)

# Print the best parameters
print(f'Best parameters found: {random_search.best_params_}')

# Use the best estimator for further predictions
best_rf_model = random_search.best_estimator_

# Evaluate the best Random Forest model on the held-out validation set
rf_val_predictions = best_rf_model.predict(X_val_fe)
rf_val_accuracy = accuracy_score(Y_val, rf_val_predictions)

# Print the final validation accuracy for Random Forest
print(f'Final validation accuracy (Random Forest): {rf_val_accuracy * 100:.2f}%')

# Small fully connected network (two hidden layers of 64 and 32 units),
# seeded for reproducibility and trained on the same training portion as
# the forest. fit() returns the estimator itself, so it can be chained.
nn_model = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=200,
    random_state=42,
).fit(X_train_fe, Y_train)

# Score the network on the held-out validation samples.
nn_val_predictions = nn_model.predict(X_val_fe)
nn_val_accuracy = accuracy_score(Y_val, nn_val_predictions)

# Report the validation accuracy as a percentage.
print(f'Final validation accuracy (Neural Network): {nn_val_accuracy * 100:.2f}%')

# Keep whichever model scored strictly higher on the validation set; a tie
# goes to the neural network.
if rf_val_accuracy > nn_val_accuracy:
    best_model = best_rf_model
else:
    best_model = nn_model

# Label the unlabelled test samples with the selected model.
Y_test = best_model.predict(X_test_fe)

# Write one integer prediction per line for upload.
np.savetxt('upload_predictions.txt', Y_test, fmt='%d')


(3747, 8)
(3747,)
(3747, 8)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters found: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}
Final validation accuracy (Random Forest): 96.80%
Final validation accuracy (Neural Network): 97.87%
