## Preliminary Classification

In [16]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# uncomment once you paste your mypytable.py into mysklearn package
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

# uncomment once you paste your myclassifiers.py into mysklearn package
import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MyDummyClassifier, MyNaiveBayesClassifier, MyDecisionTreeClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

In [21]:
flights = MyPyTable()
flights.load_from_file('normalized_flights.csv')

# drop first two columns
flights = MyPyTable(column_names=flights.column_names[2:], data=[row[2:] for row in flights.data])
flights.drop_rows([-1])
flights.pretty_print()

  month    day    dep_time    sched_dep_time    dep_delay  arr_time      sched_arr_time  arr_delay    carrier      flight  tailnum    origin    dest    air_time      distance    hour    minute  time_hour            name
-------  -----  ----------  ----------------  -----------  ----------  ----------------  -----------  ---------  --------  ---------  --------  ------  ----------  ----------  ------  --------  -------------------  ------------------------
      8     18        1826              1830           -4  2053.0                  2105  -12.0        WN              200  N949WN     EWR       MSY     175.0             1167      18        30  2013-08-18 18:00:00  Southwest Airlines Co.
     10     15        1651              1700           -9  2003.0                  2000  3.0          AA             1171  N3JNAA     LGA       DFW     207.0             1389      17         0  2013-10-15 17:00:00  American Airlines Inc.
      9     25         558               600           -2  658.0

In [18]:
flights.convert_to_numeric()
flights.remove_rows_with_missing_values()

# X is all the data, minus the dep_delay column that will be our class prediction
X = [row[:4] + row[5:] for row in flights.data]
X = [[row[i] for row in X] for i in range(len(X[0]))]

# discretize dep_delay values
y = []
not_discretized_y = flights.get_column('dep_delay')
for i in range(len(not_discretized_y)):
    y.append(myutils.discretize_delays(flights.data[i][4]))
    


In [None]:
import operator

# Dummy utility functions
def compute_euclidean_distance(row1, row2):
    return sum((r1 - r2) ** 2 for r1, r2 in zip(row1, row2)) ** 0.5

# Simulate the myutils module
class MyUtils:
    @staticmethod
    def compute_euclidean_distance(row1, row2):
        return compute_euclidean_distance(row1, row2)

myutils = MyUtils()

# Define the classifier as you provided earlier
class MyKNeighborsClassifier:
    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def kneighbors(self, X_test, categorical=False):
        distances = []
        neighbor_indices = []
        for test_instance in X_test:
            row_indexes_dists = []
            for i, row in enumerate(self.X_train):
                dist = myutils.compute_euclidean_distance(row, test_instance)
                row_indexes_dists.append((i, dist))
            row_indexes_dists.sort(key=operator.itemgetter(-1))
            top_k = row_indexes_dists[:self.n_neighbors]
            distances.append([dist for _, dist in top_k])
            neighbor_indices.append([i for i, _ in top_k])
        return distances, neighbor_indices

    def predict(self, X_test, categorical=False):
        _, neighbor_indices = self.kneighbors(X_test, categorical)
        y_predicted = []
        for indices in neighbor_indices:
            neighbor_labels = [self.y_train[i] for i in indices]
            most_common_label = max(set(neighbor_labels), key=neighbor_labels.count)
            y_predicted.append(most_common_label)
        return y_predicted

# Example usage
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = {
    "id": [296821, 40682, 331240, 309469, 226841, 289430, 133922, 192714, 138673, 12258],
    "year": [2013] * 10,
    "month": [8, 10, 9, 9, 6, 8, 2, 4, 3, 1],
    "day": [18, 15, 25, 1, 6, 11, 26, 30, 3, 15],
    "hour": [18, 17, 6, 12, 6, 6, 15, 9, 18, 6],
    "minute": [30, 0, 0, 8, 30, 0, 20, 32, 40, 30],
    "distance": [1167, 1389, 214, 2475, 1416, 212, 1096, 187, 2422, 282],
    "dep_delay": [-4, -9, -2, -3, -3, -4, -6, -2, -4, -5],
}

df = pd.DataFrame(data)

# Features and target
X = df[["year", "month", "day", "hour", "minute", "distance"]].values
y = df["dep_delay"].values

# Normalize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the classifier
knn = MyKNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Display results
print("Predictions:", y_pred)
print("Actual:", y_test)


In [19]:
knn_classifier = MyKNeighborsClassifier()
nb_classifier = MyNaiveBayesClassifier()
dummy_classifier = MyDummyClassifier()

knn_accuracy, knn_error_rate, knn_y_pred_cv, knn_y_true = myevaluation.cross_val_predict(knn_classifier, X, y, categorical=True)
nb_accuracy, nb_error_rate, nb_y_pred_cv, nb_y_true = myevaluation.cross_val_predict(nb_classifier, X, y)
dummy_accuracy, dummy_error_rate, dummy_y_pred_cv, dummy_y_true = myevaluation.cross_val_predict(dummy_classifier, X, y)


print(f"k Nearest Neighbors Classifier: accuracy = {knn_accuracy:.2f}, error rate = {knn_error_rate:.2f}")
print(f"Naive Bayes Classifier: accuracy = {nb_accuracy:.2f}, error rate = {nb_error_rate:.2f}")


k Nearest Neighbors Classifier: accuracy = 1.00, error rate = 0.00
Naive Bayes Classifier: accuracy = 1.00, error rate = 0.00


### Trying different subsets of features

In [20]:
X_diff_features = [flights.get_column('day'), flights.get_column('sched_dep_time'), flights.get_column('carrier'), flights.get_column('distance')]
X_diff_features = [[row[i] for row in X_diff_features] for i in range(len(X_diff_features[0]))]

knn_classifier = MyKNeighborsClassifier()
nb_classifier = MyNaiveBayesClassifier()

knn_accuracy, knn_error_rate, knn_y_pred_cv, knn_y_true = myevaluation.cross_val_predict(knn_classifier, X_diff_features, y, categorical=True)
nb_accuracy, nb_error_rate, nb_y_pred_cv, nb_y_true = myevaluation.cross_val_predict(nb_classifier, X_diff_features, y)

print(f"k Nearest Neighbors Classifier: accuracy = {knn_accuracy:.2f}, error rate = {knn_error_rate:.2f}")
print(f"Naive Bayes Classifier: accuracy = {nb_accuracy:.2f}, error rate = {nb_error_rate:.2f}")


KeyboardInterrupt: 