# I. Algorithm

In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

In [82]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        n_samples, n_features = X.shape
        # Start with zeros for weights and bias
        self.weights = np.zeros(n_features)
        self.bias = 0

        for i in range(self.num_iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)

            # Compute gradients
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)

            # Update weights
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict_probability(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        return self.sigmoid(linear_model)

    def predict(self, X):
        y_probs = self.predict_probability(X)
        return [1 if prob >= 0.5 else 0 for prob in y_probs]

# II. Application to Data Set

In [62]:
#coaster_data = pd.read_csv("../coaster_db.csv")

#coaster_subset = coaster_data[['height_ft','speed_mph','Type']]

#coaster_subset = coaster_subset.dropna()

#wood_coasters = coaster_subset[coaster_subset['Type'].str.contains('Wood', case=False, na=False)]

#steel_coasters = coaster_subset[~coaster_subset['Type'].str.contains('Wood', case=False, na=False)]

coaster_data = pd.read_csv("../coaster_db.csv")

coaster_subset = coaster_data[['height_ft','speed_mph','Inversions_clean','Gforce_clean']]

clean_data = coaster_subset.dropna()

# Creating column for whether coaster has at least one inversion or not

clean_data['Inverted'] = clean_data['Inversions_clean'].apply(lambda x: 1 if x > 0 else 0)

print(f"There are {clean_data.shape[0]} roller coasters we have data for.")
print(f"There are {np.sum(clean_data['Inverted'])} inverted roller coasters.")
print(f"There are {clean_data.shape[0]-np.sum(clean_data['Inverted'])} non-inverted roller coasters.")

There are 70 roller coasters we have data for.
There are 36 inverted roller coasters.
There are 34 non-inverted roller coasters.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_data['Inverted'] = clean_data['Inversions_clean'].apply(lambda x: 1 if x > 0 else 0)


In [89]:
# Creating model and training

X_train = clean_data[['height_ft','speed_mph','Gforce_clean']].to_numpy()
y_train = clean_data['Inverted'].to_numpy()

model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on training data
y_pred = model.predict(X_train)

w = model.weights
b = model.bias

print(w)

np.dot(X_train[0], w)


#print(X_train)

#print(X_train.mean(), X_train.std())

#print(y_train)




[-0.30526471 -0.33354276  1.24211777]


-40.226354570198495