In [None]:
import pandas as pd

# Locations of the preprocessed training and test CSV files
train_data_path = './processed_train.csv'
test_data_path = './processed_test.csv'

# Read both files into DataFrames in one pass over the two paths
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

# Preview the leading rows of each frame as a quick sanity check
train_df.head(), test_df.head()

In [None]:
# Per-column count of missing entries in each dataset
missing_values_train = train_df.isna().sum()
missing_values_test = test_df.isna().sum()

# Per-column dtypes of each dataset
data_types_train, data_types_test = train_df.dtypes, test_df.dtypes

# Show all four summaries together as the cell output
missing_values_train, missing_values_test, data_types_train, data_types_test

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation between the columns of the training data
corr_matrix = train_df.corr()

# Draw the annotated heatmap on an explicitly created figure/axes pair
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix of Features")
plt.show()

# Rank every column by its correlation with the target column 'Transported'
corr_with_target = corr_matrix.loc[:, 'Transported'].sort_values(ascending=False)
corr_with_target

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Splitting the training data into features and target variable
X_train = train_df.drop('Transported', axis=1)
y_train = train_df['Transported']

# The test set has no labels, so hold out 30% of the labelled data for validation.
# The split is made BEFORE any model is fitted: fitting on the full X_train and then
# scoring on a subset of it would let the models see the validation rows during
# training and inflate every reported metric.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42
)

# Logistic Regression Model, fitted on the training split only
log_reg = LogisticRegression()
log_reg.fit(X_train_split, y_train_split)

# Decision Tree Classifier Model, fitted on the training split only
# (seeded so that repeated runs of the cell give the same tree)
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(X_train_split, y_train_split)

# Evaluate on the held-out validation set
log_reg_val_pred = log_reg.predict(X_val_split)
dec_tree_val_pred = dec_tree.predict(X_val_split)

# Evaluating the performance
log_reg_accuracy = accuracy_score(y_val_split, log_reg_val_pred)
dec_tree_accuracy = accuracy_score(y_val_split, dec_tree_val_pred)

log_reg_report = classification_report(y_val_split, log_reg_val_pred)
dec_tree_report = classification_report(y_val_split, dec_tree_val_pred)

print("Logistic Regression Model Evaluation:")
print("--------------------------------------")
print(f"Accuracy: {log_reg_accuracy:.2f}")
print("Classification Report:")
print(log_reg_report)

print("\nDecision Tree Classifier Model Evaluation:")
print("-------------------------------------------")
print(f"Accuracy: {dec_tree_accuracy:.2f}")
print("Classification Report:")
print(dec_tree_report)

# With evaluation done, refit both models on ALL labelled rows so the test-set
# predictions use every available training example.
log_reg.fit(X_train, y_train)
dec_tree.fit(X_train, y_train)

# Preparing the test data (excluding the target variable)
X_test = test_df.drop('Transported', axis=1, errors='ignore')  # 'errors=ignore' in case 'Transported' is not in test data

# Predictions for the unlabelled test set
log_reg_pred = log_reg.predict(X_test)
dec_tree_pred = dec_tree.predict(X_test)

In [None]:
# Selecting features with higher correlation
selected_features = ['CryoSleep', 'RoomService', 'Spa', 'VRDeck']

# Creating new feature sets based on selected features.
# X_train_selected (all labelled rows) is kept for any later use, but the models
# below are fitted on the training split only: X_val_split is a subset of X_train,
# so fitting on X_train_selected would leak the validation rows into training.
X_train_selected = X_train[selected_features]
X_train_split_selected = X_train_split[selected_features]
X_val_split_selected = X_val_split[selected_features]

# Logistic Regression Model with selected features
log_reg_selected = LogisticRegression()
log_reg_selected.fit(X_train_split_selected, y_train_split)

# Decision Tree Classifier Model with selected features
# (seeded so that repeated runs of the cell give the same tree)
dec_tree_selected = DecisionTreeClassifier(random_state=42)
dec_tree_selected.fit(X_train_split_selected, y_train_split)

# Evaluating the performance on the held-out validation set with selected features
log_reg_val_pred_selected = log_reg_selected.predict(X_val_split_selected)
dec_tree_val_pred_selected = dec_tree_selected.predict(X_val_split_selected)

# Calculating accuracy
log_reg_accuracy_selected = accuracy_score(y_val_split, log_reg_val_pred_selected)
dec_tree_accuracy_selected = accuracy_score(y_val_split, dec_tree_val_pred_selected)

log_reg_accuracy_selected, dec_tree_accuracy_selected

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Creating models for different classification methods.
# Estimators that accept a random_state are seeded so the comparison is reproducible.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gaussian Naive Bayes": GaussianNB()
}

# Dictionary to store the accuracy of each model
accuracy_scores = {}

# Splitting the training data into features and target variable
X_train = train_df.drop('Transported', axis=1)
y_train = train_df['Transported']

# Since we don't have the true labels for the test set, we'll evaluate on a split of the training set
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

# Training and evaluating each model
for name, model in models.items():
    # Fit on the training split only. X_val_split is a subset of X_train, so fitting
    # on the full X_train would leak the validation rows and inflate the accuracy.
    model.fit(X_train_split, y_train_split)
    val_pred = model.predict(X_val_split)
    accuracy = accuracy_score(y_val_split, val_pred)
    accuracy_scores[name] = accuracy
    print(f"{name} Model Evaluation:")
    print(f"Accuracy: {accuracy:.2f}")
    print("-------------------------------------")

# Comparing the results
accuracy_scores

In [None]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score
import seaborn as sns
import numpy as np

# Hand-written logistic regression
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    NOTE: this class has the same name as ``sklearn.linear_model.LogisticRegression``,
    which is imported earlier in this notebook. Once this definition has run, the
    name refers to this class; re-run the scikit-learn import before re-running
    cells that expect the scikit-learn estimator.

    Parameters
    ----------
    learning_rate : float
        Step size of each gradient-descent update.
    n_iterations : int
        Number of gradient-descent steps performed by ``fit``.
    l2_reg : float
        Strength of the L2 penalty applied to the weights (the bias is not penalised).

    Attributes set by ``fit``
    -------------------------
    weights, bias : learned parameters.
    losses : loss value recorded at every iteration.
    accuracies : validation accuracy per iteration (only when validation data is given).
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, l2_reg=0):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.l2_reg = l2_reg  # L2 regularization term
        self.weights = None
        self.bias = None
        self.losses = []  # To record the loss during training
        self.accuracies = []  # To record the accuracy during training

    def _sigmoid(self, z):
        """Logistic function, evaluated on clipped logits so ``np.exp`` cannot overflow."""
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def _compute_loss(self, y, predictions):
        """Mean binary cross-entropy of ``predictions`` against ``y`` plus the L2 penalty."""
        # Keep probabilities strictly inside (0, 1): a saturated sigmoid would otherwise
        # produce log(0) = -inf and turn the recorded loss into inf/NaN.
        eps = 1e-15
        predictions = np.clip(predictions, eps, 1 - eps)
        # Regularization term
        reg_term = (self.l2_reg / 2) * np.sum(np.square(self.weights))
        return -np.mean(y * np.log(predictions) + (1 - y) * np.log(1 - predictions)) + reg_term

    def fit(self, X, y, X_val=None, y_val=None):
        """Learn weights and bias by gradient descent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,), containing 0/1 (or boolean) labels
        X_val, y_val : optional validation data; when both are given, the validation
            accuracy after every update is appended to ``self.accuracies``.
        """
        # Coerce to float so object/bool arrays (e.g. from DataFrame.values) work too
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        # Start every fit with fresh histories so repeated calls do not accumulate
        self.losses = []
        self.accuracies = []

        # Gradient Descent
        for _ in range(self.n_iterations):
            model = np.dot(X, self.weights) + self.bias
            predictions = self._sigmoid(model)

            # Compute loss (recorded before the parameter update of this iteration)
            loss = self._compute_loss(y, predictions)
            self.losses.append(loss)

            # Compute gradients; the L2 term only affects the weights
            dw = (1 / n_samples) * np.dot(X.T, (predictions - y)) + (self.l2_reg * self.weights)
            db = (1 / n_samples) * np.sum(predictions - y)

            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            # Validation accuracy
            if X_val is not None and y_val is not None:
                val_pred = self.predict(X_val)
                val_accuracy = accuracy_score(y_val, val_pred)
                self.accuracies.append(val_accuracy)

    def predict_proba(self, X):
        """Return the predicted probability of the positive class for each row of ``X``."""
        X = np.asarray(X, dtype=float)
        model = np.dot(X, self.weights) + self.bias
        predictions = self._sigmoid(model)
        return predictions

    def predict(self, X):
        """Return 0/1 labels: 1 where the predicted probability is strictly above 0.5."""
        probabilities = self.predict_proba(X)
        return (probabilities > 0.5).astype(int)

    def plot_losses(self):
        """Plot the loss recorded at each training iteration."""
        plt.plot(self.losses, label="Loss")
        plt.title("Loss during Training")
        plt.xlabel("Iteration")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()

    def plot_accuracies(self):
        """Plot the validation accuracy recorded at each training iteration."""
        plt.plot(self.accuracies, label="Accuracy")
        plt.title("Accuracy during Training")
        plt.xlabel("Iteration")
        plt.ylabel("Accuracy")
        plt.legend()
        plt.show()

    def plot_probability_distribution(self, X, y):
        """Plot a histogram of the predicted probabilities for ``X``.

        ``y`` is accepted for signature symmetry with the other plot helpers but is not used.
        """
        probabilities = self.predict_proba(X)
        plt.hist(probabilities, bins=10, alpha=0.7, label='Predicted Probabilities')
        plt.title("Distribution of Predicted Probabilities")
        plt.xlabel("Probability")
        plt.ylabel("Frequency")
        plt.legend()
        plt.show()

    def plot_confusion_matrix(self, X, y):
        """Plot the confusion matrix of the predictions for ``X`` against the true labels ``y``."""
        predictions = self.predict(X)
        cm = confusion_matrix(y, predictions)
        sns.heatmap(cm, annot=True, fmt='g', cmap='Blues')
        plt.title("Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.show()

# Instantiate the logistic regression model with an L2 penalty of 0.1
lr_model = LogisticRegression(learning_rate=0.01, n_iterations=1000, l2_reg=0.1)

# Extract the underlying arrays of the train/validation splits once and reuse them below
train_features, train_labels = X_train_split.values, y_train_split.values
val_features, val_labels = X_val_split.values, y_val_split.values

# Fit the model; passing the validation arrays makes it record accuracy per iteration
lr_model.fit(train_features, train_labels, val_features, val_labels)

# Training curves: loss and validation accuracy per iteration
lr_model.plot_losses()
lr_model.plot_accuracies()

# Diagnostics on the validation set: probability histogram and confusion matrix
lr_model.plot_probability_distribution(val_features, val_labels)
lr_model.plot_confusion_matrix(val_features, val_labels)

# Final validation accuracy of the fitted model
lr_predictions = lr_model.predict(val_features)
lr_accuracy = accuracy_score(y_val_split, lr_predictions)
lr_accuracy

In [None]:
# Hand-written decision tree. Running this cell may take 3-5 minutes.
class Question:
    """A Question is used to partition a dataset.

    It records a column index (e.g., 0 for the first column) and a threshold/category
    value (e.g., Green). ``match`` compares the feature value of an example against the
    stored value: numeric features are tested with ``>=``, everything else with ``==``.
    """

    # Types treated as numeric. NumPy integer/floating scalars are listed explicitly
    # because e.g. ``np.int64`` is not a subclass of the built-in ``int``; without them
    # rows taken from an integer-typed array would silently fall back to ``==`` matching.
    _NUMERIC_TYPES = (int, float, np.integer, np.floating)

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        """Return whether ``example`` satisfies this question.

        ``example`` is any indexable row; only ``example[self.column]`` is read.
        """
        val = example[self.column]
        if isinstance(val, self._NUMERIC_TYPES):
            return val >= self.value
        return val == self.value

    def __repr__(self):
        """Readable form of the question, e.g. ``Is 2 >= 3.5?``."""
        condition = ">=" if isinstance(self.value, self._NUMERIC_TYPES) else "=="
        return f"Is {self.column} {condition} {str(self.value)}?"

class DecisionNode:
    """Internal tree node: stores the question asked here and the two subtrees.

    ``true_branch`` is followed by rows that match the question, ``false_branch``
    by rows that do not.
    """

    def __init__(self, question, true_branch, false_branch):
        self.question, self.true_branch, self.false_branch = (
            question,
            true_branch,
            false_branch,
        )

class Leaf:
    """Terminal tree node.

    ``predictions`` maps each class label (e.g., "Apple") to the number of training
    rows with that label that reached this leaf.
    """

    def __init__(self, rows):
        self.predictions = self.class_counts(rows)

    @staticmethod
    def class_counts(rows):
        """Return a dict of label -> number of rows carrying that label.

        The label of a row is read from its last element.
        """
        tally = {}
        for record in rows:
            # in our dataset format, the label is always the last column
            key = record[-1]
            tally[key] = tally.get(key, 0) + 1
        return tally

class DecisionTreeClassifierFromScratch:
    """Binary decision tree classifier built greedily on Gini impurity.

    Rows are sequences whose last element is the class label. The tree is grown
    recursively until ``max_depth`` is reached or no split reduces the impurity.

    Parameters
    ----------
    max_depth : int
        Maximum number of questions on any path from the root to a leaf.
    """

    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        self.root = None  # set by fit()

    @staticmethod
    def gini(rows):
        """Calculate the Gini impurity (1 - sum of squared class proportions) of ``rows``."""
        counts = Leaf.class_counts(rows)
        impurity = 1
        for lbl in counts:
            prob_of_lbl = counts[lbl] / float(len(rows))
            impurity -= prob_of_lbl**2
        return impurity

    @staticmethod
    def info_gain(left, right, current_uncertainty):
        """Impurity reduction of a split.

        Returns the uncertainty of the parent node minus the size-weighted Gini
        impurity of the two child row sets.
        """
        p = float(len(left)) / (len(left) + len(right))
        return current_uncertainty - p * DecisionTreeClassifierFromScratch.gini(left) - (1 - p) * DecisionTreeClassifierFromScratch.gini(right)

    def find_best_split(self, rows):
        """Find the best question to ask by trying every feature / value pair.

        Returns ``(best_gain, best_question)``; ``best_question`` is ``None`` when no
        candidate splits the rows into two non-empty groups.
        """
        best_gain = 0  # keep track of the best information gain
        best_question = None  # keep track of the feature / value that produced it
        current_uncertainty = self.gini(rows)
        n_features = len(rows[0]) - 1  # number of feature columns (last column is the label)

        for col in range(n_features):  # for each feature
            values = {row[col] for row in rows}  # unique values in the column

            for val in values:  # for each value
                question = Question(col, val)

                # try splitting the dataset
                true_rows, false_rows = self.partition(rows, question)

                # Skip this split if it doesn't divide the dataset.
                if len(true_rows) == 0 or len(false_rows) == 0:
                    continue

                # Calculate the information gain from this split
                gain = self.info_gain(true_rows, false_rows, current_uncertainty)

                # '>=' means a later candidate replaces an earlier one on ties
                if gain >= best_gain:
                    best_gain, best_question = gain, question

        return best_gain, best_question

    @staticmethod
    def partition(rows, question):
        """Split ``rows`` into ``(true_rows, false_rows)`` according to ``question.match``."""
        true_rows, false_rows = [], []
        for row in rows:
            if question.match(row):
                true_rows.append(row)
            else:
                false_rows.append(row)
        return true_rows, false_rows

    def build_tree(self, rows, depth=0):
        """Recursively build the (sub)tree for ``rows`` and return its root node."""
        # Checking if the depth limit is reached
        if depth >= self.max_depth:
            return Leaf(rows)

        gain, question = self.find_best_split(rows)

        # Base case: no further info gain
        if gain == 0:
            return Leaf(rows)

        true_rows, false_rows = self.partition(rows, question)

        # Recursively build the true branch
        true_branch = self.build_tree(true_rows, depth + 1)

        # Recursively build the false branch
        false_branch = self.build_tree(false_rows, depth + 1)

        # Return a Question node
        return DecisionNode(question, true_branch, false_branch)

    def fit(self, X, y):
        """Fit the tree to features ``X`` and labels ``y``.

        Raises
        ------
        ValueError
            If the training data contains no rows.
        """
        # Append the labels as the last column so each row carries its own label
        rows = np.c_[X, y]
        if len(rows) == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set.")
        self.root = self.build_tree(rows)

    def predict_row(self, node, row):
        """Predict the label for a single row by walking the tree from ``node``."""
        if isinstance(node, Leaf):
            # Majority label among the training rows that reached this leaf
            return max(node.predictions, key=node.predictions.get)

        if node.question.match(row):
            return self.predict_row(node.true_branch, row)
        else:
            return self.predict_row(node.false_branch, row)

    def predict(self, X):
        """Return a list with the predicted label for each row in ``X``."""
        return [self.predict_row(self.root, row) for row in X]

# Build the from-scratch decision tree with a maximum depth of 5
dtc_model = DecisionTreeClassifierFromScratch(max_depth=5)

# Fit on the underlying arrays of the training split
train_features, train_labels = X_train_split.values, y_train_split.values
dtc_model.fit(train_features, train_labels)

# Predict a label for every row of the validation split
dtc_predictions = dtc_model.predict(X_val_split.values)

# Share of validation rows whose predicted label matches the true label
dtc_accuracy = accuracy_score(y_val_split, dtc_predictions)
dtc_accuracy