In [6]:
# Import necessary libraries for machine learning tasks
from sklearn.linear_model import LogisticRegression  # Logistic Regression model, supports multinomial logistic (softmax) regression
from sklearn.model_selection import train_test_split  # Function to easily split data into training and testing sets
from sklearn.metrics import accuracy_score  # Function to calculate the accuracy of predictions
import pandas as pd  # Pandas library for data manipulation and analysis

In [7]:
# Load the dataset
hsb_df = pd.read_csv('hsbdemo.csv')  # Load data from CSV file into a pandas DataFrame

In [8]:
# Prepare the feature variables ('ses' and 'write') and target variable ('prog')
X = hsb_df[['ses', 'write']]  # Select 'ses' (socio-economic status) and 'write' (writing score) as features
y = hsb_df['prog']  # Select 'prog' (program type) as the target variable

In [9]:
# Data preprocessing
X['ses'].replace(['low', 'middle', 'high'], [0, 1, 2], inplace=True)  # Convert categorical 'ses' values to numerical
y.replace(['general', 'vocation', 'academic'], [0, 1, 2], inplace=True)  # Convert categorical 'prog' values to numerical

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ses'].replace(['low', 'middle', 'high'], [0, 1, 2], inplace=True)  # Convert categorical 'ses' values to numerical


In [10]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)  # 70% training data, 30% testing data

In [11]:
# Model initialization and training
logmodel = LogisticRegression(multi_class='multinomial')  # Initialize the Logistic Regression model for multinomial classification
logmodel.fit(X_train, y_train)  # Fit the model to the training data

In [12]:
# Model evaluation
accuracy = accuracy_score(logmodel.predict(X_test), y_test)  # Predict on the test set and calculate accuracy
print(accuracy)  # Print the accuracy of the model

# This code demonstrates how to implement a Softmax Regression model (a generalization of logistic regression to support multi-class classification)
# using scikit-learn. The model is trained to predict the program type ('prog') of students based on their socio-economic status ('ses') and writing
# score ('write'). After converting categorical variables to numerical, the dataset is split into training and testing sets. The model is then trained
# on the training set and evaluated for its accuracy on the test set, providing insight into its performance in classifying the students into the
# correct program types.

0.65
