## IMPORT LIBRARIES

In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

## IMPORT DATA


In [30]:
# Load the student records from the CSV that sits next to the notebook.
data = pd.read_csv('./students.csv')
# info() prints its column/dtype summary directly, so it can run mid-cell.
data.info()
# head() only returns a frame; a notebook renders it solely when it is the
# cell's last expression, so it has to come last to be visible at all.
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 35 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Nacionality                                     4424 non-null   int64  
 7   Mother's qualification                          4424 non-null   int64  
 8   Father's qualification                          4424 non-null   int64  
 9   Mother's occupation                      

## PREPROCESS DATA

In [31]:
# List the distinct class labels found in the Target column before they are encoded.
print(data['Target'].unique())

['Dropout' 'Graduate' 'Enrolled']


In [32]:
# Encode the class labels as the integer ids used by the softmax model below.
target_codes = {'Dropout': 0, 'Graduate': 1, 'Enrolled': 2}
# Only map while the column still holds raw labels: mapping an already-encoded
# column would look integers up in a string-keyed dict and turn every label
# into NaN, which made this cell unsafe to run twice.
if not pd.api.types.is_integer_dtype(data['Target']):
    data['Target'] = data['Target'].map(target_codes)
# Series.map yields NaN for any label missing from the dict; fail loudly here
# instead of letting NaN labels reach the train/test split.
if data['Target'].isna().any():
    raise ValueError("Target contains labels outside " + str(sorted(target_codes)))

In [33]:
# The label vector is the Target column; the feature frame is every other column.
y = data['Target']
x = data.drop(columns='Target')


In [34]:
# Hold out 30% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both parts, and the fixed random_state makes
# the split reproducible. (train_test_split is imported in the import cell.)
x_train, x_test, y_train, y_test = train_test_split(
    x.values, y, test_size=.30, stratify=y, random_state=1
)

## DEFINE MODEL

In [35]:
def augment_feature_vector(X):
    """Prepend a bias column of ones to a 2-D feature matrix.

    Args:
        X: array-like with n rows and d columns.

    Returns:
        An array with n rows and d + 1 columns whose first column is all ones
        and whose remaining columns are X.
    """
    bias_column = np.ones((len(X), 1))
    return np.concatenate((bias_column, X), axis=1)

In [36]:
def vectorized_compute_probabilities(X, theta):
    """Compute softmax label probabilities for every data point at once.

    Arguments:
        X: (n x d) matrix (numpy array), one data point per row.
        theta: (k x d) matrix, one row of parameters per label.

    Returns:
        H: a (k x n) matrix (numpy array) whose i-th column holds the
        probabilities that the i-th data point takes on each label.
    """
    scores = np.matmul(theta, np.transpose(X))
    # Shift every column by its own maximum so the largest exponent is 0;
    # the shift cancels in the ratio below and keeps np.exp from overflowing.
    shifted_scores = scores - scores.max(axis=0)
    unnormalized = np.exp(shifted_scores)
    # The per-column totals broadcast across the k rows during the division.
    column_totals = unnormalized.sum(axis=0)
    return unnormalized / column_totals

In [37]:
def gradient_descent_iteration(X, Y, theta, alpha, lambda_factor):
    """Run one batch gradient-descent step for regularized softmax regression.

    Args:
        X: (n x d) feature matrix.
        Y: length-n sequence of integer labels, each in [0, k).
        theta: (k x d) parameter matrix.
        alpha: learning rate.
        lambda_factor: weight of the regularization term lambda_factor * theta.

    Returns:
        The updated (k x d) parameter matrix; the theta passed in is not
        modified.
    """
    num_points = len(Y)
    num_labels = theta.shape[0]

    probabilities = vectorized_compute_probabilities(X, theta)

    # Label indicator matrix: entry (j, i) is 1 exactly when Y[i] == j.
    # It is assembled from (row, column) coordinates and then densified.
    label_indicator = sparse.coo_matrix(
        ([1] * num_points, (Y, range(num_points))),
        shape=(num_labels, num_points),
    ).toarray()

    loss_gradient = np.matmul(label_indicator - probabilities, X) * (-1 / num_points)
    regularization_gradient = lambda_factor * theta

    return theta - alpha * (loss_gradient + regularization_gradient)

In [38]:
def predict(X, theta):
    """Predict the most probable label for each data point.

    Args:
        X: (n x d) feature matrix without the bias column; the column of
            ones is added here.
        theta: parameter matrix with one row per label and d + 1 columns.

    Returns:
        A length-n array holding, for each data point, the index of the
        label with the highest probability.
    """
    X = augment_feature_vector(X)
    probabilities = vectorized_compute_probabilities(X, theta)
    # The probability matrix is (k x n): labels run along axis 0, so the
    # argmax over that axis picks one label per data point. (The previous
    # debug print of this shape, mislabelled "axis=1", has been removed so
    # that calling predict no longer writes to stdout.)
    return np.argmax(probabilities, axis=0)

In [39]:
def compute_accuracy(X, Y, theta):
    """Return the fraction of data points whose predicted label equals Y.

    Args:
        X: feature matrix without the bias column (predict adds it).
        Y: true labels, one per row of X.
        theta: parameter matrix used for prediction.
    """
    is_correct = predict(X, theta) == Y
    return np.mean(is_correct)

In [40]:
def softmax_regression(X, Y, alpha, lambda_factor, k, num_iterations):
    """Fit softmax-regression parameters with batch gradient descent.

    Args:
        X: (n x d) feature matrix without the bias column.
        Y: length-n sequence of integer labels.
        alpha: learning rate passed to every descent step.
        lambda_factor: regularization weight passed to every descent step.
        k: number of labels (rows of the parameter matrix).
        num_iterations: how many descent steps to run.

    Returns:
        The (k x (d + 1)) parameter matrix after the final step.
    """
    design_matrix = augment_feature_vector(X)
    num_columns = design_matrix.shape[1]

    # Start from all-zero parameters.
    theta = np.zeros((k, num_columns))
    for _ in range(num_iterations):
        theta = gradient_descent_iteration(design_matrix, Y, theta, alpha, lambda_factor)

    return theta

In [41]:
# Train with the hyper-parameters suggested in the prompt for the original
# project, except that gradient descent runs for 1000 iterations instead of 150.
# NOTE(review): x_train is passed in exactly as loaded, with no feature scaling
# in the cells shown - confirm gradient descent converges at this learning rate.
theta_final = softmax_regression(
    x_train,
    y_train,
    alpha=0.3,
    lambda_factor=1e-4,
    k=3,
    num_iterations=1000,
)

In [42]:
# Fraction of held-out test rows whose predicted label matches the true label;
# as the cell's last expression, the value is shown as the cell output.
compute_accuracy(x_test, y_test, theta_final)

np.argmax(probabilities,axis=1).shape:  (1328,)


0.6430722891566265

In [44]:
# Scratch cell: rebuilds at notebook scope the same intermediates that
# gradient_descent_iteration computes internally, for a zero-initialized theta.
X=augment_feature_vector(x_train)
theta = np.zeros([3, X.shape[1]])
# With theta all zeros every score is 0, so each column of H is uniform (1/3 per label).
H = vectorized_compute_probabilities(X, theta)
# Dense (3 x n) label indicator matrix: entry (j, i) is 1 when y_train's i-th label is j.
M = sparse.coo_matrix(([1]*len(y_train), (y_train, range(len(y_train)))), shape=(3,len(y_train))).toarray()