In [36]:
import pandas as pd
import numpy as np
import networkx as nx

In [37]:
# Load the per-student, per-semester state table.
state_df = pd.read_csv('../data/states.csv')

# Drop every row whose Year_Semester mentions "Summer".
#   regex=False - match the literal substring, not a pattern
#   na=False    - a missing Year_Semester counts as "not Summer", so the mask
#                 stays strictly boolean and can be negated with ~
#   .copy()     - keep an independent frame; the later cells assign new/updated
#                 columns on state_df, which would otherwise raise
#                 SettingWithCopyWarning because the frame is a filtered slice
state_df = state_df[
    ~state_df['Year_Semester'].str.contains('Summer', regex=False, na=False)
].copy()

state_df

Unnamed: 0,id,Semester_Num,CSCI128,CSCI200,CSCI220,CSCI274,CSCI306,CSCI341,CSCI358,CSCI370,CSCI400,CSCI406,CSCI442,MATH111,MATH112,MATH213,MATH332,Year_Semester
0,3,10094,0,0,1,1,1,1,1,1,1,3,3,0,1,1,1,Fall 2018
1,3,10095,-1,-1,0,0,0,1,1,1,1,2,1,0,1,1,1,Spring 2019
3,3,10099,-1,-1,0,0,-1,0,1,-1,-1,2,0,0,1,1,0,Fall 2019
4,3,10100,-1,-1,0,0,-1,0,0,-1,-2,0,-1,0,1,1,0,Spring 2020
6,3,10104,-1,-1,0,0,-1,0,0,-1,-2,0,-1,0,1,1,0,Fall 2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84834,2186,10120,0,0,1,1,1,1,1,1,1,3,3,0,1,1,1,Spring 2024
84836,6481,10114,0,1,1,1,1,1,1,1,2,3,3,0,1,1,1,Fall 2022
84837,6481,10115,0,1,1,1,1,1,1,1,2,3,3,0,1,1,1,Spring 2023
84839,6481,10119,0,1,1,1,1,1,1,1,2,3,3,0,1,1,1,Fall 2023


In [38]:
# Show the column labels of the filtered frame.
print(state_df.columns)

Index(['id', 'Semester_Num', 'CSCI128', 'CSCI200', 'CSCI220', 'CSCI274',
       'CSCI306', 'CSCI341', 'CSCI358', 'CSCI370', 'CSCI400', 'CSCI406',
       'CSCI442', 'MATH111', 'MATH112', 'MATH213', 'MATH332', 'Year_Semester'],
      dtype='object')


In [39]:
# Course columns that together make up a student's state.
classes = [
    'CSCI128', 'CSCI200', 'CSCI220', 'CSCI274', 'CSCI306',
    'CSCI341', 'CSCI358', 'CSCI370', 'CSCI400', 'CSCI406',
    'CSCI442', 'MATH111', 'MATH112', 'MATH213', 'MATH332',
]

# NOTE: this changes eligible = 1, taken = 0
# Shift every course code up by one in a single vectorised assignment.
# The columns are read and written in place, so re-running this cell shifts
# the codes again - run it exactly once per fresh load of state_df.
state_df[classes] = state_df[classes] + 1

In [40]:
# Build one string per row by concatenating the course codes in `classes` order.
# Codes are joined without a separator, so a negative code contributes two
# characters ("-1") and signatures are not fixed-width.
state_df['signature'] = [
    ''.join(map(str, course_codes))
    for course_codes in state_df[classes].to_numpy()
]

In [41]:
# Display the frame (notebook rich repr) to inspect the shifted codes and signatures.
state_df

Unnamed: 0,id,Semester_Num,CSCI128,CSCI200,CSCI220,CSCI274,CSCI306,CSCI341,CSCI358,CSCI370,CSCI400,CSCI406,CSCI442,MATH111,MATH112,MATH213,MATH332,Year_Semester,signature
0,3,10094,1,1,2,2,2,2,2,2,2,4,4,1,2,2,2,Fall 2018,112222222441222
1,3,10095,0,0,1,1,1,2,2,2,2,3,2,1,2,2,2,Spring 2019,001112222321222
3,3,10099,0,0,1,1,0,1,2,0,0,3,1,1,2,2,1,Fall 2019,001101200311221
4,3,10100,0,0,1,1,0,1,1,0,-1,1,0,1,2,2,1,Spring 2020,00110110-1101221
6,3,10104,0,0,1,1,0,1,1,0,-1,1,0,1,2,2,1,Fall 2020,00110110-1101221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84834,2186,10120,1,1,2,2,2,2,2,2,2,4,4,1,2,2,2,Spring 2024,112222222441222
84836,6481,10114,1,2,2,2,2,2,2,2,3,4,4,1,2,2,2,Fall 2022,122222223441222
84837,6481,10115,1,2,2,2,2,2,2,2,3,4,4,1,2,2,2,Spring 2023,122222223441222
84839,6481,10119,1,2,2,2,2,2,2,2,3,4,4,1,2,2,2,Fall 2023,122222223441222


In [42]:
# get list of all signatures (pandas returns them in order of first appearance)
signatures = state_df['signature'].unique()

# create graph using networkx with the signatures as nodes
G = nx.DiGraph()
G.add_nodes_from(signatures)

# Seed every ordered pair of signatures (self-loops included) with a
# zero-weight edge so transition counts can later be incremented in place.
# Iterating over all ordered pairs already yields both (a, b) and (b, a), so
# no separate reverse insertion is needed.
# NOTE: this is a dense graph - len(signatures) ** 2 edges.
G.add_edges_from(
    ((s1, s2) for s1 in signatures for s2 in signatures),
    weight=0,
)

In [44]:
# Count observed transitions between consecutive recorded semesters per student.
# A single groupby pass replaces re-filtering the whole frame once per student,
# and the loop variable no longer shadows the builtin `id`.
for _, student_rows in state_df.groupby('id', sort=False):
    # Order the student's history explicitly instead of relying on file order.
    # Semester_Num increases with Year_Semester in the displayed rows; the
    # stable sort keeps the original order for any ties.
    sigs = (
        student_rows
        .sort_values('Semester_Num', kind='stable')['signature']
        .tolist()
    )

    # add 1 to weight of edge between each pair of consecutive signatures
    # (a student with a single row contributes nothing)
    for src, dst in zip(sigs, sigs[1:]):
        G[src][dst]['weight'] += 1
    

In [45]:
# based on graph, create Markov chain matrix
# nodelist pins the row/column order to `signatures`; the state vectors are
# reindexed by `signatures` as well, so the alignment is explicit rather than
# depending on the graph's node insertion order.
weighted_adj_matrix = nx.to_numpy_array(G, nodelist=list(signatures), weight='weight')

# Normalize each row so that it sums to 1 (entry [i, j] = share of observed
# transitions leaving state i that went to state j).
row_sums = weighted_adj_matrix.sum(axis=1, keepdims=True)
# Avoid division by zero for rows that sum to zero. Those rows stay all-zero
# after the division, so any count placed in such a state is dropped when the
# chain is stepped.
row_sums[row_sums == 0] = 1
normalized_adj_matrix = weighted_adj_matrix / row_sums

In [46]:
# Number of rows in the transition matrix (one per node of G).
print(len(normalized_adj_matrix))

1992


# Calculating State Changes

In [50]:
# Count how many rows fall under each signature in Spring 2023.
spring23_df = (
    state_df.loc[state_df['Year_Semester'].eq('Spring 2023')]
    .groupby('signature')
    .size()
    .reset_index(name='count')
)

# Line the counts up with `signatures`, the ordering used for the Markov
# matrix; signatures not seen in this semester come back as NaN.
df_ordered = (
    spring23_df
    .set_index('signature')
    .reindex(signatures)
    .reset_index()
)

# State vector: one count per signature, with unseen signatures as 0.
state_vector = df_ordered['count'].fillna(0).to_numpy()

[32.  0.  0. ... 11.  2.  0.]
['112222222441222' '001112222321222' '001101200311221' ...
 '112222222341222' '112222222321222' '122222223431222']


Unnamed: 0,signature,count
0,112222222441222,32.0
1,001112222321222,
2,001101200311221,
3,00110110-1101221,1.0
4,10000001-1110000,1.0
...,...,...
1987,122222222441222,1.0
1988,122222222331222,1.0
1989,112222222341222,11.0
1990,112222222321222,2.0


In [54]:
# Run Markov chain for 1 step: (row vector of Spring 2023 counts) x (transition matrix)
predicted_fall23 = state_vector @ normalized_adj_matrix

# compare to ground truth
fall23 = state_df[state_df['Year_Semester'] == 'Fall 2023']

# reduce fall23 to include counts of signatures
fall23_df = fall23.groupby('signature').size().reset_index(name='count')

# Ensure the DataFrame is ordered to match the Markov matrix
fall23_df_ordered = fall23_df.set_index('signature').reindex(signatures).reset_index()
true_fall23 = fall23_df_ordered['count'].fillna(0).to_numpy()

# calculate error between prediction and truth (Euclidean norm over all signatures)
error = np.linalg.norm(predicted_fall23 - true_fall23)

# Totals are taken with ndarray.sum() rather than the builtin sum(), which
# would iterate the array element by element in Python.
# NOTE(review): the two totals need not match - all-zero rows of the
# transition matrix drop mass, and the prediction is built only from the
# Spring 2023 counts. Confirm whether that is intended.
print(error)
print(predicted_fall23.sum())
print(true_fall23.sum())

456.2979761148927
6414.999999999993
7332.0


States need to be named by class for interpretability. 

Here's what it needs to be formatted like: 
- Each state is still a signature of the classes
- Trinary indicators: Taken, Eligible, Non-eligible
- When a signature is predicted, we can compare the predicted and ground truth like so: 
    - The number of students who took a class is found by comparing (# marked as taken for a class in the past vector) vs. (# marked as taken for that class in the predicted vector). The change in the number of students who have taken a class is the enrollment.
    - The true amount can be calculated as the difference between past and truth, while the predicted amount can be calculated as the difference between past and predicted.

- Those numbers can be calculated by looking at the number of signatures in which a class is marked as taken. We know that the Markov chain cannot "untake" a class because no such state transitions exist. Thus, any increase in the number taken will be representative of the enrollment.