In [19]:
# Data handling and processing
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


# Sklearn imports
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import RFE
from sklearn import metrics

# Model imports
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# File mgmt
import os

os.chdir('Data')
!ls

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Data'

In [4]:
path = 'Transactions_Village114/ds3170_tx_All_Data_5208_2019_1211_082948.txt'
df = pd.read_csv(path, sep="\t")
cols = ['Anon Student Id', 'Session Id', 'Duration (sec)', 'Level (Tutor Name)', 'Level (Tutor)', 'Problem Name', 'Problem View','Step Name', 'Attempt At Step','Is Last Attempt','Outcome', 'Input','CF (File)','CF (Matrix)','CF (Matrix Level)', 'CF (Matrix Order)', 'CF (Total Activity Problems)']
df = df[cols]
bubble_pop_df = df[df['Level (Tutor Name)'] == 'bubble_pop']
bpop_math_df = bubble_pop_df[bubble_pop_df['CF (Matrix)'] == 'math'] #6172

# X and y -> inputs and outputs for the classification model
cols = ['Duration (sec)', 'Level (Tutor)', 'Attempt At Step', 'CF (Matrix Level)', 'CF (Matrix Order)', 'Outcome']
X = bpop_math_df[cols]
y = X[['Outcome']]
X = X.drop(['Outcome'], axis=1)

  interactivity=interactivity, compiler=compiler, result=result)


## Helpful functions

In [9]:
# Clean duration (sec) columns: Convert all values to float, if not able to convert set value in that row as duration_mean
def clean_duration_col_with_mean(X):
    strings = 0
    ints = 0
    floats = 0
    other = 0
    cnt = 0
    sums = 0
    posn = []
    X_duration_arr = X['Duration (sec)'].array

    for i in range(len(X_duration_arr)):
        if isinstance(X_duration_arr[i], int):
            ints += 1
        elif isinstance(X_duration_arr[i], str):
            strings += 1
            try:
                X_duration_arr[i] = float(X_duration_arr[i])
            except ValueError:
#                 print("Cant convert to a float", X_duration_arr[i], "Position: ",i)
                posn.append(i)
        elif isinstance(X_duration_arr[i], float):
            floats += 1
    
        if isinstance(X_duration_arr[i], float):
            sums += X_duration_arr[i]
            cnt += 1

    duration_mean = sums/cnt
    for i in range(len(posn)):
        X_duration_arr[posn[i]] = duration_mean

    for i in range(len(X_duration_arr)):
        col = bpop_math_df.columns.get_loc('Duration (sec)')
        val = X_duration_arr[i]
        X.iloc[i, col] = val
    
    return X

# Ordinal encodes a column
def ordinal_encode_col(X, colname):
    enc = OrdinalEncoder()
    level_tutor = X[colname].tolist()
    tutor_level_2dlist = []
    for i in range(len(level_tutor)):
        tutor_level_2dlist.append([level_tutor[i]])

    enc.fit(tutor_level_2dlist)

    col = X.columns.get_loc(colname)
    num_entries = X.count()[colname]
    for i in range(num_entries):
        val = X.iloc[i, col]
        val = enc.transform([[val]])[0][0]
        X.iloc[i, col] = val
    
    return X

def rfe(X_train, y_train, apply_rfe):
    if apply_rfe == False:
        return
    data_final_vars=X_train.columns.values.tolist()
    y_=['y']
    X=[i for i in data_final_vars if i not in y_]
    logreg = LogisticRegression()
    rfe = RFE(logreg, 20)
    rfe = rfe.fit(X_train, y_train)
#     print("RFE Support: ",rfe.support_)
#     print("RFE Ranking: ", rfe.ranking_)

# incorrect -> 0, correct -> 1
# set_zero = "INCORRECT" means incorrect will be set to 0 and other value as 1
def encode_outputs(y, colname, set_zero):
    col = y.columns.get_loc(colname)
    for i in range(y.count()[colname]): 
        val = y.iloc[i, col]
        if val == set_zero:
            val = 0
        else: 
            val = 1
        y.iloc[i, col] = val
    
    return y

def encode_categorical_cols(X, encoder, col):

#   Ordinally encode categorical columns
    if encoder == "ordinal":
        for i in range(len(col)):
            X = ordinal_encode_col(X, col[i])
    
    return X

def normalize_data(X, norm):
    if norm == 'l1':
        X = sklearn.preprocessing.normalize(X, norm='l1', axis=1, copy=True, return_norm=False)
        X = pd.DataFrame(data=X, columns=cols[:len(cols)-1])
    
    elif norm == 'l2':
        X = sklearn.preprocessing.normalize(X, norm='l2', axis=1, copy=True, return_norm=False)
        X = pd.DataFrame(data=X, columns=cols[:len(cols)-1])
    
    return X


def run_logistic_regression(X, y, test_split, apply_rfe):
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split)
    rfe(X_train, y_train, apply_rfe)
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    performance = 100 * logreg.score(X_test, y_test)
#     print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(performance))
    return performance
    
def run_k_neighbors_classifier(X, y, test_split, apply_rfe, neighbors):
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split)
    rfe(X_train, y_train, apply_rfe)       
    neigh = KNeighborsClassifier(n_neighbors=neighbors)
    neigh.fit(X_train, y_train)
    performance = 100 * neigh.score(X_test, y_test)
#     print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(performance))
    return performance

def run_svm(X, y, test_split, apply_rfe, svc_kernel):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split)
    rfe(X_train, y_train, apply_rfe)   
    svm_model = svm.SVC(kernel=svc_kernel)
    print("CREATE MODEL")
    svm_model.fit(X_train, y_train)
    print("FIT")
    #Predict the response for test dataset
    y_pred = svm_model.predict(X_test)
    perf = 100 * metrics.accuracy_score(y_test, y_pred)
    print("Accuracy:",perf)
    return perf

# def run_nn(X, y test_split):
    
    
    
def run_algo(algo, X, y, clean_method, encoder, norm, test_split, apply_rfe, categorical_cols, neighbors, svc_kernel):
    
    if clean_method == "mean":
        X = clean_duration_col_with_mean(X)
    
    X = encode_categorical_cols(X, encoder, categorical_cols)
    X = normalize_data(X, norm)
    y = encode_outputs(y, 'Outcome', 'INCORRECT').values.ravel()
    perf = 0
    if algo == "logistic_regression":
        perf = run_logistic_regression(X, y, test_split, apply_rfe)
    
    elif algo == "k_neighbors_classifier":
        perf = run_k_neighbors_classifier(X, y, test_split, apply_rfe, neighbors)
    
    elif algo == "svm":
        perf = run_svm(X, y, test_split, apply_rfe, svc_kernel)
        
    elif algo == "nn":
        perf = run_nn(X, y, test_split)
      
    return perf
    

In [None]:
# mean median or mode (#FIXME: only supports mean for now)
clean_method = "mean" 

# ordinal, onehot etc.. (#FIXME: only supports ordinal encoding for now)
encoder = "ordinal" 

# cols that are categories rather than numbers and have to be encoded
categorical_cols = ['Level (Tutor)']

# none, l1, l2 etc..
norm = "none" 

# set True to apply Recursive Feature Elimination
apply_rfe = True

test_split = 0.2
num_runs = 10

# k neighbors classifier
neighbors = 5

# linear, poly, rbf, sigmoid
svc_kernel = 'linear'

# Supports logistic_regression, k_neighbors_classifier, svm, nn
algo = "nn"

# used to store performance of each run in num_runs
total_perfs = []

import time
start = time.time()

for i in range(num_runs):
    X = bpop_math_df[cols]
    X = X.sample(frac=1).reset_index(drop=True)
    y = X[['Outcome']]
    X = X.drop(['Outcome'], axis=1)
    
    perf = run_algo(algo, X, y, clean_method, encoder, norm, test_split, apply_rfe, categorical_cols, neighbors, svc_kernel)
    total_perfs.append(perf)

end = time.time()
print(end - start)

avg_perf = sum(total_perfs)/num_runs
print("Avg. performance over ", num_runs, "runs: ", avg_perf)

CREATE MODEL


In [None]:
print(avg_perf, "%(", min(total_perfs), "-", max(total_perfs) ,")")

In [1]:
import torch 
import torch.nn as nn

NameError: name 'X' is not defined

In [78]:
# output = model(X_tensor)
# type(X_tensor[0][0].item())
# a = torch.rand((1, 5))
# output = model(a)
# a.size()
# X_tensor.size()
X_tensor = X_tensor.view(-1, 5)
output = model(X_tensor)

RuntimeError: Expected object of scalar type Float but got scalar type Double for argument #2 'mat1' in call to _th_addmm

In [12]:
X = clean_duration_col_with_mean(X)
X = encode_categorical_cols(X, 'ordinal', ['Level (Tutor)'])
y = encode_outputs(y, 'Outcome', 'INCORRECT').values.ravel()

In [14]:
batch_size = 60
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train_loader = torch.utils.data.DataLoader(dataset=X_train, batch_size=batch_size, shuffle=True)
X_test_loader = torch.utils.data.DataLoader(dataset=X_test, batch_size=batch_size, shuffle=True)

y_train_loader = torch.utils.data.DataLoader(dataset=y_train, batch_size=batch_size, shuffle=True)
y_test_loader = torch.utils.data.DataLoader(dataset=y_test, batch_size=batch_size, shuffle=True)

y_train_loader

<torch.utils.data.dataloader.DataLoader at 0x219a2ac54a8>

In [17]:
X_train['Outcome'] = y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
y_train

array([0, 1, 0, ..., 0, 1, 1], dtype=int64)

In [18]:
trainset = X_train
testset = X_test

X_train['Outcome'] = y_train
X_yest['Outcome'] = y_test

Unnamed: 0,Duration (sec),Level (Tutor),Attempt At Step,CF (Matrix Level),CF (Matrix Order),Outcome
12355,6,29,6.0,9,291,0
41803,7,12,7.0,4,104,1
123820,5,10,5.0,4,107,0
142498,8,80,8.0,2,74,1
82911,5,37,5.0,4,109,0
...,...,...,...,...,...,...
82309,7,36,7.0,4,110,1
56273,7,36,7.0,4,110,1
12018,12,44,12.0,9,287,0
42336,16,13,16.0,4,103,1


In [51]:
X['Duration (sec)'] = X['Duration (sec)'].astype(float)
X['Level (Tutor)'] = X['Level (Tutor)'].astype(float)
X['CF (Matrix Level)'] = X['CF (Matrix Level)'].astype(float)
X['CF (Matrix Order)'] = X['CF (Matrix Order)'].astype(float)

StudentNet(
  (fc1): Linear(in_features=5, out_features=10, bias=True)
  (fc2): Linear(in_features=10, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=1, bias=True)
)


In [88]:
X_train['Duration (sec)'] = X_train['Duration (sec)'].astype(float)
X_train['Level (Tutor)'] = X_train['Level (Tutor)'].astype(float)
X_train['CF (Matrix Level)'] = X_train['CF (Matrix Level)'].astype(float)
X_train['CF (Matrix Order)'] = X_train['CF (Matrix Order)'].astype(float)

X_test['Duration (sec)'] = X_test['Duration (sec)'].astype(float)
X_test['Level (Tutor)'] = X_test['Level (Tutor)'].astype(float)
X_test['CF (Matrix Level)'] = X_test['CF (Matrix Level)'].astype(float)
X_test['CF (Matrix Order)'] = X_test['CF (Matrix Order)'].astype(float)

X_test = X_test.drop(columns=['Outcome'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-d

KeyError: "['Outcome'] not found in axis"

In [84]:
torch.tensor(X_train.values)

tensor([[  6.,  29.,   6.,   9., 291.,   0.],
        [  7.,  12.,   7.,   4., 104.,   1.],
        [  5.,  10.,   5.,   4., 107.,   0.],
        ...,
        [ 12.,  44.,  12.,   9., 287.,   0.],
        [ 16.,  13.,  16.,   4., 103.,   1.],
        [  6.,  17.,   6.,   4., 116.,   1.]], dtype=torch.float64)

In [86]:
X_train = X_train.drop(columns=['Outcome'])
X_train

Unnamed: 0,Duration (sec),Level (Tutor),Attempt At Step,CF (Matrix Level),CF (Matrix Order)
12355,6.0,29.0,6.0,9.0,291.0
41803,7.0,12.0,7.0,4.0,104.0
123820,5.0,10.0,5.0,4.0,107.0
142498,8.0,80.0,8.0,2.0,74.0
82911,5.0,37.0,5.0,4.0,109.0
...,...,...,...,...,...
82309,7.0,36.0,7.0,4.0,110.0
56273,7.0,36.0,7.0,4.0,110.0
12018,12.0,44.0,12.0,9.0,287.0
42336,16.0,13.0,16.0,4.0,103.0


In [90]:
y_train

array([0, 1, 0, ..., 0, 1, 1], dtype=int64)

In [93]:
len(X_train)
X_train

Unnamed: 0,Duration (sec),Level (Tutor),Attempt At Step,CF (Matrix Level),CF (Matrix Order)
12355,6.0,29.0,6.0,9.0,291.0
41803,7.0,12.0,7.0,4.0,104.0
123820,5.0,10.0,5.0,4.0,107.0
142498,8.0,80.0,8.0,2.0,74.0
82911,5.0,37.0,5.0,4.0,109.0
...,...,...,...,...,...
82309,7.0,36.0,7.0,4.0,110.0
56273,7.0,36.0,7.0,4.0,110.0
12018,12.0,44.0,12.0,9.0,287.0
42336,16.0,13.0,16.0,4.0,103.0


In [107]:
y_train = torch.tensor(y_train)

In [180]:
class StudentNet(nn.Module):
    
    def __init__(self, input_size, hidden1_size, hidden2_size, num_classes):
        
        super(StudentNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden1_size)
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.fc3 = nn.Linear(hidden2_size, 20)
        self.fc4 = nn.Linear(20, num_classes)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return F.log_softmax(x, dim=1)

model = StudentNet(5, 10, 10, 1)
print(model)

import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.03)
loss_function = nn.MSELoss()

batch_size = 60
epochs = 100

for epoch in range(epochs):
    for i in range(0, len(X_train), batch_size):
        
        batch_X = X_train[i: i+batch_size].float()
        batch_y = y_train[i:i+batch_size].float()
        
        model.zero_grad()
        output = model(batch_X)
        loss = loss_function(output, batch_y)
        loss.backward()
        optimizer.step()
    
    if epoch % 100 == 0:
        print(loss)
print(loss)

StudentNet(
  (fc1): Linear(in_features=5, out_features=10, bias=True)
  (fc2): Linear(in_features=10, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=20, bias=True)
  (fc4): Linear(in_features=20, out_features=1, bias=True)
)
tensor(0.7059, grad_fn=<MseLossBackward>)
tensor(0.7059, grad_fn=<MseLossBackward>)


In [175]:
with torch.no_grad():
    for i in range(len(X_test)):
        X_test = X_test.float()
        net_out = model(X_test[i].view(-1, 5))[0][0].item()
        print(net_out)

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [167]:
X_train

tensor([[  6.,  29.,   6.,   9., 291.],
        [  7.,  12.,   7.,   4., 104.],
        [  5.,  10.,   5.,   4., 107.],
        ...,
        [ 12.,  44.,  12.,   9., 287.],
        [ 16.,  13.,  16.,   4., 103.],
        [  6.,  17.,   6.,   4., 116.]], dtype=torch.float64)

In [169]:
X_test = torch.tensor(X_test.values)

In [170]:
X_test

tensor([[  7.,  36.,   7.,   4., 110.],
        [ 15.,  76.,  15.,   2.,  85.],
        [  6.,  35.,   6.,   4., 112.],
        ...,
        [  5.,  66.,   5.,   2.,  51.],
        [  9.,  27.,   9.,   9., 295.],
        [  7.,  12.,   7.,   4., 104.]], dtype=torch.float64)