Author: José A Ruipérez-Valiente

In [19]:
# Load libraries
import csv
import pandas as pd
import numpy as np
import pdb
import copy
import scipy.spatial.distance
import datetime
import gc
from scipy.stats import ttest_ind

In [20]:
# PARAMETERS

# Distance metric passed to compute_distance when the distance matrix is built.
# Possible values: mean_absolute_difference; mean_squared_differences; absolute_difference; squared_differences; cosine; euclidean
distance_metric = 'mean_absolute_difference'

# Minimum percentage of items that two accounts must have in common for the pair to be kept in the analysis;
# compute_distance returns NaN for pairs below this value
minPercentageCommonItems = 80

# Minimum percentage of items an account must have submitted to be kept in the analysis
# (the filter keeps accounts whose percentage is strictly greater than this value)
minPercentageItemsSubmitted = 50

# Names of the columns (assignments) to be kept in the analysis
# NOTE(review): in the cells visible here this list is only referenced from commented-out code
selectedItems = ['assignment_1', 'assignment_2', 'assignment_3', 'assignment_4', 'assignment_5']

# Time threshold in minutes: pairs of accounts with a distance below it are listed as close submitters
timeThreshold = 30

In [21]:
# Load the submission matrix from CSV; the first CSV column becomes the row index
submission_matrix = pd.read_csv("example_submission_matrix.csv", index_col=0)

In [22]:
# Display the loaded matrix: one row per index value, one timestamp column per assignment (empty = not submitted)
submission_matrix

Unnamed: 0,assignment_1,assignment_2,assignment_3,assignment_4,assignment_5
1,2014-06-21T07:53:23.762423+00:00,2014-06-21T08:04:24.962730+00:00,2014-06-21T08:06:12.558066+00:00,2014-06-21T08:04:24.962730+00:00,2014-06-21T08:21:58.590764+00:00
2,2014-05-31T07:59:57.231538+00:00,2014-05-31T08:00:35.057636+00:00,2014-05-31T08:00:35.057636+00:00,2014-05-31T08:00:35.057636+00:00,2014-05-31T08:01:58.247444+00:00
3,2014-05-30T07:50:41.822814+00:00,2014-05-30T07:58:13.736850+00:00,2014-05-30T07:58:13.736850+00:00,2014-05-30T07:55:19.917861+00:00,2014-05-30T16:05:37.965483+00:00
4,,,,,
5,2014-09-05T13:03:14.540994+00:00,,,,2014-09-05T13:05:54.874886+00:00
6,2014-05-30T02:11:59.165730+00:00,2014-05-30T02:13:25.718455+00:00,2014-05-30T02:15:58.231386+00:00,2014-05-30T02:17:31.262157+00:00,2014-05-30T02:23:31.771410+00:00
7,2014-07-25T09:31:45.709167+00:00,2014-07-25T09:31:21.654261+00:00,2014-07-25T09:31:21.654261+00:00,2014-07-25T09:31:21.654261+00:00,
8,,,,,
9,,,,,
10,,,,,


In [23]:
# These lines could be used to keep only a selection of assignments; if none is selected, all columns are used.
# The snippet below is wrapped in a triple-quoted string, so it is NOT executed: the cell only creates the
# empty list `colsToKeep` and echoes the string as its output.
# NOTE(review): the disabled snippet references `selectedInputFields`, which is not defined in the visible
# cells (the parameters cell defines `selectedItems`) -- fix the name before re-enabling it.

colsToKeep = []
"""
for col in submission_matrix.columns:
    colsToKeep.append(col in selectedInputFields)

submission_matrix = submission_matrix.iloc[:, colsToKeep]
"""

'\nfor col in submission_matrix.columns:\n    colsToKeep.append(col in selectedInputFields)\n\nsubmission_matrix = submission_matrix.iloc[:, colsToKeep]\n'

In [24]:
# Percentage (rounded to a whole number) of items for which each account has a submission
submittedCounts = submission_matrix.notna().sum(axis=1)
percentageItems = (100 * (submittedCounts / submission_matrix.shape[1])).round()

# Keep only the accounts whose percentage of submitted items exceeds the configured minimum.
# NOTE(review): the comparison is strict, so an account exactly at minPercentageItemsSubmitted is dropped;
# confirm this is intended, since the pairwise check in compute_distance keeps values equal to its minimum.
hasEnoughItems = percentageItems > minPercentageItemsSubmitted
minItems_submission_matrix = submission_matrix[hasEnoughItems]

In [25]:
# Display the accounts that remain after the minimum-submissions filter
minItems_submission_matrix

Unnamed: 0,assignment_1,assignment_2,assignment_3,assignment_4,assignment_5
1,2014-06-21T07:53:23.762423+00:00,2014-06-21T08:04:24.962730+00:00,2014-06-21T08:06:12.558066+00:00,2014-06-21T08:04:24.962730+00:00,2014-06-21T08:21:58.590764+00:00
2,2014-05-31T07:59:57.231538+00:00,2014-05-31T08:00:35.057636+00:00,2014-05-31T08:00:35.057636+00:00,2014-05-31T08:00:35.057636+00:00,2014-05-31T08:01:58.247444+00:00
3,2014-05-30T07:50:41.822814+00:00,2014-05-30T07:58:13.736850+00:00,2014-05-30T07:58:13.736850+00:00,2014-05-30T07:55:19.917861+00:00,2014-05-30T16:05:37.965483+00:00
6,2014-05-30T02:11:59.165730+00:00,2014-05-30T02:13:25.718455+00:00,2014-05-30T02:15:58.231386+00:00,2014-05-30T02:17:31.262157+00:00,2014-05-30T02:23:31.771410+00:00
7,2014-07-25T09:31:45.709167+00:00,2014-07-25T09:31:21.654261+00:00,2014-07-25T09:31:21.654261+00:00,2014-07-25T09:31:21.654261+00:00,
11,2014-06-02T15:47:23.639336+00:00,2014-06-02T15:48:36.508434+00:00,2014-06-02T15:55:32.113046+00:00,2014-06-02T16:02:56.827037+00:00,2014-06-02T16:00:44.603006+00:00
12,2014-06-01T19:57:18.553506+00:00,2014-06-01T19:59:15.103679+00:00,2014-06-01T19:59:15.103679+00:00,2014-06-01T19:59:15.103679+00:00,2014-06-01T20:01:55.874304+00:00
16,2014-05-09T00:27:05.312021+00:00,2014-05-09T00:31:12.552359+00:00,2014-05-09T00:31:44.376429+00:00,2014-05-09T00:31:44.376429+00:00,
17,2014-06-06T02:42:21.433137+00:00,2014-06-06T03:01:48.588642+00:00,2014-06-06T03:01:48.588642+00:00,2014-06-06T03:03:22.699932+00:00,2014-06-06T03:08:57.949438+00:00
19,2014-06-20T12:55:26.343911+00:00,2014-06-22T16:57:18.983302+00:00,2014-06-22T16:58:43.640375+00:00,2014-06-22T16:58:43.640375+00:00,2014-06-22T17:19:59.651668+00:00


In [26]:
def compute_distance(submissions1, submissions2, distance_metric='absolute_difference',
                     onlyCommonItems=True, minCommonPercentage=None):
    """Compute the distance between two arrays of submission timestamps.

    Parameters
    ----------
    submissions1, submissions2 : array-like
        Submission timestamps (or strings parseable by ``pd.to_datetime``), one entry per
        item, with NaN/None for items that were not submitted. pandas Series are paired by
        index label, other array-likes by position.
    distance_metric : str
        One of 'absolute_difference', 'mean_absolute_difference', 'squared_differences',
        'mean_squared_differences', 'euclidean' or 'cosine'.
    onlyCommonItems : bool
        When True, only items submitted in both arrays are compared and the pair is rejected
        if the percentage of common items is below the minimum. When False, the minimum
        percentage is not checked; pairs with a missing timestamp are still left out of the
        computation because no difference can be computed for them.
    minCommonPercentage : float, optional
        Minimum percentage of common items required. When None, the module-level
        ``minPercentageCommonItems`` is used.

    Returns
    -------
    int or float
        * absolute / mean absolute difference and euclidean: whole minutes (int).
        * squared / mean squared differences: whole squared minutes (int).
        * cosine: the dimensionless cosine distance between the two timestamp vectors
          (seconds since the Unix epoch), returned unrounded as a float.
        * ``np.nan`` when the pair has too few common items or no comparable items at all.

    Raises
    ------
    ValueError
        If ``distance_metric`` is not one of the supported metrics.
    """
    supportedMetrics = ('absolute_difference', 'mean_absolute_difference',
                        'squared_differences', 'mean_squared_differences',
                        'euclidean', 'cosine')
    if distance_metric not in supportedMetrics:
        raise ValueError("Unknown distance_metric {!r}; expected one of: {}".format(
            distance_metric, ", ".join(supportedMetrics)))

    # Wrapping in a Series lets plain lists / arrays be masked the same way as DataFrame rows
    submissions1 = pd.Series(submissions1)
    submissions2 = pd.Series(submissions2)

    if onlyCommonItems:
        if minCommonPercentage is None:
            minCommonPercentage = minPercentageCommonItems
        commonMask = submissions1.notna() & submissions2.notna()
        totalItems = len(submissions1)
        percentageCommonItems = 100 * (commonMask.sum() / totalItems) if totalItems else 0.0
        # With no common item there is nothing to compare: report NaN rather than a distance of 0
        if percentageCommonItems < minCommonPercentage or not commonMask.any():
            return np.nan
        submissions1 = submissions1[commonMask]
        submissions2 = submissions2[commonMask]

    # utc=True gives both sides the same timezone-aware dtype (naive values are taken as UTC)
    times1 = pd.to_datetime(submissions1, utc=True)
    times2 = pd.to_datetime(submissions2, utc=True)
    comparable = times1.notna() & times2.notna()
    times1 = times1[comparable]
    times2 = times2[comparable]
    itemCount = len(times1)
    if itemCount == 0:
        return np.nan

    if distance_metric == 'cosine':
        # Cosine distance is computed on the timestamps themselves, not on their differences
        epoch = pd.Timestamp('1970-01-01', tz='UTC')
        seconds1 = (times1 - epoch).dt.total_seconds().to_numpy()
        seconds2 = (times2 - epoch).dt.total_seconds().to_numpy()
        return float(scipy.spatial.distance.cosine(seconds1, seconds2))

    differences = times1 - times2

    if distance_metric in ('absolute_difference', 'mean_absolute_difference'):
        # Stay in Timedelta arithmetic so the rounding matches the original implementation
        distance = differences.abs().sum()
        if distance_metric == 'mean_absolute_difference':
            distance = distance / itemCount
        return round(distance.total_seconds() / 60)

    # Timedeltas cannot be squared, so the remaining metrics work on float minutes
    minutes = differences.dt.total_seconds() / 60
    squaredSum = float((minutes ** 2).sum())
    if distance_metric == 'squared_differences':
        value = squaredSum
    elif distance_metric == 'mean_squared_differences':
        value = squaredSum / itemCount
    else:  # 'euclidean'
        value = squaredSum ** 0.5
    return round(value)

In [27]:
# Example of distance computation between two accounts: positions 1 and 4 of the filtered matrix
# (the rows labelled 2 and 7 in the table displayed above). Prints the mean absolute difference in minutes.
print(compute_distance(minItems_submission_matrix.iloc[1], minItems_submission_matrix.iloc[4], 'mean_absolute_difference'))

79291


In [28]:
# Creating distances matrix in minutes.
# The values are collected in a NumPy array and the DataFrame is built once at the end: inserting one
# column at a time fragments the frame and triggers pandas' PerformanceWarning on larger matrices.
# All supported metrics are symmetric, so each pair is computed once and mirrored.
studentIds = list(minItems_submission_matrix.index)
# Rows are fetched by position once (also robust to repeated index labels)
studentRows = [minItems_submission_matrix.iloc[position] for position in range(len(studentIds))]
distanceValues = np.full((len(studentIds), len(studentIds)), np.nan)

print("Starting...")
for index, submissionsCol in enumerate(studentRows):
    if (index % 50) == 0:
        print(index)
    # Start at `index` so the diagonal is still computed (it can be NaN for sparse accounts)
    for offset, submissionsRow in enumerate(studentRows[index:]):
        pairDistance = compute_distance(submissionsCol, submissionsRow, distance_metric)
        distanceValues[index, index + offset] = pairDistance
        distanceValues[index + offset, index] = pairDistance

print("...Finished!")
distance_matrix = pd.DataFrame(distanceValues,
                               index=minItems_submission_matrix.index,
                               columns=studentIds)

Starting...
0
50
100
...Finished!


In [29]:
# Display the pairwise distance matrix (NaN marks pairs for which compute_distance returned NaN)
distance_matrix

Unnamed: 0,1,2,3,6,7,11,12,16,17,19,...,260,261,267,268,270,272,273,274,275,277
1,0.0,30245.0,31592.0,32030.0,49049.0,26891.0,28087.0,62372.0,21906.0,1807.0,...,15696.0,9923.0,23464.0,16119.0,31373.0,24887.0,32553.0,3478.0,29797.0,29954.0
2,30245.0,0.0,1347.0,1784.0,79291.0,3354.0,2159.0,32130.0,8339.0,31597.0,...,22503.0,20325.0,6777.0,14126.0,1131.0,5359.0,2308.0,26764.0,449.0,292.0
3,31592.0,1347.0,0.0,437.0,80736.0,4701.0,3506.0,30685.0,9686.0,32944.0,...,23415.0,21647.0,8222.0,15473.0,314.0,6682.0,960.0,28208.0,1772.0,1639.0
6,32030.0,1784.0,437.0,0.0,81077.0,5139.0,3943.0,30344.0,10123.0,33382.0,...,23877.0,22108.0,8563.0,15910.0,655.0,7143.0,523.0,28549.0,2232.0,2076.0
7,49049.0,79291.0,80736.0,81077.0,0.0,75938.0,77133.0,111421.0,70954.0,47854.0,...,,,72514.0,65167.0,80422.0,,81599.0,52528.0,,79023.0
11,26891.0,3354.0,4701.0,5139.0,75938.0,0.0,1196.0,35483.0,4985.0,28243.0,...,20824.0,16969.0,3424.0,10772.0,4484.0,2005.0,5662.0,23410.0,2907.0,3062.0
12,28087.0,2159.0,3506.0,3943.0,77133.0,1196.0,0.0,34288.0,6180.0,29439.0,...,21423.0,18166.0,4619.0,11967.0,3289.0,3200.0,4466.0,24605.0,1710.0,1867.0
16,62372.0,32130.0,30685.0,30344.0,111421.0,35483.0,34288.0,0.0,40467.0,63567.0,...,,,38907.0,46254.0,30999.0,,29822.0,58894.0,,32398.0
17,21906.0,8339.0,9686.0,10123.0,70954.0,4985.0,6180.0,40467.0,0.0,23258.0,...,18326.0,11982.0,1560.0,5787.0,9468.0,2980.0,10646.0,18427.0,7890.0,8047.0
19,1807.0,31597.0,32944.0,33382.0,47854.0,28243.0,29439.0,63567.0,23258.0,0.0,...,14919.0,11898.0,24660.0,17471.0,32568.0,26083.0,33905.0,4674.0,30994.0,31305.0


In [30]:
# Save distance matrix
# distance_matrix.to_csv("distance_matrix_" + distance_metric + "_" + "_".join(selectedItems) + ".csv", sep=";", header=True, na_rep=np.nan, index=False)

In [31]:
# Load distance matrix
# distance_matrix = pd.read_csv("distance_matrix_" + distance_metric + "_" + "_".join(selectedItems) + ".csv", sep=";")

In [32]:
# Boolean mask selecting each unordered pair once: True strictly below the main diagonal,
# False on the diagonal and above it. Used to build the unique triplets {student1, student2, distance}.
allCells = np.ones(distance_matrix.shape, dtype=bool)
mask = np.tril(allCells, k=-1)
# Applying the mask directly (distance_matrix = distance_matrix[mask]) caused memory problems,
# so the selected cells are extracted in a separate step instead.

In [33]:
# Build the table of unique triplets {students1, students2, distances} from the cells selected by `mask`.
# The cells are extracted with vectorized NumPy indexing instead of a Python loop over every cell
# with a per-element .iloc lookup; np.nonzero returns the positions in row-major order, which is
# the order the previous cell-by-cell loop visited them, so the row order of the result is unchanged.
studentIDs = list(distance_matrix.columns.values)

print("Starting...")

pairRows, pairCols = np.nonzero(mask)
idValues = distance_matrix.columns.to_numpy()

students1 = idValues[pairRows]
students2 = idValues[pairCols]
distances = distance_matrix.to_numpy()[pairRows, pairCols]

tripletsDataframe = pd.DataFrame({
    'students1': students1,
    'students2': students2,
    'distances': distances,
})

print("...Finished!")

Starting...
...Finished!


In [34]:
# Example: pairs of accounts whose distance is strictly below timeThreshold (NaN distances never match)
tripletsDataframe.loc[tripletsDataframe['distances'].lt(timeThreshold)]

Unnamed: 0,students1,students2,distances
101,27,20,23.0
813,85,67,4.0
956,91,20,9.0
1186,104,20,8.0
1220,104,91,1.0
1775,128,11,6.0
2344,143,140,13.0
4589,204,62,7.0
4686,208,64,1.0
4826,211,152,20.0


In [35]:
#tripletsDataframe.to_csv("triplets_matrix_" + distance_metric + "_" + "_".join(selectedItems) + ".csv", sep=";", header=True, index=False)

In [36]:
#tripletsDataframe[tripletsDataframe['distances'] < 90].to_csv("selected_triplets_" + distance_metric + "_" + "_".join(selectedItems) + ".csv", sep=";", header=True, index=False)