In [1]:
import pandas as pd
import os
import re
from tqdm import tqdm

In [2]:
folder_directory = 'Downloads/CS5340_Project/moss'
file_name = '1821_1935_2023.csv'

moss_folder = os.path.join(folder_directory, file_name)

In [29]:
moss_data = pd.read_csv(moss_folder)

In [30]:
moss_data.sample(10)

Unnamed: 0,mission,cohort1,assessment1,submission1,name1,cohort2,assessment2,submission2,name2,percent1,percent2,lines,url
2770,hungry_games_training_part_iii,1935,39738,1089876,veronica_gibson,1935,39738,1084442,kimberly_lam,13,11,24,http://moss.stanford.edu/results/1/54197889831...
2348,facebook_stalkers,1821,33195,927799,claudia_moore,1935,38696,1087430,deborah_baker,69,69,150,http://moss.stanford.edu/results/8/95115982680...
2982,cyclic_runes,1935,38695,1025640,caitlin_rodriguez,1935,38695,1020655,martin_payne,64,64,65,http://moss.stanford.edu/results/1/86567552350...
224,dna_using_data_directed_programming,1935,39561,1074912,lance_haynes,1935,39561,1074887,karen_walker,22,14,35,http://moss.stanford.edu/results/0/10303382188...
4119,diagnostics,1935,38699,1047841,justin_burton,2023,41920,1154821,tara_bonilla,15,36,24,http://moss.stanford.edu/results/5/26685513583...
1405,book_of_advanced_spells,1935,38723,1030790,kelsey_nguyen,2023,41930,1142463,jonathan_snyder,58,59,16,http://moss.stanford.edu/results/5/88376757115...
2660,hungry_games_training_part_iii,1935,39738,1090420,patricia_baker,1935,39738,1081912,angela_cordova,69,67,38,http://moss.stanford.edu/results/1/54197889831...
1669,to_infinity_and_beyond,1821,33187,912164,mrs._elizabeth_mcdonald_md,1935,38689,1070616,stephanie_mosley,70,80,37,http://moss.stanford.edu/results/8/59530496097...
1165,dna_translation,1935,39560,1079814,vanessa_camacho,2023,41900,1195959,rebecca_russo,35,46,50,http://moss.stanford.edu/results/5/39664034484...
351,circle_manipulation,1935,38686,1041162,victoria_white,1935,38686,1041395,traci_rowland,67,58,13,http://moss.stanford.edu/results/4/30999930665...


In [31]:
moss_data.shape

(4134, 13)

In [32]:
## Name to dict

all_names = set(moss_data.name1.tolist() + moss_data.name2.tolist())
name_to_ID = {name: i for i, name in enumerate(sorted(all_names))}
ID_to_name = {i: name for i, name in enumerate(sorted(all_names))}

In [33]:
len(name_to_ID), name_to_ID['john_cuevas_md'], ID_to_name[476] ## Sanity Check

(1032, 476, 'john_cuevas_md')

In [34]:
## Task 1 (edges)

import torch

pairs = set([(rows.name1, rows.name2) for _, rows in moss_data.iterrows()])
sorted_pairs = sorted(pairs, key=lambda x: (name_to_ID[x[0]], name_to_ID[x[1]]))
tmp_s = list(map(lambda x: name_to_ID[x[0]], sorted_pairs))
tmp_t = list(map(lambda x: name_to_ID[x[1]], sorted_pairs))
src = []
for i, j in zip(tmp_s, tmp_t):
    src.append(i)
    src.append(j)
tgt = []
for i, j in zip(tmp_s, tmp_t):
    tgt.append(j)
    tgt.append(i)
edge_index = torch.tensor([src, tgt], dtype=torch.long)

In [38]:
## Standardise two percentages and extract rank

def get_code_length(percent, num_lines):
    if percent:
        return 100/percent*num_lines
    return 0

def standardise(percent1, percent2, num_lines):
    code_length1 = get_code_length(percent1, num_lines)
    code_length2 = get_code_length(percent2, num_lines)
    return (2 * num_lines)/(code_length1 + code_length2)*100

moss_data['standardised1'] = moss_data.apply(lambda x: standardise(x['percent1'], x['percent2'], x['lines']), axis=1)

pattern = re.compile(r'(match[0-9]+).html') ## Pattern to extract ranks
moss_data['Moss Rank'] = moss_data['url'].apply(lambda x: int(pattern.search(x)[0][5:-5])+1)

In [36]:
## Group and Rank by missions

grouped_moss_data = moss_data.groupby(["mission"])
mission_titles = grouped_moss_data['mission'].unique().index.tolist()
max_rank = dict(zip(grouped_moss_data['Moss Rank'].max().index, grouped_moss_data['Moss Rank'].max()))
mission_averages = grouped_moss_data['standardised1'].mean()
sorted_averages = sorted(zip(mission_averages.index, mission_averages), key=lambda x: x[1])
ranked_averages = dict(map(lambda x: (x[0], len(sorted_averages)-sorted_averages.index(x)), sorted_averages))

In [77]:
## Slice points by MOSS Rank

def points_aggregation(moss_rank, max_rank, points):
    return ((max_rank-moss_rank)/max_rank)*points

def obtain_points(moss_rank, mission, mrank, raverages):
    _max_rank = mrank[mission]
    points = raverages[mission]
    return points_aggregation(moss_rank, _max_rank, points)

moss_data['Points'] = moss_data.apply(lambda x: obtain_points(x['Moss Rank'], x['mission'], max_rank, ranked_averages), axis=1)
groupedby_name_pairs = moss_data.groupby(['name1', 'name2'])

In [78]:
groupedby_name_pairs.head()

Unnamed: 0,mission,cohort1,assessment1,submission1,name1,cohort2,assessment2,submission2,name2,percent1,percent2,lines,url,standardised1,Moss Rank,Points
0,beautiful_runes,1821,33159,874054,edward_walker,1935,38713,1025319,andrea_davis,1,32,19,http://moss.stanford.edu/results/1/88557058395...,1.939394,1,27.813333
1,beautiful_runes,1935,38713,1035686,lisa_reese,1935,38713,1027426,joshua_hernandez,84,66,16,http://moss.stanford.edu/results/1/88557058395...,73.920000,2,27.626667
2,beautiful_runes,1935,38713,1027335,tara_bonilla,2023,42731,1141632,tara_bonilla,84,77,11,http://moss.stanford.edu/results/1/88557058395...,80.347826,3,27.440000
3,beautiful_runes,1935,38713,1035686,lisa_reese,1935,38713,1026134,carol_harris,66,47,12,http://moss.stanford.edu/results/1/88557058395...,54.902655,4,27.253333
4,beautiful_runes,1935,38713,1027426,joshua_hernandez,1935,38713,1026134,carol_harris,51,47,12,http://moss.stanford.edu/results/1/88557058395...,48.918367,5,27.066667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4129,diagnostics,1935,38699,1046432,debbie_fernandez,1935,38699,1049862,karen_mcdaniel,39,47,37,http://moss.stanford.edu/results/5/26685513583...,42.627907,146,0.373333
4130,diagnostics,1935,38699,1046432,debbie_fernandez,1935,38699,1041562,james_long,39,55,41,http://moss.stanford.edu/results/5/26685513583...,45.638298,147,0.280000
4131,diagnostics,1935,38699,1043503,william_smith,1935,38699,1049862,karen_mcdaniel,50,47,37,http://moss.stanford.edu/results/5/26685513583...,48.453608,148,0.186667
4132,diagnostics,1935,38699,1043503,william_smith,1935,38699,1041562,james_long,50,55,41,http://moss.stanford.edu/results/5/26685513583...,52.380952,149,0.093333


In [13]:
grouped_moss_data.head()

Unnamed: 0,mission,cohort1,assessment1,submission1,name1,cohort2,assessment2,submission2,name2,percent1,percent2,lines,url,standardised1
0,beautiful_runes,1821,33159,874054,edward_walker,1935,38713,1025319,andrea_davis,1,32,19,http://moss.stanford.edu/results/1/88557058395...,1.939394
1,beautiful_runes,1935,38713,1035686,lisa_reese,1935,38713,1027426,joshua_hernandez,84,66,16,http://moss.stanford.edu/results/1/88557058395...,73.920000
2,beautiful_runes,1935,38713,1027335,tara_bonilla,2023,42731,1141632,tara_bonilla,84,77,11,http://moss.stanford.edu/results/1/88557058395...,80.347826
3,beautiful_runes,1935,38713,1035686,lisa_reese,1935,38713,1026134,carol_harris,66,47,12,http://moss.stanford.edu/results/1/88557058395...,54.902655
4,beautiful_runes,1935,38713,1027426,joshua_hernandez,1935,38713,1026134,carol_harris,51,47,12,http://moss.stanford.edu/results/1/88557058395...,48.918367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3984,diagnostics,1935,38699,1047986,john_cuevas_md,2023,41920,1161897,john_cuevas_md,76,58,83,http://moss.stanford.edu/results/5/26685513583...,65.791045
3985,diagnostics,1935,38699,1049111,jacqueline_middleton,2023,41920,1168119,jacqueline_middleton,97,97,71,http://moss.stanford.edu/results/5/26685513583...,97.000000
3986,diagnostics,1935,38699,1045810,tara_bonilla,2023,41920,1154821,tara_bonilla,88,88,69,http://moss.stanford.edu/results/5/26685513583...,88.000000
3987,diagnostics,1935,38699,1053668,sherry_everett,2023,41920,1159432,leah_pena,54,74,55,http://moss.stanford.edu/results/5/26685513583...,62.437500


In [None]:
x = []
code_length = []
exceptions = []
for _, row in moss_data.iterrows():
    if row.percent1 and row.percent2:
        code_length.append((100/row.percent1*row.lines, 100/row.percent2*row.lines))
    else:
        exceptions.append(row)

In [68]:
code_length[0]

(1900.0, 59.375)

In [44]:
edge_index

tensor([[   0,  514,    0,  ...,  258, 1031,  968],
        [ 514,    0,  644,  ..., 1031,  968, 1031]])

In [None]:
from torch_geometric.data import Data



In [69]:
folder_directory = 'Downloads/CS5340_Project/true_labels'
# file_name = '2023.csv'
# file_name = '1935.csv'
file_name = '1821.csv'

tl_folder = os.path.join(folder_directory, file_name)

In [70]:
tl_data = pd.read_csv(tl_folder)

In [None]:
tl_data

In [71]:
## Task 2 & 3 (Check Moss Results)

def checker(name):
    if name in name_to_ID.keys():
        return name_to_ID[name]
    return 'Name Not Found Error'

labels = tl_data[['Name', 'Please indicate the assignments']]
labels['ID'] = labels['Name'].apply(lambda x: checker(x))
names_not_found = labels[labels['ID'] == 'Name Not Found Error']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [72]:
labels.head(n=30)

Unnamed: 0,Name,Please indicate the assignments,ID
0,frank_martin,"curve_manipulation,rogue_train",316
1,matthew_cohen,"curve_manipulation,rogue_train",677
2,jimmy_bennett,"3d_runes,curve_introduction,curve_manipulation...",Name Not Found Error
3,alexis_robinson,"rogue_train,lazy_susan",21
4,jeffrey_ruiz,"curve_manipulation,dragonize",417
5,andrea_bennett,rune_reading,Name Not Found Error
6,jody_clark,lazy_susan,469
7,john_mcbride,"cyclic_runes,advanced_spells,curve_introductio...",Name Not Found Error
8,elizabeth_kennedy,"cyclic_runes,circle_manipulation",289
9,michelle_watson,diagnostics,741


In [73]:
names_not_found

Unnamed: 0,Name,Please indicate the assignments,ID
2,jimmy_bennett,"3d_runes,curve_introduction,curve_manipulation...",Name Not Found Error
5,andrea_bennett,rune_reading,Name Not Found Error
7,john_mcbride,"cyclic_runes,advanced_spells,curve_introductio...",Name Not Found Error
30,justin_ramirez,"curve_manipulation,rogue_train",Name Not Found Error
49,kristina_stephenson,curve_manipulation,Name Not Found Error


In [74]:
names_not_found.to_csv(os.path.join(folder_directory, 'missing_names_'+file_name), index=False)

In [None]:
## Task 4 (NetworkX)



In [11]:
## Task 5 (Edge_Index)

pattern = re.compile(r'(match[0-9]+).html')

edge_index = pd.DataFrame([], columns=['nodeid', 'mission', 'moss_ranking', 'edge_weights'])
for name in tqdm(all_names):
    idee = name_to_ID[name]
    relevant_rows = moss_data[(moss_data.name1 == name) | (moss_data.name2 == name)]
    for index, item in relevant_rows.iterrows():
        edge_index = edge_index.append({'nodeid': idee, 
                                        'mission': item.mission, 
                                        'moss_ranking': pattern.search(item.url)[0][5:-5], 
                                        }, ignore_index=True)

100%|██████████| 1032/1032 [00:25<00:00, 40.38it/s]


In [9]:
relevant_rows.head()

Unnamed: 0,mission,cohort1,assessment1,submission1,name1,cohort2,assessment2,submission2,name2,percent1,percent2,lines,url
510,funky_merge_spell,1821,33174,952434,morgan_williams,1935,38725,1059628,james_long,54,50,50,http://moss.stanford.edu/results/9/55540043683...
904,lazy_susan,1821,33173,951628,morgan_williams,2023,41931,1170102,jennifer_washington,91,90,103,http://moss.stanford.edu/results/0/56645335735...
909,lazy_susan,1821,33173,951628,morgan_williams,1935,38724,1048729,kenneth_allen,74,77,93,http://moss.stanford.edu/results/0/56645335735...
912,lazy_susan,1821,33173,951628,morgan_williams,1935,38724,1062843,corey_bender,63,69,92,http://moss.stanford.edu/results/0/56645335735...
916,lazy_susan,1821,33173,912554,michael_gomez,1821,33173,951628,morgan_williams,59,61,49,http://moss.stanford.edu/results/0/56645335735...


In [10]:
edge_index.head()

Unnamed: 0,NodeID,Mission,MOSS Ranking
0,822,dna_translation,11
1,822,dna_translation,67
2,822,dna_translation,126
3,822,what_sort_of_sorcery_is_this,116
4,822,kochize,10
