In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import pickle
import numpy as np

In [2]:
# Load the anonymised survey export and pick the preference-scoring scheme
# that is applied later when the (student, project) scores are built.
file_path = '701_anon.csv'
df = pd.read_csv(file_path)
cols = df.shape[1]  # column count of the raw export
prefType = 'LinNorm'  # one of: 'Exp', 'Lin', 'LinNorm'

In [3]:
# Show the raw column labels of the freshly loaded frame.
df.columns

Index(['Are you interested in being a Team Representative? ',
       '59f85917f60e41e5e6613affb3c5cf08', '75fa835c10c06ca20a5cb18840bcc148',
       '94c1d8d6efd12aca34cf85faff366322', '7f862ceb4514e29f478e1e5154d8e19b',
       'e0c95d01bed5b12ad176754dcc09eebc', 'da9b65da86aa5d93868c8ef135f5633d',
       '11bf953de1068c3a1935afec294bdc42', '496f9ba8de22f1bdb6469bc5e72462d7',
       'Mondays', 'Tuesdays', 'Wednesdays', 'Thursdays', 'Fridays', 'id name',
       'email anonymized', 'anonymized group preference'],
      dtype='object')

In [4]:
# Number of project columns that directly follow the first (representative) column.
numOfProjects = 8

# Remove the weekday columns and 'id name'; a dropped copy is kept separately
# and the working frame itself is modified in place.
columns_to_drop = ['id name', 'Mondays', 'Tuesdays', 'Wednesdays', 'Thursdays', 'Fridays']
df_dropped_multiple = df.drop(columns=columns_to_drop)
df.drop(columns=columns_to_drop, inplace=True)

# Positional rename: first column -> 'representative', the last two columns
# -> 'id' and 'friends'.
column_name_mapping = {
    df.columns[0]: 'representative',
    df.columns[-2]: 'id',
    df.columns[-1]: 'friends',
}
# Each project column is shortened to the last five characters of its label
# plus the team letter 'A'.
column_name_mapping.update({
    df.columns[pos]: df.columns[pos][-5:] + 'A'
    for pos in range(1, numOfProjects + 1)
})

df.rename(columns=column_name_mapping, inplace=True)

In [5]:
# Show the column labels after the drop and rename above.
df.columns

Index(['representative', '5cf08A', 'cc148A', '66322A', '8e19bA', '9eebcA',
       '5633dA', 'bdc42A', '462d7A', 'id', 'friends'],
      dtype='object')

In [6]:
# Encode the team-representative answer as a 0/1 flag: 'No' -> 0, while both
# 'Maybe' and 'Yes' -> 1.
# The string cast and whitespace strip are chained into the assignment so they
# actually take effect; a bare `.astype(str)` returns a new Series and leaves
# the column untouched. Any answer outside the mapping (including a missing
# value) becomes NaN.
df['representative'] = (
    df['representative']
    .astype(str)
    .str.strip()
    .map({'No': 0, 'Maybe': 1, 'Yes': 1})
)
# df = df.drop_duplicates(subset='id')


In [7]:
# Row count of the survey frame.
n = len(df)
print(f'num of rows = {n}')

num of rows = 37


In [8]:
def replicate_projects(projects, k, frame=None):
    """Add extra team columns for projects that should run with several teams.

    For every project code ``p`` in ``projects`` the existing column ``p + 'A'``
    is copied into ``k - 1`` new columns named ``p + 'B'``, ``p + 'C'``, ...

    Parameters
    ----------
    projects : iterable of str
        Project codes without the team letter (e.g. ``'5cf08'``).
    k : int
        Total number of teams per project, counting the existing 'A' team.
        Values of 1 or less add nothing.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df``, which
        is what the original two-argument call operates on.

    Raises
    ------
    ValueError
        If ``k`` exceeds 26, since team suffixes are the single letters 'A'-'Z'.
    KeyError
        If a ``p + 'A'`` column is missing from the frame.
    """
    target = df if frame is None else frame
    extra_teams = k - 1
    if extra_teams > 25:
        # Fail before touching the frame instead of half-way through the copy.
        raise ValueError(f'k={k} is too large: team suffixes only run from A to Z')

    for p in projects:
        root_proj = p + 'A'
        for i in range(extra_teams):
            # Suffixes are generated (B, C, D, ...) rather than read from a
            # fixed six-letter list, so k is no longer capped at 7.
            target[p + chr(ord('B') + i)] = target[root_proj]
            
# Project codes with the trailing team letter removed (e.g. '5cf08A' -> '5cf08').
uniqueProjects = [name[:-1] for name in df.columns[1:numOfProjects + 1]]
# k=2 adds one extra ('B') column for each of these three projects.
replicate_projects(['5cf08', '9eebc', '462d7'], 2)

In [9]:
# Show the column labels after the replicated team columns were appended.
df.columns

Index(['representative', '5cf08A', 'cc148A', '66322A', '8e19bA', '9eebcA',
       '5633dA', 'bdc42A', '462d7A', 'id', 'friends', '5cf08B', '9eebcB',
       '462d7B'],
      dtype='object')

In [10]:
# Reorder the frame: project/team columns first in sorted order, followed by
# the three non-project columns in a fixed order.
trailing_columns = ['id', 'representative', 'friends']
columns = sorted(col for col in df.columns if col not in trailing_columns) + trailing_columns
df = df[columns]
df.columns

Index(['462d7A', '462d7B', '5633dA', '5cf08A', '5cf08B', '66322A', '8e19bA',
       '9eebcA', '9eebcB', 'bdc42A', 'cc148A', 'id', 'representative',
       'friends'],
      dtype='object')

In [11]:
numOfUniqueProjects = 8
# Every column except the last three ('id', 'representative', 'friends') is a
# project/team column.
numOfReplicatedProjects = df.shape[1] - 3
print(f'# replicated projects = {numOfReplicatedProjects}')
projects = [*df.columns[:numOfReplicatedProjects]]

# replicated projects = 11


In [12]:
# Build the problem inputs from the survey frame:
#   c               - preference score for every (student, project) pair
#   G / G_c         - friendship graph and its complement (the "conflict" graph)
#   w, edges        - unit weight per conflict edge, and the conflict edge list
#   representatives - value of the 'representative' column per student
# Students are keyed by the last five characters of their id.
valid_pref_types = ('Exp', 'Lin', 'LinNorm')
if prefType not in valid_pref_types:
    # Without this check an unknown value matches no branch below and `c`
    # silently stays empty.
    raise ValueError(f'unknown prefType {prefType!r}; expected one of {valid_pref_types}')

c = dict()
G = nx.Graph()
# str() keeps the key derivation identical to the one used inside the loop.
nodes = [str(u)[-5:] for u in df['id']]
assignment = dict()
representatives = dict()
for index, row in df.iterrows():
    # Named student_id rather than `id` so the builtin id() is not shadowed
    # for the rest of the notebook.
    student_id = str(row['id'])[-5:]
    # The last two characters of each cell are dropped before int()
    # (presumably an ordinal suffix such as '1st' -- TODO confirm against the CSV).
    preferences = [int(row[p][:-2]) for p in projects]
    for p, pref in zip(projects, preferences):
        if prefType == 'Exp':
            c[(student_id, p)] = 1 / pref
        elif prefType == 'Lin':
            c[(student_id, p)] = (numOfUniqueProjects - pref + 1)
        else:  # 'LinNorm'
            c[(student_id, p)] = (numOfUniqueProjects - pref + 1) / numOfUniqueProjects

    # friends: the cell is a stringified list; strip the brackets and quotes,
    # split on commas, and link this student to each listed id.
    raw_friends = row['friends']
    if isinstance(raw_friends, str):  # a missing cell is NaN (float): no friends
        for f in raw_friends.strip("[]").replace("'", "").split(","):
            f = f.strip()  # surrounding blanks must not end up in the 5-char key
            if f:
                G.add_edge(student_id, f[-5:])

    # representative
    representatives[student_id] = row['representative']

# Students without any listed friend still need a node before complementing.
G.add_nodes_from(nodes)
G_c = nx.complement(G)
w = {e: 1 for e in G_c.edges}
edges = list(G_c.edges)
len(edges)

648

In [13]:
# Summary of the friendship graph and its complement.
# NOTE(review): the weekday columns were dropped earlier and G_c is only the
# complement of the friendship graph, so no time conflicts appear to be encoded
# here despite the wording of the last message -- confirm.
print(
    f"number of unique students = {len(G.nodes)}",
    f'number of (friends) edges = {len(G.edges)}',
    f'number of conflict edges (include time conflicts) = {len(G_c.edges)}',
    sep="\n",
)

number of unique students = 37
number of (friends) edges = 18
number of conflict edges (include time conflicts) = 648


In [14]:
# Every project/team column gets the same capacity of 5.
max_capacities = dict.fromkeys(projects, 5)

In [15]:
# Serialise the problem instance. The six objects are written as consecutive
# pickle records in one file, so a reader has to call pickle.load() six times
# on the same handle, in this order.
with open(f'./csDS701_{prefType}.pickle', 'wb') as file:
    pickle.dump(projects, file)  # project names
    pickle.dump(max_capacities, file)  # max capacity for each project
    pickle.dump(c, file)  # project preferences
    pickle.dump(w, file)  # weights of conflict edges
    pickle.dump(edges, file)  # conflict edges
    pickle.dump(representatives, file)  # representatives
# No explicit file.close(): leaving the with-block already closes the file.