In [61]:
import argparse

parser = argparse.ArgumentParser(description='Sort freshmen by house.')
parser.add_argument('file_input', metavar='filename.csv', nargs='?',
                    help='csv file as input')
parser.add_argument('file_output', metavar='filename2.csv', nargs='?',
                    help='csv file for output')
parser.add_argument('house_names', metavar='houses.txt', nargs='?',
                    help='line by line input for house names')
parser.add_argument('-g', metavar='min max', type=float, nargs=2,
                    default=[0.45,0.55],
                    help='acceptable gender coefficient')
parser.add_argument('-s', metavar='distribution niche', type=float, nargs=2,
                    default=[0.2,0.8],
                    help='acceptable school coefficient for distributing students' +
                    'from the same school and congregating students from niche schools')
parser.add_argument('-m', metavar='distribution niche', type=float, nargs=2,
                    default=[0.2,1],
                    help='acceptable major coefficient for distributing students' +
                    'from the same major and congregating students from niche major')
args = parser.parse_args(["test.csv", "test2.csv","houses.txt"])


In [62]:
houses = []
with open(args.house_names) as f:
    for line in f:
        houses.append(line.strip())
f.close()
print(houses)

assert len(houses) >= 4

['Ursaia', 'Nocturna', 'Ianthe', 'Triton', 'Ankaa', 'Saren']


In [63]:
import pandas
# Read CSV

df = pandas.read_csv(args.file_input)
df['Hash'] = [hash(i) for i in df['Name']]
df = df.sort_values(by='Hash')
df

Unnamed: 0,Name,Gender,School,Faculty,Major,Hash
8,Illidian Ip,Male,Y Junior College,FoS,Pharmacy,-8536546584642155011
1,Betty Bee,Female,X Junior College,FASS,History,-6312769914841447406
7,Herbert Hu,Male,X Junior College,FoE,Mechanical Engineering,-1920888423604945108
9,Jannsen Johnson,Male,W Polytechnic,FASS,Literature,-958881578606915962
3,Dolly Deng,Female,X Junior College,SoC,Computer Science,926948616573148500
5,Fabian Foo,Male,Z Junior College,Law,Law,1514244043207284119
10,Ketisha Kelu,Female,V Junior College,SDE,Architecture,3514580074958986449
6,Gillian Goh,Female,W Polytechnic,FoE,Chemical Engineering,6542032509206699316
11,Lambert Lu,Male,V Junior College,SoC,Information Systems,7690880726310297297
12,Muhammad Mohsin,Male,X Junior College,SoC,Computer Science,8285112770872632041


In [64]:
import collections

    schools = collections.Counter(df['School'])
    maximum = max(schools, key=schools.get)
    school_coeffs = {key:(schools[key]-2)/(schools[maximum]-2)*(args.s[1]-args.s[0])+args.s[0] for key in schools}
    print(school_coeffs)

    majors = collections.Counter(df['Major'])
    maximum = max(majors, key=majors.get)
    major_coeffs = {key:(majors[key]-2)/(majors[maximum]-2)*(args.m[1]-args.m[0])+args.m[0] for key in majors}
    print(major_coeffs)

{'Y Junior College': 0.2, 'X Junior College': 0.8, 'W Polytechnic': 0.2, 'Z Junior College': 0.2, 'V Junior College': 0.2}
{'Pharmacy': -0.6000000000000001, 'History': 0.2, 'Mechanical Engineering': -0.6000000000000001, 'Literature': -0.6000000000000001, 'Computer Science': 1.0, 'Law': -0.6000000000000001, 'Architecture': -0.6000000000000001, 'Chemical Engineering': -0.6000000000000001, 'Information Systems': -0.6000000000000001, 'Chemistry': -0.6000000000000001}


In [65]:
def dir(x):
    return min(x,1-x)
def mag(x):
    return max(x-0.5,0.5-x)

In [66]:
# Populate adjacency list

n = len(df['Hash'])
m = len(houses)

adjList = []
for i in df.itertuples():
    row = {}
    for j in df.itertuples():
        magnitude = (mag(school_coeffs[i[3]]) if i[3] == j[3] else 0) + (mag(major_coeffs[i[5]]) if i[5] == j[5] else 0)
        direction = (dir(school_coeffs[i[3]]) if i[3] == j[3] else 0) + (dir(major_coeffs[i[5]]) if i[5] == j[5] else 0)
        #print(magnitude,direction)
        row[j[6]] = 0.5 + magnitude*direction*(2*n)
    adjList.append(row)
    
print(adjList)

[{-8536546584642155011: -14.060000000000004, -6312769914841447406: 0.5, -1920888423604945108: 0.5, -958881578606915962: 0.5, 926948616573148500: 0.5, 1514244043207284119: 0.5, 3514580074958986449: 0.5, 6542032509206699316: 0.5, 7690880726310297297: 0.5, 8285112770872632041: 0.5, 8331591614551658076: 0.5, 8702395053916699804: 2.06, 9125360881890315998: 0.5}, {-8536546584642155011: 0.5, -6312769914841447406: 6.74, -1920888423604945108: 2.06, -958881578606915962: 0.5, 926948616573148500: 2.06, 1514244043207284119: 0.5, 3514580074958986449: 0.5, 6542032509206699316: 0.5, 7690880726310297297: 0.5, 8285112770872632041: 2.06, 8331591614551658076: 6.74, 8702395053916699804: 0.5, 9125360881890315998: 0.5}, {-8536546584642155011: 0.5, -6312769914841447406: 2.06, -1920888423604945108: -14.060000000000008, -958881578606915962: 0.5, 926948616573148500: 2.06, 1514244043207284119: 0.5, 3514580074958986449: 0.5, 6542032509206699316: 0.5, 7690880726310297297: 0.5, 8285112770872632041: 2.06, 83315916145

In [67]:
# Colouring Algorithm 1

hashes = [x for x in df['Hash']]
colours = [-1] * n
           
colours[0] = 0
for i in range(1,n):
    count = {j:colours.count(j) for j in range(0,6)}
    count_sorted = [x[0] for x in sorted(count.items(), key=lambda count: count[1]+count[0]/m)]
    temp = [0] * m
    for index, j in enumerate(count_sorted):
        temp[j] += (m - index - 1)
    for j in range(0,i):
        temp[colours[j]] += adjList[i][hashes[j]]
    for x in temp:
        if x in count and count[x]!=0:
            x = x/count[x]
    colours[i] = sorted(enumerate(temp), key=lambda x: x[1], reverse=True)[0][0]
    
df['Houses'] = [houses[x] for x in colours]
df = df.sort_index()
df

Unnamed: 0,Name,Gender,School,Faculty,Major,Hash,Colours
0,Aaron Ang,Male,X Junior College,FASS,History,8331591614551658076,Nocturna
1,Betty Bee,Female,X Junior College,FASS,History,-6312769914841447406,Nocturna
2,Charlie Chan,Male,Y Junior College,SoC,Computer Science,8702395053916699804,Ursaia
3,Dolly Deng,Female,X Junior College,SoC,Computer Science,926948616573148500,Ankaa
4,Evangeline Ee,Female,Z Junior College,FoS,Chemistry,9125360881890315998,Saren
5,Fabian Foo,Male,Z Junior College,Law,Law,1514244043207284119,Saren
6,Gillian Goh,Female,W Polytechnic,FoE,Chemical Engineering,6542032509206699316,Nocturna
7,Herbert Hu,Male,X Junior College,FoE,Mechanical Engineering,-1920888423604945108,Ianthe
8,Illidian Ip,Male,Y Junior College,FoS,Pharmacy,-8536546584642155011,Ursaia
9,Jannsen Johnson,Male,W Polytechnic,FASS,Literature,-958881578606915962,Triton


In [71]:
# Output
df = df.drop(columns=['Hash'])
df.to_csv(args.file_output)