-
Notifications
You must be signed in to change notification settings - Fork 10
/
match-list-a-list-b.py
81 lines (69 loc) · 2.46 KB
/
match-list-a-list-b.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import csv
from fuzzywuzzy import fuzz
# List B: the master list, indexed by the integer id held in the first
# column of every master row.
b_dict = dict()
b_ipeds = set()
# Ids from List B that turn out to have a counterpart in List A.
b_ipeds_used = set()

# master
with open('master-with-sat-debt-and-default.csv', 'rU') as csvfile:
    reader = csv.reader(csvfile)
    # The first line holds the column titles; it is reused for the output header.
    master_header = next(reader)
    for row in reader:
        ipeds_id = int(row[0])
        b_dict[ipeds_id] = row
        b_ipeds.add(ipeds_id)
def _difference_or_flag(a_value, b_value):
    """Compare one numeric column between a List A row and its List B row.

    Returns the absolute difference (as a string) when both values are plain
    digit strings, 'NO DATA' when both are empty, and '' in every other case.
    """
    if b_value.isdigit() and a_value.isdigit():
        return str(abs(int(a_value) - int(b_value)))
    if b_value == '' and a_value == '':
        return 'NO DATA'
    return ''


# Blank stand-in for the master columns of rows that have no match in List B.
# Sized from the master header so the padding always lines up with the
# master columns announced in the output header.
empty_row = [''] * len(master_header)

# vs output-final
# Both files are opened with `with` so the output is flushed and closed even
# if a malformed row raises part-way through.
with open('a-vs-b.csv', 'w') as outfile:
    writer = csv.writer(outfile)
    with open('output-final.csv', 'rU') as csvfile:
        reader = csv.reader(csvfile)
        writer.writerow( ['u_id', 'IPEDS ID', 'Freebase id', 'Name', 'Median SAT', 'Graduation rate', 'Retention rate'] + master_header + ['Name Score', 'Reasonable SAT', 'Difference Graduation', 'Difference Retention'] )
        for row in reader:
            # List A columns, in the order announced by the header above.
            new_row = [ row[0], row[2], row[1], row[4], row[6], row[7], row[8] ]

            # Append the matching master row, or blank padding when the
            # List A row carries no id or an id unknown to the master list.
            if (row[2] != '') and (int(row[2]) in b_ipeds):
                new_row = new_row + b_dict[int(row[2])]
                b_ipeds_used.add(int(row[2]))
            else:
                new_row = new_row + empty_row

            # Comparison columns only make sense for a matched row. The
            # membership test prevents a KeyError for numeric ids that are
            # absent from the master list.
            if row[2].isdigit() and int(row[2]) in b_dict:
                b_row = b_dict[int(row[2])]

                # 'Name Score': fuzzy similarity of the two names.
                ratio = fuzz.token_sort_ratio(row[4], b_row[1])
                new_row = new_row + [ratio]

                # 'Reasonable SAT': is List A's median SAT (row[6]) inside the
                # range built from master columns 9-12?
                # NOTE(review): columns 9/11 and 10/12 are presumably the
                # 25th/75th percentile section scores -- confirm against the
                # master header.
                if b_row[9].isdigit() and row[6]!='':
                    total_25 = (int(b_row[9]) + int(b_row[11]))
                    total_75 = (int(b_row[10]) + int(b_row[12]))
                    # Diagnostic trace on stdout (same values and order as before).
                    print(total_25)
                    print(total_75)
                    print(row[6])
                    print(int(row[6]) >= total_25)
                    print(int(row[6]) <= total_75)
                    if (int(row[6]) >= total_25) and (int(row[6]) <= total_75):
                        new_row = new_row + ['YES']
                        print('YES')
                    else:
                        new_row = new_row + ['NO']
                        print('NO')
                elif b_row[9]=='' and row[6]=='':
                    new_row = new_row + ['NO DATA']
                else:
                    new_row = new_row + ['']

                # 'Difference Graduation': row[7] vs master column 17.
                new_row = new_row + [_difference_or_flag(row[7], b_row[17])]

                # 'Difference Retention': row[8] vs master column 18. The
                # 'NO DATA' case now checks the retention columns themselves
                # (it previously re-tested the graduation columns and could
                # never trigger).
                new_row = new_row + [_difference_or_flag(row[8], b_row[18])]

            writer.writerow(new_row)
# Master rows whose id was never matched by a List A row are written out
# unchanged to 'new-colleges.csv'.
ipeds_left = b_ipeds - b_ipeds_used
# `with` guarantees the file is flushed and closed when the script finishes
# or fails.
with open('new-colleges.csv', 'w') as outfile:
    writer = csv.writer(outfile)
    # Sorted so the output order is reproducible; set iteration order is not.
    for i in sorted(ipeds_left):
        row = b_dict[i]
        writer.writerow(row)