forked from jdaries/de_id
-
Notifications
You must be signed in to change notification settings - Fork 14
/
courses.py
69 lines (58 loc) · 1.81 KB
/
courses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
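#########
# Python 2.7 script to pull the unique courses from a CSV file, both with the
# year kept (written to courses.csv) and with the year stripped (written to
# courses_no_year.csv)
# Usage: python courses.py INFILE
#########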
import pandas as pd
import numpy as np
import sys
# error checking on usage
# Validate the command line and the input file before doing any real work.
# Each failure prints a message and exits with its own status code (1-4).
if len(sys.argv) != 2:
    print("USAGE: courses.py INFILE\n")
    sys.exit(1)

infile = sys.argv[1]
if not infile.endswith(".csv"):
    print(infile + " must be a .csv file\n")
    sys.exit(2)

# open() raises on failure instead of returning a falsy value, so the error
# has to be caught for the "Could not open" path to be reachable.
# IOError works on Python 2.7 and is an alias of OSError on Python 3.
try:
    with open(infile, "r"):
        pass
except IOError:
    print("Could not open " + infile + "\n")
    sys.exit(3)

df = pd.read_csv(infile)
if "course_id" not in df.columns:
    print("Must have column named \"course_id\"\n")
    sys.exit(4)
# strip year information (so CS50 2016 and 2015 are considered the same)
def strip_year(row):
    """Return the row's course id with the trailing year/term segment removed.

    Everything from the last "/" onward is dropped, and then any suffix
    starting at the last "." in the final remaining path segment is dropped
    as well (e.g. "HarvardX/HLS1.1x/2015" -> "HarvardX/HLS1").

    row: mapping-like object (e.g. a DataFrame row) with a string
        'course_id' entry.
    Returns the shortened course id string. An id without any "/" has no
    year segment, so only the "." suffix rule is applied to it.
    """
    course = row['course_id']
    slash = course.rfind("/")
    # rfind returns -1 when there is no "/"; slicing with -1 would silently
    # chop the last character, so only cut when a separator was found.
    if slash != -1:
        course = course[:slash]
    dot = course.rfind(".")
    slash = course.rfind("/")
    # Only strip a "." that lies in the last path segment, not one that
    # belongs to an earlier segment.
    if dot > slash:
        course = course[:dot]
    return course
def keep_year_strip_modules(row):
    """Return the row's course id with the module suffix removed, year kept.

    The id is split at its last "/": the trailing part (including the "/")
    is preserved, while in the leading part any suffix starting at the last
    "." of its final path segment is dropped
    (e.g. "HarvardX/HLS1.1x/2015" -> "HarvardX/HLS1/2015").

    row: mapping-like object (e.g. a DataFrame row) with a string
        'course_id' entry.
    Returns the rewritten course id string. An id without any "/" is treated
    as having no year segment to preserve.
    """
    course = row['course_id']
    slash = course.rfind("/")
    if slash != -1:
        year_part = course[slash:]
        course = course[:slash]
    else:
        # rfind returned -1: slicing with it would treat the last character
        # as the year part, so keep the whole id as the course part instead.
        year_part = ""
    dot = course.rfind(".")
    slash = course.rfind("/")
    # Only strip a "." that lies in the last path segment of the course part.
    if dot > slash:
        course = course[:dot]
    return course + year_part
# now get unique courses
# Empty course_id cells are read by pandas as NaN (a float), which the string
# helpers below cannot process, so missing ids are dropped before deduplicating.
unique_ids = df['course_id'].dropna().unique()
courses = pd.DataFrame({'course_id': unique_ids})

# Collapse ids that differ only by module suffix, keeping the year segment.
courses['course_id_no_module'] = courses.apply(keep_year_strip_modules, axis=1)
courses = pd.DataFrame({'course_id': courses['course_id_no_module'].unique()})
# Written before the no-year column is added so courses.csv has one column.
courses.to_csv('courses.csv', index=False)

# Same list again with the year segment removed, deduplicated once more.
courses['course_id_no_year'] = courses.apply(strip_year, axis=1)
courses_no_year = pd.DataFrame(
    {'course_id_no_year': courses['course_id_no_year'].unique()}
)
courses_no_year.to_csv('courses_no_year.csv', index=False)