Done By: **Ryan Yeo**

Class: **DAAA/FT/1B/01**

Admin Num: **P2214452**

In [1]:
import numpy as np
import os

# Reformatting datasets

In [2]:
# Make the top-level output folder; if os.mkdir fails (e.g. the folder is already there),
# report the error and carry on
try:
    os.mkdir('datasets_cleaned')
except OSError as mkdir_err:
    print(mkdir_err)

[WinError 183] Cannot create a file when that file already exists: 'datasets_cleaned'


### Employment

In [3]:
# Load the raw employment datasets
#
# Note:
# np.genfromtxt cannot split these files on ',' directly: some fields are quoted strings that
# contain commas themselves, and genfromtxt (unlike csv.reader and pd.read_csv) does not honour the quoting.
# To avoid that, every line is first read as ONE string (delimiter="\n") and processed afterwards.
#
# dtype is U128 rather than U64: each element holds a whole CSV line, and a fixed-width string
# dtype cuts off anything longer than its width without warning.

dirtyData_17to19 = np.genfromtxt('datasets_src/employment/emp_17to19.csv', dtype="U128", delimiter="\n")
dirtyData_19to21 = np.genfromtxt('datasets_src/employment/emp_19to21.csv', dtype="U128", delimiter="\n")

In [4]:
# Convert and clean the dataset so that it can be written to datasets_cleaned
def cleanData(dirtyArr):
    """Neutralise the commas that sit inside double-quoted fields of raw CSV lines.

    Every element of ``dirtyArr`` is one complete line of a CSV file. Commas outside
    double quotes are genuine separators and are kept. Commas inside double quotes are
    rewritten so that the line can later be split on ',' safely:

    * a quoted comma followed by a space (text, e.g. "Arts, Design & Media")
      becomes a backtick; it is turned back into a comma when the column is printed
    * any other quoted comma (money, e.g. "$3,600") is removed

    The double-quote characters themselves are left in the line.

    Parameters
    ----------
    dirtyArr : iterable of str
        Raw lines, one string per CSV row.

    Returns
    -------
    numpy.ndarray
        The cleaned lines, in the same order as the input.
    """
    cleaned_lines = []
    for line in dirtyArr:
        chars = []
        inQuotes = False
        for pos, ch in enumerate(line):
            if ch == '\"':
                # An opening quote switches the flag on, the closing quote switches it off
                inQuotes = not inQuotes
            if ch == ',' and inQuotes:
                # Look ahead with a slice instead of line[pos+1]: when the quoted comma is the
                # last character of the line (e.g. a truncated row) plain indexing raises
                # IndexError, whereas the slice is simply '' and the comma is dropped.
                if line[pos + 1:pos + 2] == ' ':
                    chars.append('`')
                # Otherwise the comma belongs to a number and is left out altogether
            else:
                chars.append(ch)
        cleaned_lines.append("".join(chars))
    return np.array(cleaned_lines)


In [5]:
# Clean the 2017-2019 lines once and reuse the result
cleaned_17to19 = cleanData(dirtyData_17to19)

# Both source files contain 2019, so one copy has to go: drop the trailing third of the
# non-header rows of the first file (presumably three equally sized year blocks - confirm against the file).
# An explicit end index is used because a slice of the form [:-0] would return an EMPTY array.
rows_to_drop = (len(cleaned_17to19) - 1) // 3
cleaned_17to19 = cleaned_17to19[:len(cleaned_17to19) - rows_to_drop]

# The second file's header row is not needed again
cleaned_19to21 = cleanData(dirtyData_19to21)[1:]

cleaned = np.concatenate((cleaned_17to19, cleaned_19to21))

In [6]:
# Store the cleaned lines under datasets_cleaned/employment
# (a failing mkdir, e.g. because the folder already exists, is only reported)
try:
    os.mkdir('datasets_cleaned/employment')
except OSError as mkdir_err:
    print(mkdir_err)

cleaned.tofile('datasets_cleaned/employment/employ.csv', sep='\n')

[WinError 183] Cannot create a file when that file already exists: 'datasets_cleaned/employment'


However, at this point our data is still not in the right format

Because we had to manipulate and remove the commas that were not separators, 
we had to cast each row to a string datatype

When such rows are written to a csv file, quotation marks appear around each row
Since we don't want that to affect main.ipynb, we have to reopen the file and reformat it

This time, since the problematic commas have already been changed, it is less of a hassle


In [7]:
# Read the file back, this time split into individual cells
f = np.genfromtxt('datasets_cleaned/employment/employ.csv', delimiter=',', dtype='U64')

# Characters removed from every cell:
#   ' and "  -> quote characters
#   $ and %  -> unit symbols, dropped now so the values convert to float easily later
for unwanted in ('\'', '\"', '$', '%'):
    f = np.char.replace(f, unwanted, '')

# Delete the intermediate file, then write the table back as plain comma-separated text
os.remove('datasets_cleaned/employment/employ.csv')
np.savetxt('datasets_cleaned/employment/employ.csv', f, fmt='%s', delimiter=",")

### Intake
Now we repeat the entire process again for the different intakes (_This includes Poly and Uni_)

In [8]:
# Each element holds a whole line of the file, so the string dtype must be wide enough for a full row.
# U64 was used before and cut off data from the poly intake file, hence U128.
dirtyPoly = np.genfromtxt(
    'datasets_src/poly_intake/polytechnics-intake-enrolment-and-graduates-by-course.csv',
    delimiter="\n",
    dtype="U128",
)
dirtyUni = np.genfromtxt(
    'datasets_src/uni_intake/universities-intake-enrolment-and-graduates-by-course.csv',
    delimiter="\n",
    dtype="U128",
)

In [9]:
# The intake files hold no salary figures, so here cleanData only has text commas to turn into backticks
cleanPoly, cleanUni = (cleanData(raw_lines) for raw_lines in (dirtyPoly, dirtyUni))

In [10]:
# Poly
try:
    os.mkdir('datasets_cleaned/poly')
except OSError as mkdir_err:
    print(mkdir_err)

# This write generates an error if the file is already open elsewhere
cleanPoly.tofile('datasets_cleaned/poly/poly_intake.csv', sep='\n')

# Read back with ',' as the delimiter: every string is now a single cell instead of a whole row,
# so U64 is wide enough again
f = np.genfromtxt('datasets_cleaned/poly/poly_intake.csv', delimiter=',', dtype='U64')

# Strip single and double quote characters from every cell
for quote_char in ('\'', '\"'):
    f = np.char.replace(f, quote_char, '')

# Delete the intermediate file, then write the table back as plain comma-separated text
os.remove('datasets_cleaned/poly/poly_intake.csv')
np.savetxt('datasets_cleaned/poly/poly_intake.csv', f, fmt='%s', delimiter=",")

[WinError 183] Cannot create a file when that file already exists: 'datasets_cleaned/poly'


In [11]:
# Uni
try:
    os.mkdir('datasets_cleaned/uni')
except OSError as mkdir_err:
    print(mkdir_err)

# This write generates an error if the file is already open elsewhere
cleanUni.tofile('datasets_cleaned/uni/uni_intake.csv', sep='\n')

# Read the file back split into individual cells
f = np.genfromtxt('datasets_cleaned/uni/uni_intake.csv', delimiter=',', dtype='U64')

# Strip single and double quote characters from every cell
for quote_char in ('\'', '\"'):
    f = np.char.replace(f, quote_char, '')

# Delete the intermediate file, then write the table back as plain comma-separated text
os.remove('datasets_cleaned/uni/uni_intake.csv')
np.savetxt('datasets_cleaned/uni/uni_intake.csv', f, fmt='%s', delimiter=",")

[WinError 183] Cannot create a file when that file already exists: 'datasets_cleaned/uni'


# Cleaning Datasets

### Employment

In [12]:
# Load the reformatted employment table as strings; no rows are skipped, so the header row is part of the array
employEdit = np.genfromtxt(
    'datasets_cleaned/employment/employ.csv',
    delimiter=',',
    dtype='U64',
)

Since our employment datasets are from MOE and our intake datasets are from [data.gov.sg](https://www.data.gov.sg/group/education), it is best to rename the column names from both datasets so that it is easier for comparison later on

|       Initial Col Name   | New Col Name |
|----------------------|------|
| Engineering | Engineering |
| Architecture | Architecture |
| Business | Business |
| Information & Digital Technologies | IT |
| Medicine | Medicine |
| Arts, Design & Media | Arts |
| Dentistry | Dentistry |
| Built Environment | Architecture |
|Yale-NUS| *Removed*|
|Biomedical Sciences| Health Sciences |
| Pharmacy | Health Sciences |
| Education (NIE) | Education |
| Music | Arts |
| Humanities & Social Sciences | Arts |
| Health Sciences | Health Sciences |
| Sciences | Sciences |
| Law | Law |

In [13]:
# Rename course names (column 1) so that they match the other datasets.
# Names that are not listed keep their current value.
# 'Arts` Design & Media' is spelled with a backtick because that is how the code compares it
# (commas inside quoted text were replaced by backticks during reformatting).
course_renames = {
    'Built Environment': 'Architecture',
    'Arts` Design & Media': 'Arts',
    'Music': 'Arts',
    'Humanities & Social Sciences': 'Arts',
    'Information & Digital Technologies': 'IT',
    'Biomedical Sciences': 'Health Sciences',
    'Pharmacy': 'Health Sciences',
    'Education(NIE)': 'Education',
}
for row in employEdit:
    # `row` is a view into employEdit, so the assignment updates the table in place
    current_name = str(row[1])
    row[1] = course_renames.get(current_name, current_name)

# Remove every row whose course is Yale-NUS.
# One boolean mask replaces the repeated np.delete calls (each of which copied the whole array
# and needed a running counter to correct the shifting indices).
employEdit = employEdit[employEdit[:, 1] != 'Yale-NUS']

print(employEdit)


[['year' 'course_cluster' 'employed' 'ft_employment'
  'gross median salary']
 ['2017' 'Arts' '91.40' '65.60' '2944']
 ['2017' 'Architecture' '92.70' '86.50' '3200']
 ['2017' 'Business' '95.40' '89.50' '3200']
 ['2017' 'Dentistry' '100' '100.00' '4050']
 ['2017' 'Education' '100' '100.00' '3600']
 ['2017' 'Engineering' '86.70' '79.50' '3500']
 ['2017' 'Health Sciences' '96.70' '93.70' '3450']
 ['2017' 'Arts' '85.70' '70.10' '3300']
 ['2017' 'IT' '94.60' '90.10' '4000']
 ['2017' 'Arts' '73.30' '26.70' '2225']
 ['2017' 'Sciences' '82.60' '65.30' '3250']
 ['2017' 'Architecture' '91.30' '86.40' '4000']
 ['2017' 'Health Sciences' '92.00' '80.00' '2950']
 ['2017' 'Law' '96.40' '92.80' '5000']
 ['2017' 'Medicine' '100.00' '100.00' '5000']
 ['2017' 'Health Sciences' '99.10' '94.50' '3600']
 ['2018' 'Arts' '89.10' '68.30' '3000']
 ['2018' 'Architecture' '91.60' '85.90' '3400']
 ['2018' 'Business' '94.40' '89.10' '3400']
 ['2018' 'Dentistry' '100.00' '100.00' '4050']
 ['2018' 'Education' '100.00

In [14]:
# Save the renamed table over the previous employ.csv, one comma-separated row per line
np.savetxt(
    'datasets_cleaned/employment/employ.csv',
    employEdit,
    fmt='%s',
    delimiter=",",
)

Now, we check for missing values

In [15]:
# Read the csv file three times, once per kind of content:
#   employArr    - the numeric columns (0, 2, 3, 4) as floats, header skipped
#   col_header   - the names of those same columns (first row of the file)
#   employCourse - the course column (1) as strings, header skipped
employArr = np.genfromtxt('datasets_cleaned/employment/employ.csv',dtype='float',delimiter=',',skip_header=1,usecols=(0,2,3,4))
col_header = np.genfromtxt('datasets_cleaned/employment/employ.csv',dtype='U64',delimiter=',',usecols=(0,2,3,4))[0]
employCourse = np.genfromtxt('datasets_cleaned/employment/employ.csv',dtype='U64',delimiter=',',skip_header=1,usecols=(1))
# Since we have to use isnan, setting names=True is not an option since the string cannot be set to float
# As such we make our own artificial columns: a dictionary mapping column name -> column index of employArr
employNames = {name: idx for idx, name in enumerate(col_header)}

# (row, column) positions of every missing value
missing = np.argwhere(np.isnan(employArr))
for row_idx, col_idx in missing:
    # col_header is ordered like the columns of employArr, so the column name can be looked up
    # directly; there is no need to invert employNames on every iteration
    print(f"course cluster: {employCourse[row_idx]}, year: {int(employArr[row_idx][0])}, column: {col_header[col_idx]}")

course cluster: Medicine, year: 2018, column: employed
course cluster: Medicine, year: 2018, column: ft_employment
course cluster: Medicine, year: 2018, column: gross median salary
course cluster: Medicine, year: 2021, column: employed
course cluster: Medicine, year: 2021, column: ft_employment
course cluster: Medicine, year: 2021, column: gross median salary


As seen above, the missing data is for the course cluster "Medicine" and for year 2018 and 2021

Since:
1. The missing data is not randomly distributed (its just for medicine)
2. They are accounted for by other data in our datasets (Medicine salary and employment percentages for 2018 and 2021 are likely to be similar to those of 2017, 2019 and 2020)

The missing data is Missing At Random

*In the PDF by MOE, it is explained that the missing data is due to insufficient graduates/response rate*

Since the data is MAR, we can either choose to impute or remove the data

In [16]:
# Mean imputation

# Collect the Medicine values of the reference years 2017, 2019 and 2020
reference_years = [2017, 2019, 2020]
dataForImpute = np.array([])
for row, course in zip(employArr, employCourse):
    if row[employNames["year"]] in reference_years and course == 'Medicine':
        # row[1:] = every column after the year
        dataForImpute = np.concatenate((dataForImpute, row[1:]))

print(dataForImpute)

# dataForImpute is flat (all columns of the first reference row, then of the second, ...).
# Regroup it so that each row of ImputeData holds ONE column's values across the reference years.
n_metrics = employArr.shape[1] - 1
ImputeData = dataForImpute.reshape(-1, n_metrics).T.copy()

# Mean of every column, rounded to 2 decimal places
col_means = [round(values.mean(), 2) for values in ImputeData]

# Replace exactly the cells that were reported as NaN.
# `missing` holds (row, column) pairs of employArr; column 0 is the year, so column c maps to
# col_means[c - 1]. (A missing year is not expected here.)
for row_idx, col_idx in missing:
    employArr[row_idx, col_idx] = col_means[col_idx - 1]

# Show every row that received imputed values
for row_idx in np.unique(missing[:, 0]):
    print(employArr[row_idx])

# If imputation was done correctly, the value of this should be 0
print(len(np.argwhere(np.isnan(employArr))))

[ 100.   100.  5000.    99.6   99.6 5300.   100.   100.  5250. ]
[2018.     99.87   99.87 5183.33]
[2021.     99.87   99.87 5183.33]
0


In [17]:
# Reassemble the full table of strings:
# header row from the file on top, then year | course | remaining numeric columns
tmp = np.genfromtxt('datasets_cleaned/employment/employ.csv', delimiter=',', dtype='U64')[0]
employ_body = np.column_stack((employArr[:, 0], employCourse, employArr[:, 1:]))
employ = np.vstack((tmp, employ_body))
print(employ)

[['year' 'course_cluster' 'employed' 'ft_employment'
  'gross median salary']
 ['2017.0' 'Arts' '91.4' '65.6' '2944.0']
 ['2017.0' 'Architecture' '92.7' '86.5' '3200.0']
 ['2017.0' 'Business' '95.4' '89.5' '3200.0']
 ['2017.0' 'Dentistry' '100.0' '100.0' '4050.0']
 ['2017.0' 'Education' '100.0' '100.0' '3600.0']
 ['2017.0' 'Engineering' '86.7' '79.5' '3500.0']
 ['2017.0' 'Health Sciences' '96.7' '93.7' '3450.0']
 ['2017.0' 'Arts' '85.7' '70.1' '3300.0']
 ['2017.0' 'IT' '94.6' '90.1' '4000.0']
 ['2017.0' 'Arts' '73.3' '26.7' '2225.0']
 ['2017.0' 'Sciences' '82.6' '65.3' '3250.0']
 ['2017.0' 'Architecture' '91.3' '86.4' '4000.0']
 ['2017.0' 'Health Sciences' '92.0' '80.0' '2950.0']
 ['2017.0' 'Law' '96.4' '92.8' '5000.0']
 ['2017.0' 'Medicine' '100.0' '100.0' '5000.0']
 ['2017.0' 'Health Sciences' '99.1' '94.5' '3600.0']
 ['2018.0' 'Arts' '89.1' '68.3' '3000.0']
 ['2018.0' 'Architecture' '91.6' '85.9' '3400.0']
 ['2018.0' 'Business' '94.4' '89.1' '3400.0']
 ['2018.0' 'Dentistry' '100.0' 

In [18]:
# Combine the employment rows of courses that were renamed to the same cluster.
# Within one year, all rows sharing a course name are averaged into the first of them;
# the other rows are overwritten with 'NA' placeholders and removed later.

# Start by getting a unique set of all the courses
unique_courses = np.unique(employCourse)

# Number of data rows we should have at the end (the header row comes on top of that)
print(len(unique_courses)*(2022-2017))

for year in range(2017,2022):

    # course name -> indices (into employ) of this year's rows for that course
    course_dict = {course: [] for course in unique_courses}

    for i,a in enumerate(employ):
        # The course check comes first on purpose: the header and 'NA' rows fail it,
        # so float() is never applied to their non-numeric first cell
        if a[1] in course_dict and float(a[0])==year:
            course_dict[a[1]].append(i)

    for k,v in course_dict.items():
        if len(v)>1:
            print(k,v)
            first = v[0]
            for j,n in enumerate(v[1:], start=1):
                print(employ[n],j)
                # Add this row's three value columns onto the first row of the group
                for col in (2,3,4):
                    employ[first][col] = str(float(employ[first][col])+float(employ[n][col]))

                # Mark the merged row; it is removed later
                employ[n] = np.array(['NA','NA','NA','NA','NA'])

            # Turn the sums into averages: 2 decimals for columns 2 and 3, whole numbers for column 4
            for col,digits in ((2,2),(3,2),(4,0)):
                employ[first][col] = str(round(float(employ[first][col])/len(v),digits))

            for n in v:
                print(employ[n])
    # No manual `year += 1`: the for statement rebinds `year` on every pass, so the increment had no effect

55
Architecture [2, 12]
['2017.0' 'Architecture' '91.3' '86.4' '4000.0'] 1
['2017.0' 'Architecture' '92.0' '86.45' '3600.0']
['NA' 'NA' 'NA' 'NA' 'NA']
Arts [1, 8, 10]
['2017.0' 'Arts' '85.7' '70.1' '3300.0'] 1
['2017.0' 'Arts' '73.3' '26.7' '2225.0'] 2
['2017.0' 'Arts' '83.47' '54.13' '2823.0']
['NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA']
Health Sciences [7, 13, 16]
['2017.0' 'Health Sciences' '92.0' '80.0' '2950.0'] 1
['2017.0' 'Health Sciences' '99.1' '94.5' '3600.0'] 2
['2017.0' 'Health Sciences' '95.93' '89.4' '3333.0']
['NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA']
Architecture [18, 28]
['2018.0' 'Architecture' '96.2' '92.4' '4000.0'] 1
['2018.0' 'Architecture' '93.9' '89.15' '3700.0']
['NA' 'NA' 'NA' 'NA' 'NA']
Arts [17, 24, 26]
['2018.0' 'Arts' '87.3' '72.1' '3400.0'] 1
['2018.0' 'Arts' '81.0' '23.8' '1800.0'] 2
['2018.0' 'Arts' '85.8' '54.73' '2733.0']
['NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA']
Health Sciences [23, 29, 32]
['2018.0' 'Health Sciences'

In [19]:
# Drop the placeholder rows: the mask is True for every row whose first cell is not 'NA'
not_placeholder = employ[:, 0] != 'NA'
employ = employ[not_placeholder]


In [20]:
# Rewrite the first and last columns as integers (e.g. '2017.0' -> '2017'); the header row is skipped
for row in employ[1:]:
    # `row` is a view into employ, so these assignments change the table itself
    row[0] = str(int(float(row[0])))
    row[4] = str(int(float(row[4])))
print(employ)

[['year' 'course_cluster' 'employed' 'ft_employment'
  'gross median salary']
 ['2017' 'Arts' '83.47' '54.13' '2823']
 ['2017' 'Architecture' '92.0' '86.45' '3600']
 ['2017' 'Business' '95.4' '89.5' '3200']
 ['2017' 'Dentistry' '100.0' '100.0' '4050']
 ['2017' 'Education' '100.0' '100.0' '3600']
 ['2017' 'Engineering' '86.7' '79.5' '3500']
 ['2017' 'Health Sciences' '95.93' '89.4' '3333']
 ['2017' 'IT' '94.6' '90.1' '4000']
 ['2017' 'Sciences' '82.6' '65.3' '3250']
 ['2017' 'Law' '96.4' '92.8' '5000']
 ['2017' 'Medicine' '100.0' '100.0' '5000']
 ['2018' 'Arts' '85.8' '54.73' '2733']
 ['2018' 'Architecture' '93.9' '89.15' '3700']
 ['2018' 'Business' '94.4' '89.1' '3400']
 ['2018' 'Dentistry' '100.0' '100.0' '4050']
 ['2018' 'Education' '100.0' '99.4' '3800']
 ['2018' 'Engineering' '89.9' '83.8' '3600']
 ['2018' 'Health Sciences' '94.73' '88.13' '3367']
 ['2018' 'IT' '95.0' '92.0' '4022']
 ['2018' 'Sciences' '84.4' '68.8' '3313']
 ['2018' 'Law' '95.3' '91.8' '5000']
 ['2018' 'Medicine' '

In [21]:
# Final write of the employment table to employ.csv
np.savetxt(
    'datasets_cleaned/employment/employ.csv',
    employ,
    fmt='%s',
    delimiter=",",
)

### Intake

Similarly for intake, we edit the column names to fit column names for Employment dataset

#### *Poly*

|       Initial Col Name   | New Col Name |
|----------------------|------|
| Engineering Sciences | Engineering |
| Architecture, Building & Real Estate | Architecture|
| Business & Administration | Business |
| Information Technology | IT |
| Applied Arts | Arts |
| Mass Communication | Arts |
| Services | Arts |
| Humanities & Social Sciences | Arts |
| Health Sciences | Health Sciences |
| Education | Education |
| Natural & Mathematical Sciences | Sciences |
| Natural, Physical & Mathematical Sciences | Sciences |
| Law | Law |

In [22]:
# Load the poly intake table without its header row and rename the course column (index 2).
# The first matching rule wins, so the order of the checks matters.
polyEdit = np.genfromtxt('datasets_cleaned/poly/poly_intake.csv', delimiter=',', dtype='U64', skip_header=1)
for row in polyEdit:
    course = row[2]
    if 'Engineering' in course:
        row[2] = 'Engineering'
    elif 'Architecture' in course:
        row[2] = 'Architecture'
    elif 'Business' in course:
        row[2] = 'Business'
    elif 'Information Technology' in course:
        row[2] = 'IT'
    elif course in ('Applied Arts', 'Mass Communication', 'Services') or 'Humanities' in course:
        row[2] = 'Arts'
    elif course != 'Health Sciences' and 'Sciences' in course:
        # Any remaining '... Sciences' course except Health Sciences
        row[2] = 'Sciences'

Inside the dataset, the genders are either:
- Both Male and Female or
- Female Only

To make it easier to compare between different genders, we can take the Both Male and Female row minus the Female Row for the same year and course

From 2018 (inclusive) and before, the data is stored as follows:
- MF, course1
- F, course1
- MF, course2
- F, course2

After 2018, 
- MF, course1
- MF, course2
- F, course1
- F, course2

As such, we need two separate ways to do this: one for 2018 and earlier, and another for the years after 2018

In [23]:
# Step 1: turn every 'MF' (both sexes) row into an 'M' row by subtracting the matching 'F' row
# from columns 3, 4 and 5. `a` is a view into polyEdit, so the assignments edit the table in place.
for i,a in enumerate(polyEdit):
    # 2018 and before: the row used for the subtraction is the very next one (i+1)
    if int(a[0])<=2018:
        if a[1]=='MF':
            a[1]='M'
            a[3] = str(int(a[3])-int(polyEdit[i+1][3]))
            a[4] = str(int(a[4])-int(polyEdit[i+1][4]))
            a[5] = str(int(a[5])-int(polyEdit[i+1][5]))
    # 2019 and 2020: the row used for the subtraction is 12 rows further down (i+12)
    else:
        if a[1]=='MF':
            a[1]='M'
            # Since there are 12 different courses, all 12 'MF' rows of a year come before its 'F' rows
            a[3] = str(int(a[3])-int(polyEdit[i+12][3]))
            a[4] = str(int(a[4])-int(polyEdit[i+12][4]))
            a[5] = str(int(a[5])-int(polyEdit[i+12][5]))


# Step 2: reformat data for 2019 and 2020 (to make it similar to 2018 and before)
# NOTE: range(len(polyEdit)) is evaluated once, while polyEdit is rebound to arrays of a
# different length inside the loop, so the exact order of the statements below matters.
# `counter` counts how many 2019 rows have been visited so far.
counter = 0
for i in range(len(polyEdit)):
    if int(polyEdit[i][0])==2019:
        if counter<12:
            # Insert a copy of row i+12+counter directly after row i+counter (the array grows by one row)
            # We have to use vstack instead of concat or append since we are adding a 1d array to a 2d array (and since axis=0)
            polyEdit = np.vstack((polyEdit[:i+1+counter],polyEdit[i+12+counter],polyEdit[i+1+counter:]))
        # remove duplicates from 2019: on the 25th 2019 row visited, delete the 12 rows starting at i
        elif counter==24:
            polyEdit = np.delete(polyEdit,np.s_[i:i+12],0)
        counter+=1

# Same insertion for 2020; here the rows visited after the 12 insertions are only printed
counter = 0
for i in range(len(polyEdit)):
    if int(polyEdit[i][0])==2020:
        if counter<12:
            polyEdit = np.vstack((polyEdit[:i+1+counter],polyEdit[i+12+counter],polyEdit[i+1+counter:]))
        else:
            print(polyEdit[i],counter)
        counter+=1

# since the last 12 rows are duplicates, we remove them
# (presumably this relies on 2020 being the final year in the table - confirm against the data)
polyEdit = np.delete(polyEdit,np.s_[-12:],0)

            

['2020' 'M' 'Arts' '70' '211' '89'] 12
['2020' 'F' 'Arts' '228' '698' '259'] 13
['2020' 'M' 'IT' '1883' '5824' '1808'] 14
['2020' 'F' 'IT' '637' '2173' '802'] 15
['2020' 'M' 'Law' '48' '118' '31'] 16
['2020' 'F' 'Law' '60' '210' '62'] 17
['2020' 'M' 'Arts' '134' '458' '159'] 18
['2020' 'F' 'Arts' '415' '1334' '461'] 19
['2020' 'M' 'Sciences' '440' '1090' '471'] 20
['2020' 'F' 'Sciences' '669' '1947' '730'] 21
['2020' 'M' 'Arts' '416' '1706' '526'] 22
['2020' 'F' 'Arts' '375' '1222' '482'] 23


In [24]:
# Print the table one row per line to inspect the result of the reordering
for row in polyEdit:
    print(row)

['2005' 'M' 'Arts' '441' '1055' '248']
['2005' 'F' 'Arts' '687' '1538' '302']
['2005' 'M' 'Architecture' '203' '596' '176']
['2005' 'F' 'Architecture' '312' '870' '249']
['2005' 'M' 'Business' '1094' '3105' '774']
['2005' 'F' 'Business' '2389' '7038' '2270']
['2005' 'M' 'Education' '9' '15' '0']
['2005' 'F' 'Education' '180' '469' '111']
['2005' 'M' 'Engineering' '5729' '16523' '4531']
['2005' 'F' 'Engineering' '2097' '5939' '2005']
['2005' 'M' 'Health Sciences' '313' '991' '139']
['2005' 'F' 'Health Sciences' '1326' '3971' '877']
['2005' 'M' 'Arts' '10' '10' '0']
['2005' 'F' 'Arts' '71' '73' '0']
['2005' 'M' 'IT' '2235' '6542' '1892']
['2005' 'F' 'IT' '1887' '5065' '1464']
['2005' 'M' 'Law' '43' '120' '31']
['2005' 'F' 'Law' '83' '221' '71']
['2005' 'M' 'Arts' '124' '397' '137']
['2005' 'F' 'Arts' '324' '1029' '282']
['2005' 'M' 'Sciences' '497' '1166' '321']
['2005' 'F' 'Sciences' '712' '1678' '447']
['2005' 'M' 'Arts' '72' '296' '153']
['2005' 'F' 'Arts' '68' '173' '35']
['2006' 'M'

In [25]:
# Expected number of data rows in the merged poly table (the header row comes on top of that):
# (all course clusters except 'Medicine' and 'Dentistry', which poly does not have) x 2 sexes x years 2005..2020
poly_course_count = len(unique_courses) - 2
print(poly_course_count * 2 * (2021 - 2005))

288


In [26]:
# Merge the poly rows that were renamed to the same course.
# Within one year, every group of 'M' rows sharing a course name is summed into the first 'M' row
# of the group, and the row directly below each 'M' row (its 'F' row) is summed the same way.
# Merged rows are overwritten with 'NA' placeholders and removed later.
for year in range(2005,2021):

    # course name -> indices (into polyEdit) of this year's 'M' rows for that course
    course_dict = {course: [] for course in unique_courses}

    for i,a in enumerate(polyEdit):
        # The course check comes first on purpose: 'NA' rows fail it,
        # so float() is never applied to their non-numeric first cell
        if a[2] in course_dict and float(a[0])==year and a[1]=='M':
            course_dict[a[2]].append(i)

    for k,v in course_dict.items():
        if len(v)>1:
            print(k,v)
            first = v[0]
            for j,n in enumerate(v[1:], start=1):
                print(polyEdit[n],j)

                # offset 0 = the male row, offset 1 = the female row directly beneath it
                for offset in (0,1):
                    for col in (3,4,5):
                        polyEdit[first+offset][col] = str(int(polyEdit[first+offset][col])+int(polyEdit[n+offset][col]))

                # Mark both merged rows; they are removed later
                polyEdit[n] = np.array(['NA','NA','NA','NA','NA','NA'])
                polyEdit[n+1] = np.array(['NA','NA','NA','NA','NA','NA'])

            # No need to get average since these are enrollment, intake and grad numbers (not percentages like before)

            for n in v:
                print(polyEdit[n])
                print(polyEdit[n+1])
    # No manual `year += 1`: the for statement rebinds `year` on every pass, so the increment had no effect

Arts [0, 12, 18, 22]
['2005' 'M' 'Arts' '10' '10' '0'] 1
['2005' 'M' 'Arts' '124' '397' '137'] 2
['2005' 'M' 'Arts' '72' '296' '153'] 3
['2005' 'M' 'Arts' '647' '1758' '538']
['2005' 'F' 'Arts' '1150' '2813' '619']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
Arts [24, 36, 42, 46]
['2006' 'M' 'Arts' '9' '17' '0'] 1
['2006' 'M' 'Arts' '140' '409' '120'] 2
['2006' 'M' 'Arts' '115' '321' '116'] 3
['2006' 'M' 'Arts' '822' '2108' '459']
['2006' 'F' 'Arts' '1212' '3260' '665']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
Arts [48, 60, 66, 70]
['2007' 'M' 'Arts' '23' '40' '0'] 1
['2007' 'M' 'Arts' '176' '439' '135'] 2
['2007' 'M' 'Arts' '154' '380' '113'] 3
['2007' 'M' 'Arts' '943' '2439' '546']
['2007' 

In [27]:
# Put the header row (first row of the csv written earlier) back on top of the data
poly_header = np.genfromtxt('datasets_cleaned/poly/poly_intake.csv', delimiter=',', dtype='U64')[0]
polyEdit = np.vstack((poly_header, polyEdit))
# Keep only the rows whose first cell is not the 'NA' placeholder
polyEdit = polyEdit[polyEdit[:, 0] != 'NA']
print(polyEdit)

[['year' 'sex' 'course' 'intake' 'enrolment' 'graduates']
 ['2005' 'M' 'Arts' '647' '1758' '538']
 ['2005' 'F' 'Arts' '1150' '2813' '619']
 ...
 ['2020' 'F' 'Law' '60' '210' '62']
 ['2020' 'M' 'Sciences' '440' '1090' '471']
 ['2020' 'F' 'Sciences' '669' '1947' '730']]


In [28]:
# Save the finished poly intake table over poly_intake.csv
np.savetxt(
    'datasets_cleaned/poly/poly_intake.csv',
    polyEdit,
    fmt='%s',
    delimiter=",",
)

#### *Uni*

Not much explanation to be done since its identical to what was done for the poly datasets

|       Initial Col Name   | New Col Name |
|----------------------|------|
| Engineering Sciences | Engineering |
| Architecture, Building & Real Estate | Architecture|
| Business & Administration | Business |
| Accountancy | Business |
| Information Technology | IT|
| Fine & Applied Arts | Arts|
| Mass Communication | Arts|
| Services | Arts|
| Humanities & Social Sciences | Arts|
| Medicine | Medicine |
| Dentistry | Dentistry |
| Health Sciences | Health Sciences |
| Education | Education |
| Natural & Mathematical Sciences | Sciences |
| Natural, Physical & Mathematical Sciences | Sciences |
| Law | Law |

In [29]:
# Load the uni intake table without its header row and rename the course column (index 2).
# The first matching rule wins, so the order of the checks matters.
uniEdit = np.genfromtxt('datasets_cleaned/uni/uni_intake.csv', delimiter=',', dtype='U64', skip_header=1)
for row in uniEdit:
    course = row[2]
    if 'Engineering' in course:
        row[2] = 'Engineering'
    elif 'Architecture' in course:
        row[2] = 'Architecture'
    elif course == 'Accountancy' or 'Business' in course:
        row[2] = 'Business'
    elif 'Information Technology' in course:
        row[2] = 'IT'
    elif 'Applied Arts' in course or course in ('Mass Communication', 'Services') or 'Humanities' in course:
        row[2] = 'Arts'
    elif course != 'Health Sciences' and 'Sciences' in course:
        # Any remaining '... Sciences' course except Health Sciences
        row[2] = 'Sciences'

In [30]:
# Step 1: turn every 'MF' (both sexes) row into an 'M' row by subtracting the matching 'F' row
# from columns 3, 4 and 5. `a` is a view into uniEdit, so the assignments edit the table in place.
for i,a in enumerate(uniEdit):
    # 2018 and before: the row used for the subtraction is the very next one (i+1)
    if int(a[0])<=2018:
        if a[1]=='MF':
            a[1]='M'
            a[3] = str(int(a[3])-int(uniEdit[i+1][3]))
            a[4] = str(int(a[4])-int(uniEdit[i+1][4]))
            a[5] = str(int(a[5])-int(uniEdit[i+1][5]))
    # After 2018: the row used for the subtraction is 15 rows further down (i+15)
    else:
        if a[1]=='MF':
            a[1]='M'
            # The offset is 15 here (presumably 15 course rows per sex in the uni file - confirm against the data)
            a[3] = str(int(a[3])-int(uniEdit[i+15][3]))
            a[4] = str(int(a[4])-int(uniEdit[i+15][4]))
            a[5] = str(int(a[5])-int(uniEdit[i+15][5]))

# Step 2: reformat data for 2019 and 2020 so the rows of a year alternate like in 2018 and before
# NOTE: range(len(uniEdit)) is evaluated once, while uniEdit is rebound to arrays of a
# different length inside the loop, so the exact order of the statements below matters.
# `counter` counts how many 2019 rows have been visited so far.
counter = 0
for i in range(len(uniEdit)):
    if int(uniEdit[i][0])==2019:
        if counter<15:
            # Insert a copy of row i+15+counter directly after row i+counter (the array grows by one row)
            # We have to use vstack instead of concat or append since we are adding a 1d array to a 2d array (and since axis=0)
            uniEdit = np.vstack((uniEdit[:i+1+counter],uniEdit[i+15+counter],uniEdit[i+1+counter:]))
        # remove duplicates from 2019: on the 31st 2019 row visited, delete the 15 rows starting at i
        elif counter==30:
            uniEdit = np.delete(uniEdit,np.s_[i:i+15],0)
        counter+=1

# Same insertion for 2020; here the rows visited after the 15 insertions are only printed
counter = 0
for i in range(len(uniEdit)):
    if int(uniEdit[i][0])==2020:
        if counter<15:
            uniEdit = np.vstack((uniEdit[:i+1+counter],uniEdit[i+15+counter],uniEdit[i+1+counter:]))
        else:
            print(uniEdit[i],counter)
        counter+=1

# since the last 15 rows are duplicates, we remove them
# (presumably this relies on 2020 being the final year in the table - confirm against the data)
uniEdit = np.delete(uniEdit,np.s_[-15:],0)

['2020' 'F' 'Health Sciences' '761' '2455' '522'] 15
['2020' 'M' 'Arts' '929' '4109' '1022'] 16
['2020' 'F' 'Arts' '2001' '8783' '2125'] 17
['2020' 'M' 'IT' '2017' '6014' '970'] 18
['2020' 'F' 'IT' '846' '2797' '397'] 19
['2020' 'M' 'Law' '204' '916' '190'] 20
['2020' 'F' 'Law' '259' '907' '204'] 21
['2020' 'M' 'Arts' '28' '148' '39'] 22
['2020' 'F' 'Arts' '142' '566' '142'] 23
['2020' 'M' 'Medicine' '253' '1158' '208'] 24
['2020' 'F' 'Medicine' '224' '1020' '178'] 25
['2020' 'M' 'Sciences' '788' '3089' '801'] 26
['2020' 'F' 'Sciences' '891' '3624' '1127'] 27
['2020' 'M' 'Arts' '143' '402' '107'] 28
['2020' 'F' 'Arts' '139' '412' '112'] 29


In [31]:
# Expected number of data rows in the merged uni table (the header row comes on top of that):
# course clusters x 2 sexes x years 2005..2020
uni_course_count = len(unique_courses)
print(uni_course_count * 2 * (2021 - 2005))

352


In [32]:
# Merge the uni rows that were renamed to the same course.
# Within one year, every group of 'M' rows sharing a course name is summed into the first 'M' row
# of the group, and the row directly below each 'M' row (its 'F' row) is summed the same way.
# Merged rows are overwritten with 'NA' placeholders and removed later.
for year in range(2005,2021):

    # course name -> indices (into uniEdit) of this year's 'M' rows for that course
    course_dict = {course: [] for course in unique_courses}

    for i,a in enumerate(uniEdit):
        # The course check comes first on purpose: 'NA' rows fail it,
        # so float() is never applied to their non-numeric first cell
        if a[2] in course_dict and float(a[0])==year and a[1]=='M':
            course_dict[a[2]].append(i)

    for k,v in course_dict.items():
        if len(v)>1:
            print(k,v)
            first = v[0]
            for j,n in enumerate(v[1:], start=1):
                print(uniEdit[n],j)

                # offset 0 = the male row, offset 1 = the female row directly beneath it
                for offset in (0,1):
                    for col in (3,4,5):
                        uniEdit[first+offset][col] = str(int(uniEdit[first+offset][col])+int(uniEdit[n+offset][col]))

                # Mark both merged rows; they are removed later
                uniEdit[n] = np.array(['NA','NA','NA','NA','NA','NA'])
                uniEdit[n+1] = np.array(['NA','NA','NA','NA','NA','NA'])

            # No need to get average since these are enrollment, intake and grad numbers (not percentages like before)

            for n in v:
                print(uniEdit[n])
                print(uniEdit[n+1])
    # No manual `year += 1`: the for statement rebinds `year` on every pass, so the increment had no effect

Arts [12, 16, 22, 28]
['2005' 'M' 'Arts' '589' '1944' '401'] 1
['2005' 'M' 'Arts' '42' '137' '36'] 2
['2005' 'M' 'Arts' '18' '28' '0'] 3
['2005' 'M' 'Arts' '709' '2252' '448']
['2005' 'F' 'Arts' '1915' '5598' '1370']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
Business [0, 4]
['2005' 'M' 'Business' '644' '1915' '358'] 1
['2005' 'M' 'Business' '990' '2744' '569']
['2005' 'F' 'Business' '1431' '4830' '1393']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
Arts [42, 46, 52, 58]
['2006' 'M' 'Arts' '760' '2250' '460'] 1
['2006' 'M' 'Arts' '27' '130' '33'] 2
['2006' 'M' 'Arts' '32' '60' '0'] 3
['2006' 'M' 'Arts' '915' '2663' '504']
['2006' 'F' 'Arts' '2139' '6288' '1419']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
['NA' 'NA' 'NA' 'NA' 'NA' 'NA']
[

In [33]:
# Put the header row (first row of the csv written earlier) back on top of the data
uni_header = np.genfromtxt('datasets_cleaned/uni/uni_intake.csv', delimiter=',', dtype='U64')[0]
uniEdit = np.vstack((uni_header, uniEdit))
# Keep only the rows whose first cell is not the 'NA' placeholder
uniEdit = uniEdit[uniEdit[:, 0] != 'NA']
print(uniEdit)

[['year' 'sex' 'course' 'intake' 'enrolment' 'graduates']
 ['2005' 'M' 'Business' '990' '2744' '569']
 ['2005' 'F' 'Business' '1431' '4830' '1393']
 ...
 ['2020' 'F' 'Medicine' '224' '1020' '178']
 ['2020' 'M' 'Sciences' '788' '3089' '801']
 ['2020' 'F' 'Sciences' '891' '3624' '1127']]


In [34]:
# Save the finished uni intake table over uni_intake.csv
np.savetxt(
    'datasets_cleaned/uni/uni_intake.csv',
    uniEdit,
    fmt='%s',
    delimiter=",",
)