Done By: **Ryan Yeo**

Class: **DAAA/FT/1B/01**

Admin Num: **P2214452**

In [1]:
import numpy as np
import os

# Cleaning datasets

In [2]:
# Create directory if it exists print err
try:
    os.mkdir('datasets_cleaned')
except OSError as error:
    print(error)  

### Employment

In [3]:
# Clean employment dataset

# Note:
# We cannot use genfromtxt directly because it reads commas contained in strings(unlike csvreader and pd.read_csv) 
# To avoid that, we first read in all the data seperated by a newline before processing it

dirtyData_17to19 = np.genfromtxt('datasets_src/employment/emp_17to19.csv', dtype="U64",delimiter="\n")
dirtyData_19to21 = np.genfromtxt('datasets_src/employment/emp_19to21.csv', dtype="U64",delimiter="\n")

In [4]:
# Convert and clean the dataset so that it can be written to datasets_cleaned
def cleanData(dirtyArr):
    employ_arr = []
    for i in dirtyArr:
        _ = []
        inQuotes = False
        for j,n in enumerate(i):
            if n=='\"':
                # If opening quotes => True elif closing quotes => False
                inQuotes=not inQuotes
            if n==',' and inQuotes:
                # If it's used in a string, change it to a backtick
                # This is for the sole purpose of not causing any error when reading as csv
                # When printing from this column, backticks will be changed back to commas
                if i[j+1]==' ':
                    _.append('`')
                # If it's used in money, (e.g. $3,600) just remove the comma
                else:
                    _.append('')
            else:
                _.append(n)
        employ_arr.append("".join(_))
    employ_arr = np.array(employ_arr)
    return employ_arr


In [5]:
# We can delete one set of 2019 data since we don't need duplicates from both arr
cleaned_17to19 = cleanData(dirtyData_17to19)[:-((len(cleanData(dirtyData_17to19))-1)//3)]

# We can also delete the header for the second arr
cleaned_19to21 = cleanData(dirtyData_19to21)[1:]

cleaned = np.concatenate((cleaned_17to19,cleaned_19to21))

In [6]:
# Now we can write back the data into datasets_cleaned
try:
    os.mkdir('datasets_cleaned/employment')
except OSError as error:
    print(error)  

cleaned.tofile('datasets_cleaned/employment/employ.csv',sep='\n')

However at this point our data is still not in the right format yet

Due to our need to manipulate and remove commas that were not seperators, 
we had to cast each row as a string datatype

When writing to a csv file, it will cause quotation marks to appear for each row
Since we don't want that to affect main.ipynb, we would have to reopen the file the format it

This time since we changed the commas, it would be less of a hassle


In [7]:
f=np.genfromtxt('datasets_cleaned/employment/employ.csv',dtype='U64',delimiter=',')
# Reformat array
f = np.char.replace(f,'\'','')  
f = np.char.replace(f,'\"','')

# We can also remove the '$' and the '%' in the meantime so that we can easily convert into float later
f = np.char.replace(f,'$','')
f = np.char.replace(f,'%','')


# Delete file so that savetxt does not replace chars
os.remove('datasets_cleaned/employment/employ.csv')
# Write it back to csv in the right format now
np.savetxt('datasets_cleaned/employment/employ.csv',f,delimiter=",",fmt='%s')

### Intake
Now we repeat the entire process again for the different intakes (_This includes ITE, poly and Uni_)

In [22]:
# Prior to this, U64 was used and it accidentally cut off data from Poly Intake
dirtyITE = np.genfromtxt('datasets_src/ite_intake/intake-count-of-full-time-and-traineeship-programmes-at-ite.csv', dtype="U128",delimiter="\n")
dirtyPoly = np.genfromtxt('datasets_src/poly_intake/polytechnics-intake-enrolment-and-graduates-by-course.csv', dtype="U128",delimiter="\n")
dirtyUni = np.genfromtxt('datasets_src/uni_intake/universities-intake-enrolment-and-graduates-by-course.csv',dtype="U128",delimiter="\n")

In [24]:
# Since there is no salary data, we are just replacing commas with backticks using cleanData
cleanITE = cleanData(dirtyITE)
cleanPoly = cleanData(dirtyPoly)
cleanUni = cleanData(dirtyUni)

In [25]:
# ITE
try:
    os.mkdir('datasets_cleaned/ite')
except OSError as error:
    print(error)  

cleanITE.tofile('datasets_cleaned/ite/ite_intake.csv',sep='\n')

f=np.genfromtxt('datasets_cleaned/ite/ite_intake.csv',dtype='U64',delimiter=',')
# Reformat array
f = np.char.replace(f,'\'','')  
f = np.char.replace(f,'\"','')

# Delete file so that savetxt does not replace chars
os.remove('datasets_cleaned/ite/ite_intake.csv')
# Write it back to csv in the right format now
np.savetxt('datasets_cleaned/ite/ite_intake.csv',f,delimiter=",",fmt='%s')

[WinError 183] Cannot create a file when that file already exists: 'datasets_cleaned/ite'


In [28]:
# Poly
try:
    os.mkdir('datasets_cleaned/poly')
except OSError as error:
    print(error)  

cleanPoly.tofile('datasets_cleaned/poly/poly_intake.csv',sep='\n')

# Since now each string is one cell of data and not one row, we can go back to using U64
f=np.genfromtxt('datasets_cleaned/poly/poly_intake.csv',dtype='U64',delimiter=',')
# Reformat array
f = np.char.replace(f,'\'','')  
f = np.char.replace(f,'\"','')

# Delete file so that savetxt does not replace chars
os.remove('datasets_cleaned/poly/poly_intake.csv')
# Write it back to csv in the right format now
np.savetxt('datasets_cleaned/poly/poly_intake.csv',f,delimiter=",",fmt='%s')

[WinError 183] Cannot create a file when that file already exists: 'datasets_cleaned/poly'


In [29]:
# Uni
try:
    os.mkdir('datasets_cleaned/uni')
except OSError as error:
    print(error)  

cleanUni.tofile('datasets_cleaned/uni/uni_intake.csv',sep='\n')

f=np.genfromtxt('datasets_cleaned/uni/uni_intake.csv',dtype='U64',delimiter=',')
# Reformat array
f = np.char.replace(f,'\'','')  
f = np.char.replace(f,'\"','')

# Delete file so that savetxt does not replace chars
os.remove('datasets_cleaned/uni/uni_intake.csv')
# Write it back to csv in the right format now
np.savetxt('datasets_cleaned/uni/uni_intake.csv',f,delimiter=",",fmt='%s')

In [31]:
# TODO: Categorise data based on course_clusters
# Combine all three intakes into 1
# For ITE, use Z score to compare between ITE diploma, Higher NITEC etc.


# Impute Missing Data

### Employment

In [10]:
# Read csv file
employArr = np.genfromtxt('datasets_cleaned/employment/employ.csv',dtype='float',delimiter=',',skip_header=1,usecols=(0,2,3,4))
label = np.genfromtxt('datasets_cleaned/employment/employ.csv',dtype='U64',delimiter=',',skip_header=1,usecols=(1))

# Get all missing data
missing = np.argwhere(np.isnan(employArr))
for i in missing:
    print(f"course cluster: {label[i[0]]}, year: {int(employArr[i[0]][0])}, column: {i[1]}")

course cluster: Medicine, year: 2018, column: 1
course cluster: Medicine, year: 2018, column: 2
course cluster: Medicine, year: 2018, column: 3
course cluster: Medicine, year: 2021, column: 1
course cluster: Medicine, year: 2021, column: 2
course cluster: Medicine, year: 2021, column: 3


As seen above, the missing data is for the course cluster "Medicine" and for year 2018 and 2021

Since:
1. The missing data is not randomly distributed (its just for medicine)
2. They are accounted for by other data in our datasets (Medicine data of salary and employment percentages from year 2017, 2019 and 2021 are likely to be similar to 2017,2019 and 2020)

The missing data is Missing At Random

*In the PDF by MOE, it is explained that the missing data is due to insufficient graduates/response rate*

Since the data is MAR, we can either choose to impute or remove the data

In [53]:
# Mean imputation

# Get data for medicine from 2017,2019 and 2021
dataForImpute = np.array([])
for i,j in zip(employArr,label):
    if i[0] in [2017,2019,2020] and j=='Medicine':
        dataForImpute = np.concatenate((dataForImpute,i[1:]))

print(dataForImpute)

ImputeData = np.zeros((3,3))

# Reformat data(by grouping similar cols together) so that np.mean() can be used
for iter in range(3):
    ImputeData[iter] = np.array([n for i,n in enumerate(dataForImpute) if i%3==iter])

# Replace nan values with mean
for i,n in enumerate(ImputeData):
    employArr[missing[0][0]][i+1] = round(n.mean(),2)
    employArr[missing[3][0]][i+1] = round(n.mean(),2)

print(employArr[missing[0][0]])
print(employArr[missing[3][0]])

# If imputation was done correctly, the value of this should be 0
print(len(np.argwhere(np.isnan(employArr))))

[ 100.   100.  5000.    99.6   99.6 5300.   100.   100.  5250. ]
[2018.     99.87   99.87 5183.33]
[2021.     99.87   99.87 5183.33]
0


### Intake