Done By: **Ryan Yeo**

Class: **DAAA/FT/1B/01**

Admin Num: **P2214452**

In [1]:
import numpy as np
import os

# Cleaning datasets

In [2]:
# Create directory if it exists print err
try:
    os.mkdir('datasets_cleaned')
except OSError as error:
    print(error)  

[WinError 183] Cannot create a file when that file already exists: 'datasets_cleaned'


### Employment

In [3]:
# Clean employment dataset

# Note:
# We cannot use genfromtxt directly because it reads commas contained in strings(unlike csvreader and pd.read_csv) 
# To avoid that, we first read in all the data seperated by a newline before processing it

dirtyData_17to19 = np.genfromtxt('datasets_src/employment/emp_17to19.csv', dtype="U64",delimiter="\n")
dirtyData_19to21 = np.genfromtxt('datasets_src/employment/emp_19to21.csv', dtype="U64",delimiter="\n")

In [4]:
# Convert and clean the dataset so that it can be written to datasets_cleaned
def cleanData(dirtyArr):
    employ_arr = []
    for i in dirtyArr:
        _ = []
        inQuotes = False
        for j,n in enumerate(i):
            if n=='\"':
                # If opening quotes => True elif closing quotes => False
                inQuotes=not inQuotes
            if n==',' and inQuotes:
                # If it's used in a string, change it to a backtick
                # This is for the sole purpose of not causing any error when reading as csv
                # When printing from this column, backticks will be changed back to commas
                if i[j+1]==' ':
                    _.append('`')
                # If it's used in money, (e.g. $3,600) just remove the comma
                else:
                    _.append('')
            else:
                _.append(n)
        employ_arr.append("".join(_))
    employ_arr = np.array(employ_arr)
    return employ_arr


In [5]:
# We can delete one set of 2019 data since we don't need duplicates from both arr
cleaned_17to19 = cleanData(dirtyData_17to19)[:-((len(cleanData(dirtyData_17to19))-1)//3)]

# We can also delete the header for the second arr
cleaned_19to21 = cleanData(dirtyData_19to21)[1:]

cleaned = np.concatenate((cleaned_17to19,cleaned_19to21))

In [6]:
# Now we can write back the data into datasets_cleaned
try:
    os.mkdir('datasets_cleaned/employment')
except OSError as error:
    print(error)  

cleaned.tofile('datasets_cleaned/employment/employ.csv',sep='\n')

[WinError 183] Cannot create a file when that file already exists: 'datasets_cleaned/employment'


In [None]:
# However at this point our data is still not in the right format yet

# Due to our need to manipulate and remove commas that were not seperators, 
# we had to cast each row as a string datatype

# When writing to a csv file, it will cause quotation marks to appear for each row
# Since we don't want that to affect main.ipynb, we would have to reopen the file the format it

# This time since we changed the commas, it would be less of a hassle


In [7]:
f=np.genfromtxt('datasets_cleaned/employment/employ.csv',dtype='U64',delimiter=',')
# Reformat array
f = np.char.replace(f,'\'','')  
f = np.char.replace(f,'\"','')

# We can also remove the '$' and the '%' in the meantime so that we can easily convert into float later
f = np.char.replace(f,'$','')
f = np.char.replace(f,'%','')


# Delete file so that savetxt does not replace chars
os.remove('datasets_cleaned/employment/employ.csv')
# Write it back to csv in the right format now
np.savetxt('datasets_cleaned/employment/employ.csv',f,delimiter=",",fmt='%s')

In [None]:
# Ngl it took me 6 hours to come up with this way of cleaning of data 
# without using pd or csv but it was definitely a fun process

### Intake
_This includes ITE, poly and Uni_

In [10]:
dirtyPoly = np.genfromtxt('datasets_src/poly_intake/polytechnics-intake-enrolment-and-graduates-by-course.csv',dtype='U128',delimiter='\n')

In [11]:
print(dirtyPoly)

['year,sex,course,intake,enrolment,graduates'
 '2005,MF,Applied Arts,1128,2593,550' '2005,F,Applied Arts,687,1538,302'
 '2005,MF,"Architecture, Building & Real Estate",515,1466,425'
 '2005,F,"Architecture, Building & Real Estate",312,870,249'
 '2005,MF,Business & Administration,3483,10143,3044'
 '2005,F,Business & Administration,2389,7038,2270'
 '2005,MF,Education,189,484,111' '2005,F,Education,180,469,111'
 '2005,MF,Engineering Sciences,7826,22462,6536'
 '2005,F,Engineering Sciences,2097,5939,2005'
 '2005,MF,Health Sciences,1639,4962,1016'
 '2005,F,Health Sciences,1326,3971,877'
 '2005,MF,Humanities & Social Sciences,81,83,0'
 '2005,F,Humanities & Social Sciences,71,73,0'
 '2005,MF,Information Technology,4122,11607,3356'
 '2005,F,Information Technology,1887,5065,1464' '2005,MF,Law,126,341,102'
 '2005,F,Law,83,221,71' '2005,MF,Mass Communication,448,1426,419'
 '2005,F,Mass Communication,324,1029,282'
 '2005,MF,"Natural, Physical & Mathematical Sciences",1209,2844,768'
 '2005,F,"Natural