This repository has been archived by the owner on Nov 6, 2021. It is now read-only.
/
scrapper.sh
63 lines (61 loc) · 3.2 KB
/
scrapper.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#!/bin/bash
# This script scrapes the NUS IVLE site for each module's code, title, prerequisites and preclusions.
# Preconditions:
# 200MB free space in directory
# python installed and path configured
# Unix system with dos2unix, awk, sed and normal unix utilities present
# script.py in same directory
# Require a list of Modules in a txt file in same directory named modulelist.txt
#
# The script will create a .tsv file (tilde-delimited values) containing the module code, title, prerequisites and preclusions, in that order.
# --- Prepare inputs, download the module pages and flatten them -------------
# Convert modulelist.txt to Unix line endings (dos2unix rewrites it in place).
dos2unix modulelist.txt
# Column 1 of every line is the module code.
awk '{ print $1 }' ./modulelist.txt > modulecode.txt
# Columns 2..NF are the module name; every word is followed by one space.
# An explicit "%s" format is used so that a '%' or backslash inside a title
# is printed literally instead of being interpreted as a printf directive.
awk '{ for (i = 2; i <= NF; i++) printf "%s ", $i; printf "\n" }' ./modulelist.txt > modulename2.txt
# All processing happens inside ./temp; -p lets a re-run reuse the directory.
mkdir -p ./temp || exit 1
cp ./script.py ./temp/script.py || exit 1
# Abort if ./temp cannot be entered: every command below deletes or rewrites
# files relative to the current directory, so running them anywhere else
# would damage the caller's files.
cd ./temp || exit 1
# Per the original comment, script.py wgets the module list into this directory.
./script.py
rm script.py
# Strip HTML tags from every file below ./temp (in place).
# -exec ... {} + batches files like xargs but runs nothing when no file exists.
find ./ -type f -exec sed -i 's/<[^>]*>//g' {} +
# Join all lines of each file into a single line (remove every line break).
find ./ -type f -exec sed -i -e :a -e '$!N;s/\n//;ta' {} +
# Concatenate every file into txt; the glob is expanded before txt is created.
cat * > txt
# One working copy per field that is parsed afterwards.
cp txt name
cp txt prereq
cp txt preclude
# --- Prerequisites: ./prereq -> ./finalprereq --------------------------------
# Step 1: on lines containing "Prereq", blank the first field so the rebuilt
# line starts at the first "Prereq"; lines without it are copied unchanged.
awk 'BEGIN { FS = "Prereq"; OFS = FS } NF > 1 { $1 = "" } { print }' prereq > prereq2
# Step 2 (single in-place pass over prereq2):
#   - empty every line that begins with a space (presumably the lines on
#     which no "Prereq" marker was found),
#   - cut everything from "Preclu" to the end of the line,
#   - cut everything from "Cross-list" to the end of the line.
sed -i -e 's/^ .*//' -e 's/Preclu.*//' -e 's/Cross-list.*//' prereq2
# Step 3: strip trailing spaces and tabs; this overwrites the working copy.
awk '{ sub(/[ \t]+$/, "") } 1' prereq2 > prereq
# Step 4: remove a leading "Prerequisites" heading.
awk '{ sub(/^Prerequisites/, "") } 1' prereq > finalprereq
# --- Module title: ./name -> ./finalname -------------------------------------
# Step 1: on lines containing "Module Title", blank the first field so the
# rebuilt line starts at the first "Module Title"; other lines are unchanged.
awk 'BEGIN { FS = "Module Title"; OFS = FS } NF > 1 { $1 = "" } { print }' name > name2
# Step 2 (single in-place pass over name2):
#   - empty every line that begins with a space (presumably the lines on
#     which no "Module Title" marker was found),
#   - cut everything from "Module Cre" to the end of the line,
#   - cut everything from "Prereq" to the end of the line.
sed -i -e 's/^ .*//' -e 's/Module Cre.*//' -e 's/Prereq.*//' name2
# Step 3: strip trailing spaces and tabs; this overwrites the working copy.
awk '{ sub(/[ \t]+$/, "") } 1' name2 > name
# Step 4: remove the leading "Module Title" heading.
awk '{ sub(/^Module Title/, "") } 1' name > finalname
# --- Preclusions: ./preclude -> ./finalpreclude ------------------------------
# Step 1: on lines containing "Preclusions", blank the first field so the
# rebuilt line starts at the first "Preclusions"; other lines are unchanged.
awk 'BEGIN { FS = "Preclusions"; OFS = FS } NF > 1 { $1 = "" } { print }' preclude > preclude2
# Step 2 (single in-place pass over preclude2):
#   - empty every line that begins with a space (presumably the lines on
#     which no "Preclusions" marker was found),
#   - cut everything from "Cross-list" to the end of the line,
#   - cut everything from "Workload Comp" to the end of the line.
sed -i -e 's/^ .*//' -e 's/Cross-list.*//' -e 's/Workload Comp.*//' preclude2
# Step 3: strip trailing spaces and tabs; this overwrites the working copy.
awk '{ sub(/[ \t]+$/, "") } 1' preclude2 > preclude
# Step 4: remove the leading "Preclusions" heading.
awk '{ sub(/^Preclusions/, "") } 1' preclude > finalpreclude
# --- Assemble the tilde-delimited table: code ~ title ~ prereq ~ preclude ----
# No visible step of this script creates finalcode or finaltitle. When they
# are absent, fall back to the files the script does write: the module codes
# extracted from modulelist.txt and the titles parsed into finalname.
code_col=finalcode
[ -f "$code_col" ] || code_col=../modulecode.txt
title_col=finaltitle
[ -f "$title_col" ] || title_col=finalname
# NOTE(review): paste joins rows purely by line number. This assumes the rows
# of the code column are in the same order as the concatenated pages the
# other columns were parsed from -- confirm against script.py.
paste -d '~' "$code_col" "$title_col" finalprereq finalpreclude > ../final.tsv
# Leave the scratch directory before deleting it; stop if that fails so the
# rm below cannot act on an unexpected location.
cd .. || exit 1
rm -rf ./temp
# For GAE bulkloader you need a delimiter in front of the first field as
# well, so prefix every row with "~". (The original trailing note used a
# C-style "//" marker, which the shell handed to awk as extra input files.)
awk '{ print "~" $0 }' final.tsv > last.tsv