# **Full ETL for Banner.db**
Step-by-step from source files to RDBMS
- Study this for insights into Part 1. 
- Run before starting Part 2. It will replace your `Banner.db` file so everyone is starting from the same starting point. 
- You can rerun this anytime you like. It should recreate everything from scratch. 

## Preliminaries: Extensions, Imports, and Database Connections

In [None]:
%%bash
# Abort at the first failing command. Without this the cell's exit status is
# that of the final `rm`, so a failed clone would go unnoticed -- after the
# existing banner folder had already been deleted.
set -e

# clone the DATA6510 course repo from GitHub
# (--depth 1: only the latest snapshot is needed, not the whole history)
rm -rf DATA6510
git clone --depth 1 https://github.com/christopherhuntley/DATA6510

# keep just the banner data
rm -rf banner
mv DATA6510/data/banner .

# cleanup
rm -rf DATA6510

In [None]:
# Mount Google Drive
from google.colab import drive
from pathlib import Path

drive.mount('/content/drive')

# Look for the course folder inside the mounted drive and warn when it is absent.
course_dir = Path("./drive/My Drive/Colab Notebooks/DATA6510")
if course_dir.exists() is False:
  print(
      '''
      Warning! The folder '/Colab Notebooks/DATA6510' could not be found in the connected Google Drive. 
      Please make 100% sure that both Colab and Chrome are set up use your @student.fairfield.edu account. 
      For now, a new folder with the correct path has been created in whatever Google Drive it found. 
      ''')

# Create the DATA6510/data/FinalProject folder in Google Drive
# (parents=True also creates the course folder itself when it was missing)
data_root = course_dir.joinpath('data', 'FinalProject')
data_root.mkdir(parents=True, exist_ok=True)



In [None]:
%%bash
# Point ./data6510 at the course folder on Google Drive, replacing any link
# left behind by an earlier run.
rm -rf data6510
ln -s "drive/My Drive/Colab Notebooks/DATA6510" data6510

# Remove the previous copy of the database file.
rm -rf data6510/data/FinalProject/Banner.db

In [None]:
# Load %%sql magic
%load_ext sql

# Standard Imports
import sqlite3
import pandas as pd

# Database connection
%sql sqlite:///data6510/data/FinalProject/Banner.db
conn = sqlite3.connect('data6510/data/FinalProject/Banner.db')

## 1. Create Tables from ERD

In [None]:
%%sql

-- Programs table
-- One row per program. ProgramID is the surrogate key that CATALOG_COURSES points to.
DROP TABLE IF EXISTS PROGRAMS;
CREATE TABLE PROGRAMS (
    ProgramID INTEGER PRIMARY KEY,
    ProgramCode TEXT NOT NULL,
    ProgramName TEXT NOT NULL
);

-- Course Catalogs table
-- One row per course per catalog year. Credits is TEXT because the catalog
-- stores it as a phrase such as '1-3 Credits' rather than a number.
-- The (CatalogYear, CatalogNum) index is not declared UNIQUE.
DROP TABLE IF EXISTS CATALOG_COURSES;
CREATE TABLE CATALOG_COURSES (
    CourseID INTEGER PRIMARY KEY,
    CatalogYear TEXT NOT NULL,
    CatalogNum TEXT NOT NULL,
    ProgramID INTEGER,
    CourseTitle TEXT NOT NULL,
    Credits TEXT NOT NULL,
    Prereqs TEXT,
    Coreqs TEXT,
    Fees TEXT,
    Attributes TEXT,
    Description TEXT,
    FOREIGN KEY (ProgramID) REFERENCES PROGRAMS(ProgramID)
);
CREATE INDEX ix_catalog_courses_alt_key on CATALOG_COURSES(CatalogYear,CatalogNum);

-- Instructors table
-- One row per instructor name. The index on Name is not declared UNIQUE.
DROP TABLE IF EXISTS INSTRUCTORS;
CREATE TABLE INSTRUCTORS (
    InstructorID INTEGER PRIMARY KEY,
    Name TEXT NOT NULL
);
CREATE INDEX ix_instructors_name on INSTRUCTORS(Name);

-- Course Offerings table
-- One row per offered section. CourseID and PrimaryInstructorID are nullable,
-- so an offering can exist without a matching catalog entry or instructor row.
-- Credits is REAL here, unlike the TEXT Credits column in CATALOG_COURSES.
DROP TABLE IF EXISTS COURSE_OFFERINGS;
CREATE TABLE COURSE_OFFERINGS (
    CourseOfferingID INTEGER PRIMARY KEY,
    CourseID INTEGER,
    CatalogNum TEXT NOT NULL,
    Term TEXT NOT NULL,
    CRN INTEGER NOT NULL,
    Section TEXT NOT NULL,
    Credits REAL,
    Title TEXT NOT NULL,
    Timecodes TEXT,
    PrimaryInstructorID INTEGER,
    Capacity INTEGER NOT NULL,
    Actual INTEGER NOT NULL,
    Remaining INTEGER NOT NULL,
    FOREIGN KEY (CourseID) REFERENCES CATALOG_COURSES(CourseID),
    FOREIGN KEY (PrimaryInstructorID) REFERENCES INSTRUCTORS(InstructorID)
);
CREATE INDEX ix_course_offerings_alt_key on COURSE_OFFERINGS(Term,CatalogNum,Section);

-- Locations table
-- One row per location code.
DROP TABLE IF EXISTS LOCATIONS;
CREATE TABLE LOCATIONS (
    LocationID INTEGER PRIMARY KEY,
    LocationCode TEXT NOT NULL
);

-- Course Meetings table
-- One row per scheduled meeting of a course offering. Both foreign keys are
-- mandatory, and the start/end timestamps are stored as TEXT.
DROP TABLE IF EXISTS COURSE_MEETINGS;
CREATE TABLE COURSE_MEETINGS (
    CourseMeetingID INTEGER PRIMARY KEY,
    CourseOfferingID INTEGER NOT NULL,
    LocationID INTEGER NOT NULL,
    StartDateTime TEXT NOT NULL,
    EndDateTime TEXT NOT NULL,
    FOREIGN KEY (CourseOfferingID) REFERENCES COURSE_OFFERINGS(CourseOfferingID),
    FOREIGN KEY (LocationID) REFERENCES LOCATIONS(LocationID)
);

-- A conversion table for matching the term to the corresponding catalog year
-- Catalogs are not available for the first several years, so only the terms
-- from Fall2017 through Spring2019 are listed below.
-- No key is declared on this table.
DROP TABLE IF EXISTS TERM_CATALOG_YEAR;
CREATE TABLE TERM_CATALOG_YEAR (
    CatalogYear TEXT NOT NULL,
    Term TEXT NOT NULL
);

INSERT INTO TERM_CATALOG_YEAR (Term, CatalogYear) VALUES 
('Fall2017','2017_2018'),('Winter2018','2017_2018'),('Spring2018','2017_2018'),('Summer2018','2017_2018'),('Fall2018','2018_2019'),
('Winter2019','2018_2019'),('Spring2019','2018_2019');

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
7 rows affected.


[]

## 2. Extract data from CSV files
_The following code uses Python to automate the dirty work that you might do in `sqlite3`._

In [None]:
# Catalog Data
# Each catalog year ships as its own CSV. Every row is tagged with the year it
# came from so that both catalogs can share one staging table.
catalog_years = ['2017_2018','2018_2019']

for i, cat_year in enumerate(catalog_years):
    filepath = 'banner/Catalogs/CourseCatalog'+cat_year+'.csv'
    data = pd.read_csv(filepath)
    data['cat_year'] = cat_year
    # Replace the staging table on the first file and append afterwards, so
    # re-running this cell does not stack duplicate rows on an earlier load.
    data.to_sql('IMPORT_CATALOG_COURSES',conn,
                if_exists='replace' if i == 0 else 'append',index=False)


In [None]:
# Course Offering and Course Meeting Data
# Every term folder holds a courses.csv and a course_meetings.csv; all terms
# are stacked into one staging table per file type.
terms = ['Fall2014','Fall2015','Fall2016','Fall2017','Fall2018',
         'Spring2015','Spring2016','Spring2017','Spring2018','Spring2019',
         'SpringBreak2017',
         'Summer2015','Summer2016','Summer2017','Summer2018',
         'Winter2015','Winter2016','Winter2017','Winter2018']

for i, term in enumerate(terms):
    # Replace each staging table on the first term and append afterwards, so
    # re-running this cell does not stack duplicate rows on an earlier load.
    write_mode = 'replace' if i == 0 else 'append'

    filepath = 'banner/'+term+'/courses.csv'
    data = pd.read_csv(filepath)
    data.to_sql('IMPORT_COURSE_OFFERINGS',conn,if_exists=write_mode,index=False)

    filepath = 'banner/'+term+'/course_meetings.csv'
    data = pd.read_csv(filepath)
    data.to_sql('IMPORT_COURSE_MEETINGS',conn,if_exists=write_mode,index=False)

In [None]:
%%sql
-- Staged catalog courses: total rows vs. rows that are distinct across every column
SELECT
    (SELECT COUNT(*) FROM IMPORT_CATALOG_COURSES) AS RawCount,
    (SELECT COUNT(*)
       FROM (SELECT DISTINCT * FROM IMPORT_CATALOG_COURSES) AS deduped) AS DistinctCount;

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.


RawCount,DistinctCount
4440,4440


In [None]:
%%sql
-- Staged course offerings: total rows vs. rows that are distinct across every column
SELECT
    (SELECT COUNT(*) FROM IMPORT_COURSE_OFFERINGS) AS RawCount,
    (SELECT COUNT(*)
       FROM (SELECT DISTINCT * FROM IMPORT_COURSE_OFFERINGS) AS deduped) AS DistinctCount;

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.


RawCount,DistinctCount
15937,15937


In [None]:
%%sql
-- Staged course meetings: total rows vs. rows that are distinct across every column
SELECT
    (SELECT COUNT(*) FROM IMPORT_COURSE_MEETINGS) AS RawCount,
    (SELECT COUNT(*)
       FROM (SELECT DISTINCT * FROM IMPORT_COURSE_MEETINGS) AS deduped) AS DistinctCount;

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.


RawCount,DistinctCount
284907,284847


__Note: it looks like there are 60 duplicate course meetings in the course_meetings.csv data. We'll have to take care when loading the COURSE_MEETINGS table.__

In [None]:
%%sql
-- Meetings that occur more than once for the same term, CRN, location, day and start time
SELECT term, crn, location, day, start
FROM IMPORT_COURSE_MEETINGS
GROUP BY 1, 2, 3, 4, 5
HAVING COUNT(*) > 1;

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.


term,crn,location,day,start
Fall2014,73073,MCA 102,M,2014-09-08T18:30:00
Fall2014,73073,MCA 102,M,2014-09-15T18:30:00
Fall2014,73073,MCA 102,M,2014-09-22T18:30:00
Fall2014,73073,MCA 102,M,2014-09-29T18:30:00
Fall2014,73073,MCA 102,M,2014-10-06T18:30:00
Fall2014,73073,MCA 102,M,2014-10-20T18:30:00
Fall2014,73073,MCA 102,M,2014-10-27T18:30:00
Fall2014,73073,MCA 102,M,2014-11-03T18:30:00
Fall2014,73073,MCA 102,M,2014-11-10T18:30:00
Fall2014,73073,MCA 102,M,2014-11-17T18:30:00


**After consulting the original `banner.html` files, it appears that the duplicate meeting times are in Banner! Ugh. We'll just filter out the duplicates when populating our tables with data.**

## 3 & 4. Transform and Load Data Into ERD Tables

In [None]:
%%sql
-- Start from an empty table
DELETE FROM INSTRUCTORS;

-- One row per distinct instructor name in the staged course offerings.
-- 'TBA' placeholders and names containing '/' are left out.
INSERT INTO INSTRUCTORS (Name)
SELECT DISTINCT src.primary_instructor
FROM import_course_offerings AS src
WHERE src.primary_instructor != 'TBA'
  AND src.primary_instructor NOT LIKE '%/%';

 * sqlite:///data6510/data/FinalProject/Banner.db
0 rows affected.
1095 rows affected.


[]

In [None]:
%%sql
-- Start from an empty table
DELETE FROM PROGRAMS;

-- One row per distinct (code, name) pair in the staged catalog, inserted in code order
INSERT INTO PROGRAMS (ProgramCode, ProgramName)
SELECT DISTINCT src.program_code, src.program_name
FROM IMPORT_CATALOG_COURSES AS src
ORDER BY src.program_code;

 * sqlite:///data6510/data/FinalProject/Banner.db
0 rows affected.
83 rows affected.


[]

In [None]:
%%sql
-- Start from an empty table
DELETE FROM CATALOG_COURSES;

-- Copy the staged catalog rows, swapping the program code for its ProgramID
INSERT INTO CATALOG_COURSES (
    CatalogYear, ProgramID, CatalogNum, CourseTitle, Credits,
    Prereqs, Coreqs, Fees, Attributes, Description
)
SELECT DISTINCT
    src.cat_year, p.ProgramID, src.catalog_id, src.course_title, src.credits,
    src.prereqs, src.coreqs, src.fees, src.attributes, src.description
FROM IMPORT_CATALOG_COURSES AS src
    JOIN PROGRAMS AS p ON (src.program_code = p.ProgramCode);

 * sqlite:///data6510/data/FinalProject/Banner.db
0 rows affected.
4440 rows affected.


[]

In [None]:
%%sql
-- Start from an empty table
DELETE FROM COURSE_OFFERINGS;

-- Copy the staged offerings, resolving the instructor name to InstructorID and
-- the (term, catalog number) pair to CourseID. Every join is a LEFT JOIN, so an
-- offering is kept even when no instructor or catalog entry matches.
INSERT INTO COURSE_OFFERINGS (
    CourseID, Term, CRN, CatalogNum, Section, Credits,
    Title, Timecodes, PrimaryInstructorID, Capacity, Actual, Remaining
)
SELECT DISTINCT
    cc.CourseID, src.term, src.crn, src.catalog_id, src.section, src.credits,
    src.title, src.timecodes, i.InstructorID, src.cap, src.act, src.rem
FROM import_course_offerings AS src
    LEFT JOIN INSTRUCTORS AS i ON (src.primary_instructor = i.Name)
    LEFT JOIN TERM_CATALOG_YEAR AS tcy USING (Term)
    LEFT JOIN CATALOG_COURSES AS cc
        ON (src.catalog_id = cc.CatalogNum AND cc.CatalogYear = tcy.CatalogYear)
;

 * sqlite:///data6510/data/FinalProject/Banner.db
0 rows affected.
15937 rows affected.


[]

In [None]:
%%sql
-- Empty the table first, as the other load cells do, so that re-running this
-- cell does not insert every location a second time.
DELETE FROM LOCATIONS;

-- One row per distinct location code in the staged course meetings
INSERT INTO LOCATIONS (LocationCode)
SELECT DISTINCT Location
FROM import_course_meetings
ORDER BY Location;

 * sqlite:///data6510/data/FinalProject/Banner.db
207 rows affected.


[]

In [None]:
%%sql
-- Start from an empty table
DELETE FROM COURSE_MEETINGS;

-- this one will take a while, so be patient
-- SELECT DISTINCT filters out the repeated meeting rows present in the staged data.
INSERT INTO COURSE_MEETINGS (CourseOfferingID, LocationID, StartDateTime, EndDateTime)
SELECT DISTINCT o.CourseOfferingID, loc.LocationID, m."Start", m."End"
FROM import_course_meetings AS m
    JOIN COURSE_OFFERINGS AS o USING (Term, CRN)
    LEFT JOIN LOCATIONS AS loc ON (m.Location = loc.LocationCode);

 * sqlite:///data6510/data/FinalProject/Banner.db
284847 rows affected.
Done.


[]

In [None]:
%%sql
-- Peek at ten catalog rows to eyeball the load
SELECT *
FROM CATALOG_COURSES
LIMIT 10;

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.


CourseID,CatalogYear,CatalogNum,ProgramID,CourseTitle,Credits,Prereqs,Coreqs,Fees,Attributes,Description
1,2017_2018,AN 0301,4,Independent Study,1-3 Credits,,,,,Students undertake an individualized program of study in consultation with a director from the Asian studies faculty.
2,2017_2018,AN 0310,4,Asian Studies Seminar,3 Credits,,,,,"This seminar examines selected topics concerning Asia. This course is taught in conjunction with another 100-300 level course from a rotation of course offerings. Consult the Asian Studies director to identify the conjoined course for a given semester. The seminar concentrates on topics within the parameters of the conjoined course syllabus but adds research emphasis. Students registered for this course must complete a research project, to include 300-level research, in addition to the regular research requirements of the conjoined course, and a 25-50 page term paper in substitution of some portion of the conjoined course requirements, as determined by the instructor. Open to juniors and seniors only."
3,2017_2018,BU 0211,12,Legal Environment of Business,3 Credits,Junior standing.,,,,"This course examines the broad philosophical as well as practical nature and function of the legal system, and introduces students to the legal and social responsibilities of business. The course includes an introduction to the legal system, the federal courts, Constitutional law, the United States Supreme Court, the civil process, and regulatory areas such as employment discrimination, protection of the environment, and corporate governance and securities markets."
4,2017_2018,BU 0220,12,Environmental Law and Policy,3 Credits,,,,"EVME Environmental Studies Major Elective, EVPE Environmental Studies Elective, EVSS Environmental Studies: Social Science, MGEL Management: General Elective","This course surveys issues arising out of federal laws designed to protect the environment and manage resources. It considers in detail the role of the Environmental Protection Agency in the enforcement of environmental policies arising out of such laws as the National Environmental Policy Act, the Clean Water Act, and the Clear Air Act, among others. The course also considers the impact of Congress, political parties, bureaucracy, and interest groups in shaping environmental policy, giving special attention to the impact of environmental regulation on business and private property rights."
5,2017_2018,BU 0311,12,"The Law of Contracts, Sales, and Property",3 Credits,BU 0211.,,,,"This course examines the components of common law contracts including the concepts of offer and acceptance, consideration, capacity and legality, assignment of rights and delegation of duties, as well as discharge of contracts. The course covers Articles 2 and 2A of the Uniform Commercial Code relating to leases, sales of goods, and warranties. The course also considers personal and real property, and bailments."
6,2017_2018,BU 0312,12,The Law of Business Organizations and Financial Transactions,3 Credits,BU 0211.,,,,"This course offers an analysis of legal principles related to the law of agency, sole proprietorships, partnerships, corporations, limited liability companies, and other business forms. The second half of the course addresses several sections of the Uniform Commercial Code, such as negotiable instruments, bank collections and deposits and secured transactions. Finally, the course examines the law of suretyship, debtor-creditor relationships, and bankruptcy."
7,2017_2018,BU 0320,12,Employment Law and Discrimination in the Workplace,3 Credits,,,,"MGEL Management: General Elective, UDIV U.S. Diversity","This course examines a variety of legal issues related to the workplace including the doctrine of employment at will, employee privacy, and the history and development of labor unions and the legal protections afforded by the National Labor Relations Act. A study of the role of the Civil Rights Act of 1964 and the Equal Employment Opportunity Commission in eradicating discrimination based on race, sex, religion, national origin, age, and disability occupies a major portion of the course. Other employment issues include affirmative action, worker safety, and compensation."
8,2017_2018,BU 0391,12,Seminar in Business Law and Ethics,3 Credits,"AE 0291, BU 0211, two additional courses in law or applied ethics.",,,,This interdisciplinary study of these two aspects of the business environment is cross-listed as
9,2017_2018,BL 0101,11,Black Lives Matter,3 Credits,,,,"ASGW American Studies: Gateway, BSFC Black Studies Focus Course, BSSS Black Studies: Social and Behavioral Sciences, PJST Peace and Justice Studies, UDIV U.S. Diversity","In the context of Ferguson, Charleston, and other national crises, this course responds to the call of students from our campus community to raise questions about and critically reflect upon the failures of democracy to recognize the value of Black Life. This course employs collective thinking, teaching, and research to focus on questions surrounding race, structural inequality, and violence. It examines the historical, geographical, cultural, social, and political ways in which race has been configured and deployed in the United States. Various faculty will bring to bear their respective scholarly lenses so that students understand race and racism across intellectual disciplines."
10,2017_2018,BL 0398,11,Independent Study,1-3 Credits,,,,BSCP Black Studies Capstone Course,"Upon request and by agreement with a professor in the program, a Black Studies minor may conduct a one-semester independent study on a defined research topic or field of study."


## 5. Integrity Checks

### Domain Integrity
The SQLite data types are pretty limited, so there is not much to see here. A few specific value errors were corrected on import. 

### Entity Integrity

In [None]:
%%sql
-- Expected: 4440 Catalog Entries, 15937 Course Offerings, and 284847 Course Meetings
SELECT
    (SELECT COUNT(*) FROM CATALOG_COURSES)  AS CatalogCourses,
    (SELECT COUNT(*) FROM COURSE_OFFERINGS) AS CourseOfferings,
    (SELECT COUNT(*) FROM COURSE_MEETINGS)  AS CourseMeetings;

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.


CatalogCourses,CourseOfferings,CourseMeetings
4440,15937,284847


### Relational Integrity

In [None]:
%%sql
-- Walk COURSE_OFFERINGS --> INSTRUCTORS, COURSE_OFFERINGS --> CATALOG_COURSES and
-- CATALOG_COURSES --> PROGRAMS for the offerings of one instructor
SELECT CourseID, CatalogYear, Term, CRN, Section,
       o.CatalogNum AS CatNum,
       Title, Capacity, Actual, Remaining,
       substr(Term, -4) AS Year
FROM COURSE_OFFERINGS AS o
    LEFT JOIN INSTRUCTORS AS i ON (o.PrimaryInstructorID = i.InstructorID)
    LEFT JOIN CATALOG_COURSES USING (CourseID)
    LEFT JOIN PROGRAMS USING (ProgramID)
WHERE Name LIKE '%Huntley'
ORDER BY Year, Term DESC, CatNum, Section;

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.


CourseID,CatalogYear,Term,CRN,Section,CatNum,Title,Capacity,Actual,Remaining,Year
,,Fall2014,70369,E,IS 0100,Intro to Information Systems,25,26,-1,2014
,,Fall2014,73060,A,IS 0135,Fundamentals of Web Design,25,26,-1,2014
,,Fall2014,73061,A,IS 0320,Systems Design and Implementation,25,16,9,2014
,,Fall2015,75231,E,IS 0100,Intro to Information Systems,29,28,1,2015
,,Fall2015,75246,F,IS 0100,Intro to Information Systems,29,28,1,2015
,,Fall2015,76388,A,IS 0135,Fundamentals of Web Design,25,21,4,2015
,,Fall2015,76389,A,IS 0320,Systems Design and Implementation,25,13,12,2015
,,Spring2016,38780,01,IS 0585,Contemporary Topics: Information Systems and Data,20,15,5,2016
,,Spring2016,37253,B,OM 0101,Operations Management,29,28,1,2016
,,Spring2016,37254,C,OM 0101,Operations Management,29,29,0,2016


__Note: The Course Catalog data is missing BA 510! The website was updated after the data was scraped in January 2019.__

In [None]:
%%sql
-- COURSE_MEETINGS --> COURSE_OFFERINGS: count the meetings attached to one known offering
SELECT Term, CourseOfferingID, Count(CourseMeetingID)
FROM COURSE_OFFERINGS
    JOIN COURSE_MEETINGS USING (CourseOfferingID)
WHERE Term = 'Spring2019'
  AND CRN = 39006
GROUP BY CourseOfferingID;

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.


Term,CourseOfferingID,Count(CourseMeetingID)
Spring2019,13345,9


In [None]:
%%sql
-- Check the course meetings for a known course offering
-- The term is written as a single-quoted string literal. Double quotes denote
-- identifiers, and SQLite treats "Spring2019" as a string only through a
-- legacy fallback that can be switched off.
SELECT CourseOfferingID, CourseMeetingID, StartDateTime,EndDateTime
FROM COURSE_OFFERINGS JOIN COURSE_MEETINGS USING (CourseOfferingID)
WHERE CRN=39006 AND Term='Spring2019';

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.


CourseOfferingID,CourseMeetingID,StartDateTime,EndDateTime
13345,253416,2019-01-29T18:30:00,2019-01-29T21:30:00
13345,253417,2019-02-05T18:30:00,2019-02-05T21:30:00
13345,253418,2019-02-12T18:30:00,2019-02-12T21:30:00
13345,253419,2019-02-26T18:30:00,2019-02-26T21:30:00
13345,253420,2019-03-05T18:30:00,2019-03-05T21:30:00
13345,253421,2019-03-12T18:30:00,2019-03-12T21:30:00
13345,253422,2019-02-02T09:00:00,2019-02-02T16:00:00
13345,253423,2019-02-16T09:00:00,2019-02-16T16:00:00
13345,253424,2019-03-09T09:00:00,2019-03-09T16:00:00


## 6. Empty out the `IMPORT_` tables to reclaim storage space

In [None]:
%%sql
-- Remove every row from the three staging tables
DELETE FROM import_catalog_courses;
DELETE FROM import_course_offerings;
DELETE FROM import_course_meetings;

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.
15937 rows affected.
284907 rows affected.


[]

In [None]:
%%sql
-- Drop the staging tables.
-- IF EXISTS (as used for the ERD tables) lets this cell be re-run after the
-- tables are already gone instead of failing with "no such table".
DROP TABLE IF EXISTS IMPORT_CATALOG_COURSES;
DROP TABLE IF EXISTS IMPORT_COURSE_OFFERINGS;
DROP TABLE IF EXISTS IMPORT_COURSE_MEETINGS;

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.
Done.
Done.


[]

In [None]:
%%sql
-- Rebuild the database file so the space freed by the staging tables is given back
VACUUM;

 * sqlite:///data6510/data/FinalProject/Banner.db
Done.


[]