<a href="https://colab.research.google.com/github/cobrien87/work/blob/main/MiniLesson2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mini Lesson 2 
We will be working with real data on student scan times and attendance at a school.

*Instructions*  
*Step 1 - DOWNLOAD*:  Download the CSV file:  [scanTimes.csv](https://drive.google.com/file/d/198mq2U5ST2xVpRQkPVGHVfNYaL9vZWb9/view?usp=sharing).  
*Step 2 - UPLOAD*:  Upload the file you just downloaded, scanTimes.csv, to the sample_data folder of Google Colab.

In [None]:
import pandas as pd
import sqlite3

def pd_to_sqlDB(input_df: pd.DataFrame,
                table_name: str,
                db_name: str = 'default.db') -> None:
    """Create a new SQLite table from a dataframe and insert all of its rows.

    Args:
        input_df: Dataframe to upload. Every column becomes a table column
            (declared without a type) and every row becomes a table row.
        table_name: Name of the table to create. It is interpolated into the
            SQL text as-is, so it must be a trusted, valid SQL identifier.
        db_name: Path of the SQLite database file. sqlite3 opens the file if
            it exists and creates it otherwise.

    Raises:
        sqlite3.OperationalError: If the table already exists (for example
            when the same upload is run twice) or the generated SQL is
            rejected by SQLite.
    """
    # Step 1: Setup local logging
    import logging
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    # Step 2: Build the column list and one "?" placeholder per column.
    # Labels are double-quoted (embedded quotes doubled) so that labels with
    # spaces or punctuation are still valid SQL identifiers; str() lets
    # non-string labels (e.g. integer column names) be joined as well.
    quoted_cols = ['"' + str(col).replace('"', '""') + '"'
                   for col in input_df.columns]
    cols_string = ','.join(quoted_cols)
    val_wildcard_string = ','.join(['?'] * len(quoted_cols))

    # Step 3: Connect to a DB file if it exists, else create a new file
    con = sqlite3.connect(db_name)
    try:
        cur = con.cursor()
        logging.info(f'SQL DB {db_name} created')

        # Step 4: Create Table (plain CREATE TABLE: fails if it already exists)
        sql_string = f"""CREATE TABLE {table_name} ({cols_string});"""
        cur.execute(sql_string)
        logging.info(f'SQL Table {table_name} created with {len(quoted_cols)} columns')

        # Step 5: Upload the dataframe, one parameterized INSERT per row
        rows_to_upload = input_df.to_dict(orient='split')['data']
        sql_string = f"""INSERT INTO {table_name} ({cols_string}) VALUES ({val_wildcard_string});"""
        cur.executemany(sql_string, rows_to_upload)
        logging.info(f'{len(rows_to_upload)} rows uploaded to {table_name}')

        # Step 6: Commit the changes
        con.commit()
    finally:
        # Always release the connection, even when a statement above raised
        con.close()


def sql_query_to_pd(sql_query_string: str, db_name: str ='default.db') -> pd.DataFrame:
    """Run one SQL statement against a SQLite file and return its rows as a dataframe.

    Args:
        sql_query_string: The SQL text to execute (a single statement).
        db_name: Path of the SQLite database file to query.

    Returns:
        A dataframe with one row per result row, whose column labels are the
        result-set column names reported by SQLite. A statement that produces
        no result set yields an empty dataframe with no columns.

    Raises:
        sqlite3.Error: If SQLite rejects the statement (syntax error, unknown
            table or column, ...). The connection is still closed.

    Note:
        The connection is closed without calling commit().
    """
    # Step 1: Connect to the SQL DB
    con = sqlite3.connect(db_name)
    try:
        # Step 2: Execute the SQL query
        cursor = con.execute(sql_query_string)

        # Step 3: Fetch the data and column names.
        # cursor.description is None when the statement returns no rows
        # (e.g. CREATE TABLE), so fall back to an empty column list.
        result_data = cursor.fetchall()
        cols = [description[0] for description in cursor.description or []]
    finally:
        # Step 4: Close the connection, even if the query raised
        con.close()

    # Step 5: Return as a dataframe
    return pd.DataFrame(result_data, columns=cols)

In [None]:
#Scan TABLE
# Loads the scan-time CSV into a dataframe and stores it as the Scan table
# of default.db, which every query cell below reads from.
# NOTE: pd_to_sqlDB issues a plain CREATE TABLE, so running this cell again
# while default.db already contains a Scan table raises an error.
# Step 1: Read the csv file into a dataframe
input_df = pd.read_csv('sample_data/scanTimes.csv')
 
# Step 2: Upload the dataframe to a SQL Table
pd_to_sqlDB(input_df,
            table_name='Scan',
            db_name='default.db')

2022-02-28 22:39:26 INFO: SQL DB default.db created
2022-02-28 22:39:26 INFO: SQL Table Scan created with 6 columns
2022-02-28 22:39:26 INFO: 3370 rows uploaded to Scan


# ORDER BY
The ORDER BY keyword is used to sort the result-set in ascending or descending order.

Syntax:
```
SELECT column1, column2, ...
FROM table_name
ORDER BY column1, column2, ... ASC|DESC;
```

More on ORDER BY: [W3 Schools](https://www.w3schools.com/sql/sql_orderby.asp)


In [None]:
#Select all rows from the Scan table and order them by ScanTime, ascending
# NOTE(review): the recorded output lists 10:01 ahead of 9:48, which suggests
# ScanTime is compared as M/D/YYYY H:MM text, i.e. alphabetically rather than
# chronologically -- confirm before relying on this as a time ordering.
sql_query_string = """
    SELECT * 
    FROM Scan
    ORDER BY ScanTime ASC
"""
 
#Execute the SQL query and display the resulting dataframe
result_df = sql_query_to_pd(sql_query_string, db_name='default.db')
result_df

Unnamed: 0,First,Last,StudentID,Grade,ScanTime,Status
0,Cory,Gibes,4232820,12,1/24/2022 10:01,Tardy
1,Adria,Gilliam,3557812,12,1/24/2022 10:12,Tardy
2,Sabine,Kaea,7171441,9,1/24/2022 10:20,Tardy
3,Dottie,Hellickson,8752584,12,1/24/2022 10:21,Tardy
4,Bette,Nicka,8513312,12,1/24/2022 10:47,Tardy
...,...,...,...,...,...,...
3365,Leontine,Stalma,1033112,12,1/28/2022 9:48,Tardy
3366,Basilia,Vinroe,1566027,12,1/28/2022 9:51,Tardy
3367,Frank,Amend,5618461,9,1/28/2022 9:57,Tardy
3368,Adria,Gilliam,3557812,12,1/28/2022 9:59,Tardy


In [None]:
#Select only the rows of the Scan table whose Status is 'Tardy' and order
#them by ScanTime, ascending
# NOTE(review): the recorded output lists 10:01 ahead of 9:48, which suggests
# ScanTime is compared as M/D/YYYY H:MM text, i.e. alphabetically rather than
# chronologically -- confirm before relying on this as a time ordering.
sql_query_string = """
    SELECT * FROM Scan 
    WHERE Status = 'Tardy'
    ORDER BY ScanTime ASC
"""
 
#Execute the SQL query and display the resulting dataframe
result_df = sql_query_to_pd(sql_query_string, db_name='default.db')
result_df

Unnamed: 0,First,Last,StudentID,Grade,ScanTime,Status
0,Cory,Gibes,4232820,12,1/24/2022 10:01,Tardy
1,Adria,Gilliam,3557812,12,1/24/2022 10:12,Tardy
2,Sabine,Kaea,7171441,9,1/24/2022 10:20,Tardy
3,Dottie,Hellickson,8752584,12,1/24/2022 10:21,Tardy
4,Bette,Nicka,8513312,12,1/24/2022 10:47,Tardy
...,...,...,...,...,...,...
588,Leontine,Stalma,1033112,12,1/28/2022 9:48,Tardy
589,Basilia,Vinroe,1566027,12,1/28/2022 9:51,Tardy
590,Frank,Amend,5618461,9,1/28/2022 9:57,Tardy
591,Adria,Gilliam,3557812,12,1/28/2022 9:59,Tardy


# COUNT(), AVG(), SUM()
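These functions summarize a column: COUNT() returns the number of rows that match the condition (it works on any column, including text columns such as Status), while AVG() and SUM() return the average and the total of a numerical column.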

COUNT Syntax:
```
SELECT COUNT(column_name)
FROM table_name
WHERE condition;
```
AVG Syntax:
```
SELECT AVG(column_name)
FROM table_name
WHERE condition;
```
SUM Syntax:
```
SELECT SUM(column_name)
FROM table_name
WHERE condition;
```
More on this: [W3 Schools](https://www.w3schools.com/sql/sql_count_avg_sum.asp)

In [None]:
#Finding the total number of on times in our Scan table:
#count the rows whose Status is exactly 'Present on time'
sql_query_string = """
    SELECT COUNT(Status)
    FROM Scan
    WHERE Status = 'Present on time'
"""
 
#Execute the SQL query and display the one-row result
result_df = sql_query_to_pd(sql_query_string, db_name='default.db')
result_df

Unnamed: 0,COUNT(Status)
0,2777


In [None]:
#TODO: Find the total number of tardies in our Scan table 
#Count the rows whose Status is exactly 'Tardy'
sql_query_string = """
    SELECT COUNT(Status)
    FROM Scan
    WHERE Status = 'Tardy'
"""
 
#Execute the SQL query and display the one-row result
result_df = sql_query_to_pd(sql_query_string, db_name='default.db')
result_df

Unnamed: 0,COUNT(Status)
0,593


In [None]:
#TODO: Find the total number of 11th graders in our Scan table
#Count only the rows whose Grade is 11.
#NOTE: this counts scan rows, not distinct students -- the same student
#appears once for every scan recorded in the table.
sql_query_string = """
    SELECT COUNT(Grade)
    FROM Scan
    WHERE Grade = 11
"""

#Execute the SQL query and display the one-row result
result_df = sql_query_to_pd(sql_query_string, db_name='default.db')
result_df

Unnamed: 0,COUNT(Grade)
0,1638


In [None]:
#TODO: Find the total number of 11th graders in our Scan table whose last name starts with M
#Both conditions must hold, so they are combined with AND.
#LIKE 'M%' matches any Last value that begins with M ('%' stands for any
#run of characters).
sql_query_string = """
    SELECT COUNT(Last)
    FROM Scan
    WHERE Grade = 11 AND Last LIKE 'M%'
"""

#Execute the SQL query and display the one-row result
result_df = sql_query_to_pd(sql_query_string, db_name='default.db')
result_df

# GROUP BY
The GROUP BY statement groups rows that have the same values into summary rows, like "find the number of customers in each country".

The GROUP BY statement is often used with aggregate functions (COUNT(), MAX(), MIN(), SUM(), AVG()) to group the result-set by one or more columns.

Syntax:
```
SELECT column_name(s)
FROM table_name
WHERE condition
GROUP BY column_name(s)
ORDER BY column_name(s);
```

More on GROUP BY: [W3 Schools](https://www.w3schools.com/sql/sql_groupby.asp)

In [None]:
#Organizing our Scan table into groups based on status
#By selecting the Status column in addition to 
#COUNT(Status), it lets us know which Group is related to
#which count (one result row per distinct Status value)
sql_query_string = """
    SELECT COUNT(Status), Status
    FROM Scan
    GROUP BY Status
"""
 
#Execute the SQL query and display the resulting dataframe
result_df = sql_query_to_pd(sql_query_string, db_name='default.db')
result_df

Unnamed: 0,COUNT(Status),Status
0,2777,Present on time
1,593,Tardy


In [None]:
#TODO: Select the records/rows from Scan table where students are Tardy and GROUP BY Grade
#WHERE keeps only the Tardy rows before grouping, so each result row is the
#number of tardy scans for one Grade
sql_query_string = """
    SELECT COUNT(Grade), Grade
    FROM Scan
    WHERE Status = 'Tardy'
    GROUP BY Grade
"""
 
#Execute the SQL query and display the resulting dataframe
result_df = sql_query_to_pd(sql_query_string, db_name='default.db')
result_df

Unnamed: 0,COUNT(Grade),Grade
0,99,9
1,123,10
2,138,11
3,233,12
