# A very straightforward notebook
### Bringing together a range of useful basic data engineering and data science techniques

Contents:
  Import statement
  

### Broad import statement

In [2]:
# data sourcing:
import csv

# databases:
import sqlite3

#

# Python Basics

In [7]:
# string methods (each call returns a new value; a_string itself is never modified):
# https://docs.python.org/3.7/tutorial/introduction.html#strings
a_string = "some. thing. thingy"
a_word = "thing"
a_substring = 'ing'
a_string.replace(".", ",") # replaces every . with a ,
a_word in a_string # True if a_word occurs anywhere in a_string
a_string.count(a_substring) # how many non-overlapping times a_substring occurs in a_string
a_string.endswith("?") # True if it ends with ?; unlike a_string[-1] this does not raise on an empty string
a_string.strip() # removes leading and trailing whitespace
a_string.capitalize() # first character upper-cased, the rest lower-cased
first_name = "john"
last_name = "doe"
age = 33
"{} {} is {}".format(first_name.capitalize(), last_name.capitalize(), age) # format() converts the int itself
f"string formats string {a_word} and ints {age}" # f-strings convert ints too, so no str() is needed

'string formats string thing and ints 33'

# basic data structures:
- [Lists](https://docs.python.org/3.7/tutorial/introduction.html#lists), called _array_ in other languages
- [More on lists](https://docs.python.org/3.7/tutorial/datastructures.html#more-on-lists)
- [List Comprehensions](https://docs.python.org/3.7/tutorial/datastructures.html#list-comprehensions)
- [Tuples](https://docs.python.org/3.7/tutorial/datastructures.html#tuples-and-sequences)
- [Dictionaries](https://docs.python.org/3.7/tutorial/datastructures.html#dictionaries), called _hash_ or _hashmap_ in other languages
- [Looping Techniques](https://docs.python.org/3.7/tutorial/datastructures.html#looping-techniques) with the `for` keyword

In [8]:
## remember, the following are found in the data-challenges-reboot 01-Python/01-Programming-Basics folder:
    # debugging
    # system parameters i.e: sys.argv[1]
    # environment variables, i.e: os.getenv("FLASK_ENV")

## Data Sourcing / Source data
### Found in 01-Python/02-Data-Sourcing

In [None]:
# reading csv files into python (each row as a list of strings):

print("with csv.reader(file): \n")
# newline='' is what the csv module documentation recommends when opening a file
# for csv.reader, so that newlines embedded in quoted fields are parsed correctly.
with open('../data/phone_book.csv', mode='r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader, None) # skip the header row; the None default avoids StopIteration on an empty file
    for row in csv_reader:
        print(', '.join(row)) # csv.reader already yields every field as a str

In [None]:
# reading csv files into python (each row as a dict keyed by the header fields):

print("with csv.dictreader: \n")
# newline='' is what the csv module documentation recommends when opening a file
# for the csv readers, so that newlines embedded in quoted fields are parsed correctly.
with open('../data/phone_book.csv', mode='r', newline='') as csv_file:
    dict_reader = csv.DictReader(csv_file) # consumes the first line as the field names
    for row in dict_reader:
        print(row.values()) # raw view of the row's values
        print(", ".join(map(str, row.values()))) # the same values as one comma-separated line

In [13]:
# APIs and scraping: see examples in 01-Python/02-Data-Sourcing

# SQL and database basics
## use db:
### https://www.kaggle.com/datasets/hugomathien/soccer
 

### DBeaver:

In [None]:
# install dbeaver with
# sudo add-apt-repository ppa:serge-rider/dbeaver-ce
# sudo apt-get update
# sudo apt-get install dbeaver-ce

# run from the command line with:
# dbeaver-ce

In [None]:
# order of SQL operations matters:

![image.png](https://cdn.sisense.com/wp-content/uploads/image-1-order-blog.png)

In [3]:
# python access to database: open the movies SQLite file and create a cursor for running queries
db_path_movies = '../data/movies.sqlite' # relative path, resolved against the current working directory
# (sqlite3 is already imported in the import cell at the top of the notebook)
conn = sqlite3.connect(db_path_movies)
c = conn.cursor() # the cursor is what executes SQL and hands back result rows

In [33]:
# SQL run the commands: execute a query, fetch every result row, then look at the first one
c.execute("SELECT * FROM directors")
rows = c.fetchall() # all result rows at once, as a list
first_row = rows[0] # raises IndexError if the query returned no rows
first_row[0] # first column of the first row; as the cell's last expression it is shown as the output

'Georges Méliès'

In [4]:
# SQL fetch rows whose columns can be read by name as well as by position:
conn.close() # release the connection opened earlier instead of leaking it when conn is rebound
conn = sqlite3.connect(db_path_movies)
conn.row_factory = sqlite3.Row # cursors created from now on return sqlite3.Row objects
c = conn.cursor()

c.execute("SELECT * FROM directors")
rows = c.fetchall()
first_row = rows[0]

first_row['birth_year'] # table field names: name, birth_year, death_year, imdb_director_id, id
first_row.keys() # column names of the row; as the cell's last expression it is shown as the output

['name', 'birth_year', 'death_year', 'imdb_director_id', 'id']

In [42]:
# a fetched row can be converted into a plain tuple of its column values
tuple(first_row)

('Georges Méliès', 1861, 1938, 'nm0617588', 1)

In [43]:
# ... or into a plain list of its column values
list(first_row)

['Georges Méliès', 1861, 1938, 'nm0617588', 1]

In [47]:
# SQL use fetchone when relevant (here the query filters on a single director id):
c.execute("SELECT * FROM directors WHERE directors.imdb_director_id = 'nm0000186'")
row = c.fetchone() # one row, or None when nothing matches - in which case the print below would fail
print(row[0], '-' ,row[1]) # first two columns of the row, separated by a dash

David Lynch - 1946


In [52]:
# SQL Projection - select columns in the query:
# (shown for reference only: this string is overwritten below before anything is executed)
query = "SELECT d.id, d.name, d.birth_year FROM directors AS d"
# NB - alias: "directors AS d" lets the columns be written as d.<column>

# SQL Selection- select row: the WHERE clause keeps only the rows matching the condition
query = """
SELECT *
FROM directors AS d
WHERE d.name = 'D.W. Griffith'
"""
c.execute(query)
row = c.fetchone() # None if no director has that name - the print below would then fail
print(f'{row[0]} {row[1]} {row[2]} {row[3]} {row[4]}') # the first five columns, space separated

D.W. Griffith 1875 1948 nm0000428 4


In [53]:
# SQL multiple selections: three example queries. Each assignment overwrites the previous
# one and none is executed in this cell, so only the last string is left in `query`.

# variant 1: combine conditions with OR
query ="""SELECT *
FROM directors AS d
WHERE d.name = 'D.W. Griffith'
OR d.name = 'David Lynch'"""

# variant 2: the same two names expressed with IN (...)
query = """SELECT *
FROM directors AS d
WHERE d.name IN ('D.W. Griffith', 'David Lynch')"""

# variant 3: pattern matching with LIKE, where % matches any run of characters
query ="""SELECT *
FROM movies AS m
WHERE UPPER(m.title) LIKE 'THE %'""" # all movies starting with "THE "

In [None]:
# SQL counting: COUNT(...) collapses the matching rows into a single number
# (the query is only defined here, not executed)
q = """SELECT COUNT(directors.id)
FROM directors
WHERE directors.birth_year >= 1980
"""

In [85]:
# SQL sorting: ORDER BY ... DESC sorts from highest to lowest, LIMIT keeps the first 10 rows
# (the query is only defined here, not executed)
q = """SELECT *
FROM directors AS d
ORDER BY d.birth_year DESC
LIMIT 10"""

In [100]:
# first 10 director names in ascending name order (ORDER BY defaults to ascending)
query = """SELECT d.name
            FROM directors AS d
            ORDER BY d.name
            LIMIT 10"""
c.execute(query)
rows = c.fetchall()
first_row = rows[0] # not used further in this cell

print(rows) # prints the whole list of rows in one go

# alternative: print one name per line
# for r in rows:
#     print(f"{r[0]}")

[('A.R. Murugadoss',), ('Aamir Khan',), ('Aanand L. Rai',), ('Aaron Hann',), ('Aaron Harvey',), ('Aaron Horvath',), ('Aaron Lipstadt',), ('Aaron Moorhead',), ('Aaron Schneider',), ('Aaron Seltzer',)]


In [56]:
# switch from the movies database to the soccer database
db_path_soccer = '../data/soccer.sqlite'
conn.close() # close the movies connection before conn is rebound, so it is not leaked
conn = sqlite3.connect(db_path_soccer)
conn.row_factory = sqlite3.Row # rows can be read by column name, e.g. r['country_id']
c = conn.cursor()

In [58]:
# GROUPING:
# Grouping rows on a given column C (aggregating rows with a function where values of C column are the same)

# how many matches played per country:
# ("Match" is quoted in the SQL because MATCH is also an SQL keyword)
query = """SELECT COUNT(matches.id), matches.country_id
FROM "Match" AS matches
GROUP BY matches.country_id"""

c.execute(query)
row = c.fetchone() # only the first group is fetched here, not one row per country
print(f'{row[0]}') # the match count of that first group

1728


In [71]:
# What if we want to sort those results? We need an alias:
# query1 names the aggregate match_count so that ORDER BY can refer to it.
# (query1 is defined for comparison only; it is not executed in this cell)
query1 = """SELECT COUNT(matches.id) AS match_count, matches.country_id
FROM "Match" AS matches
GROUP BY matches.country_id
ORDER BY match_count DESC"""

# query2 additionally filters the groups: HAVING is applied to the aggregated
# value, which a WHERE clause cannot see.
query2 = """SELECT COUNT(matches.id) AS match_count, matches.country_id
FROM "Match" AS matches
GROUP BY matches.country_id
HAVING match_count >= 3000
ORDER BY match_count DESC"""

c.execute(query2)
rows = c.fetchall()

# columns are read by name, which needs conn.row_factory = sqlite3.Row on the connection
for r in rows:
    print(f"Matches: {r['match_count']} Country id: {r['country_id']}")


Matches: 3040 Country id: 21518
Matches: 3040 Country id: 4769
Matches: 3040 Country id: 1729
Matches: 3017 Country id: 10257


In [76]:
# naming outcomes and calculating using CASE & END AS

# note conditional equals in SQL as single =

# How many matches were
    # won by the home team?
    # won by the away team?
    # finished with a draw?

# CASE labels every match with one of three outcomes; the label is aliased as
# `outcome`, the rows are grouped on it and counted per group.
query = """
SELECT
COUNT(matches.id) AS outcome_count,
CASE
    WHEN matches.home_team_goal > matches.away_team_goal
        THEN 'home_win'
    WHEN matches.home_team_goal = matches.away_team_goal
        THEN 'draw'
    ELSE 'away_win'
END AS outcome
FROM "Match" AS matches
GROUP BY outcome
ORDER BY outcome_count DESC"""

c.execute(query)
rows = c.fetchall()
first_row = rows[0]
first_row.keys() # not displayed: only the last expression of a cell is shown

# columns are read by their aliases, which needs conn.row_factory = sqlite3.Row
for r in rows:
    print(f"{r['outcome_count']} {r['outcome']}")


11917 home_win
7466 away_win
6596 draw


In [84]:
# Querying multiple tables:
# JOIN ... ON pairs each league with the country whose id equals the league's country_id.
# Both tables have a `name` column, so each one is given its own alias.

query = """
SELECT League.name as LeagueName, Country.name as CountryName
FROM League
JOIN Country ON League.country_id = Country.id"""

c.execute(query)
rows = c.fetchall()
first_row = rows[0]
first_row.keys() # not displayed: only the last expression of a cell is shown

for r in rows:
    print(f" {r['LeagueName']}") # including CountryName looks like a duplicate

 Belgium Jupiler League
 England Premier League
 France Ligue 1
 Germany 1. Bundesliga
 Italy Serie A
 Netherlands Eredivisie
 Poland Ekstraklasa
 Portugal Liga ZON Sagres
 Scotland Premier League
 Spain LIGA BBVA
 Switzerland Super League


In [None]:
# string selection with SQL (movies db):
# titles containing the word LOVE, excluding any title that contains `exclude`.
# AND binds tighter than OR in SQL, so the three LIKE alternatives are wrapped in
# parentheses; without them the NOT LIKE exclusion applies only to the last alternative.
# The excluded text is written as a single-quoted string literal, since double
# quotes denote identifiers in SQL.
# NOTE: `exclude` is interpolated straight into the SQL text. That is fine for a
# hard-coded constant, but bind it as a query parameter if it ever comes from user input.
exclude = 'cloverfield'
query = f'''SELECT m.title
        FROM movies AS m
        WHERE (UPPER(title) LIKE '% LOVE %'
            OR UPPER(title) LIKE 'LOVE %'
            OR UPPER(title) LIKE 'LOVE,%')
        AND m.title NOT LIKE '%{exclude}%'
        ORDER BY m.title'''

In [None]:
# doing maths on the fly and naming columns:
# the subtraction is computed per joined row and named `age` (an alias written without AS);
# the 5 rows with the smallest age are returned.
# NOTE(review): referring to the alias `age` inside WHERE is not standard SQL - it is
# expected to work in SQLite, but confirm before porting the query to another database.
# (the query is only defined here, not executed)
query = """SELECT d.name,
            (m.start_year - d.birth_year) age
            FROM directors AS d
            JOIN movies AS m ON m.director_id = d.id
            WHERE age IS NOT NULL
            ORDER BY age
            LIMIT 5
            """

In [None]:
# numerical bucketing is simply done this way:
# minutes / 30 gives the index of a 30-minute-wide bucket; + 1 then * 30 turns that index
# into the bucket's upper bound, used as the time_range label (e.g. 45 -> 60, 60 -> 90).
# assumes minutes is stored as an integer so that / is integer division - TODO confirm
# (the query is only defined here, not executed)
query = """SELECT
        (minutes / 30 + 1)*30 time_range,
        COUNT(*)
    FROM movies
    WHERE minutes IS NOT NULL
    GROUP BY time_range"""
    # neat and quick