In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

## Browsing directories

In [None]:
# Listing a directory with `os.listdir`
# (returns the names of all entries - files and folders alike - as a list of strings)
import os

files = os.listdir('.')
files

In [None]:
# Listing a directory with `os.scandir`
# (yields `os.DirEntry` objects that carry the name, path and file-type information)
import os

# The `with` block closes the scandir iterator and frees its OS resources once we are done
with os.scandir('.') as files:
    entry_names = [file.name for file in files]
entry_names

In [None]:
# Listing all files that end with .html
import os
import re

# `re.fullmatch` requires the *whole* name to match, so names such as `page.html.bak`
# are rejected; the raw string keeps `\.` a regex escape, and `and` short-circuits
# (unlike the bitwise `&`)
with os.scandir('.') as files:
    html_files = [
        file.name
        for file in files
        if file.is_file() and re.fullmatch(r'.*\.html', file.name) is not None
    ]
html_files

In [None]:
# Listing all files using glob
# (`*` matches every entry in the current folder except names starting with a dot)
from glob import glob

files = glob('*')
files

In [None]:
# Listing all .html files using glob
# (the pattern itself does the filtering, no regular expression needed)
from glob import glob

glob('*.html')

In [None]:
# Listing all .html files in all sub-directories
import os
import re

def get_html_files(folder):
    """Recursively collect the paths of all .html files below `folder`.

    Args:
        folder: Path of the directory to search.

    Returns:
        A list with the path (prefixed with `folder`) of every regular file whose
        name ends with `.html`, found in `folder` or in any of its sub-directories.
    """
    # Prepare a list that will store all .html files found in current folder
    files = []
    # The `with` block closes the scandir iterator even if an error is raised while walking
    with os.scandir(folder) as entries:
        for entry in entries:
            if entry.is_dir():
                # If current entry is a directory, call `get_html_files` on the sub-directory
                files.extend(get_html_files(entry.path))
            elif entry.is_file() and re.fullmatch(r'.*\.html', entry.name) is not None:
                # `fullmatch` makes sure the name really *ends* with .html
                # (`re.match` would also accept e.g. `page.html.bak`)
                files.append(entry.path)

    # And return all .html files found in current folder
    return files

# Search the current working directory and everything below it
get_html_files('.')

In [None]:
from glob import glob

# With `recursive=True`, `**` matches zero or more nested folders,
# so this finds .html files in the current folder and in every sub-folder
glob('**/*.html', recursive=True)

## Handling files

In [None]:
# Open a file and write two lines into it
# ('w' mode creates the file, or empties it if it already exists)
f = open('file.txt', 'w')
f.write('This is some text')
f.write('\n')
f.write('This is some text on next line')
# Closing the file flushes the buffered text to disk and releases the handle
f.close()

In [None]:
# .read returns the contents of the whole file as a single string
# (line breaks are kept inside the string as '\n')

f = open('file.txt', 'r')
f.read()
f.close()

In [None]:
# .readlines returns the contents of the whole file split by lines as a list
# (each line keeps its trailing newline character, if it has one)

f = open('file.txt', 'r')
f.readlines()
f.close()

In [None]:
# .readline returns the next line as a string (including its trailing newline);
# every further call moves on to the following line

f = open('file.txt', 'r')
f.readline()
f.close()

In [None]:
# .read().split('\n') is similar to .readlines(), but not identical:
# .readlines() keeps the trailing '\n' on each line, whereas .split('\n') removes it

f = open('file.txt', 'r')
f.read().split('\n')
f.close()

In [None]:
# Reading a file using a `with` statement (the file is closed automatically at the end of the block)

with open('file.txt', 'r') as f:
    lines = f.readlines()
# An expression nested inside `with` is not displayed by the notebook,
# so keep the result in a variable and show it at the top level
lines

In [None]:
# Reading and parsing a CSV this way is annoyingly long
import numpy as np

with open('df_example.csv', 'r') as f:
    # The first line holds the column names, not numbers - skip it
    next(f, None)
    # Convert every remaining non-empty line into a list of floats
    # (blank lines, e.g. at the end of the file, would otherwise crash `float`)
    data = [[float(n) for n in line.split(',')] for line in f if line.strip()]

data = np.array(data)
data

## Pandas

### Series and DataFrames

In [None]:
import pandas as pd

# The same CSV parsed with a single call; the first row is used for the column names
pd.read_csv('df_example.csv')

In [None]:
# pandas.Series example: a one-dimensional sequence of values with an index

pd.Series([1, 2, 3, 4, 5])

In [None]:
# pandas.DataFrame example: a two-dimensional table, each inner list becomes one row

pd.DataFrame([[1, 2, 3], [4, 5, 6]])

In [None]:
# pandas.DataFrame from multiple pandas.Series
# (each Series becomes one row of the resulting DataFrame)
s1 = pd.Series([1, 2, 3])
s2 = pd.Series([4, 5, 6])

pd.DataFrame([s1, s2])

In [None]:
# pandas.DataFrame with named rows and columns using `index` and `columns`
# (one label per row in `index`, one label per column in `columns`)
pd.DataFrame(
    [[1, 2, 3], [4, 5, 6]],
    index=['row1', 'row2'],
    columns=['col1', 'col2', 'col3']
)

In [None]:
# pandas.DataFrame with named rows and columns using a dictionary
# (outer keys are the column names, inner keys are the row labels)
pd.DataFrame(
    {'col1': {
        'row1': 1,
        'row2': 2,
        'row3': 3
    },
     'col2': {
        'row1': 4,
        'row2': 5,
        'row3':6}
    },
)

# A little more practical is to specify just the columns and pass the index
# (the row labels are then written only once instead of once per column)
pd.DataFrame(
    {'col1': [1, 2, 3], 'col2': [4, 5, 6]},
    index=['row1', 'row2', 'row3']
)

### Accessing data

In [None]:
# We're going to use this example DataFrame for the next examples
# (loaded once here and reused by the cells below)
df = pd.read_csv('df_example.csv')

In [None]:
# A preview of a DataFrame using .head and .tail
# (.head() shows the first 5 rows by default, .tail(3) shows the last 3 rows)

df.head()
df.tail(3)

In [None]:
# Names (labels) of the rows (`index`) and of the columns (`columns`)

df.index
df.columns

In [None]:
# Accessing data using the bracket notation

# Column called `dim_1`
df['dim_1']

# Column called `dim_1`, first row
# (the second bracket looks up the row labelled 0 in the selected column)
df['dim_1'][0]

# Columns called `dim_1` and `dim_2`
df[['dim_1', 'dim_2']]

# Columns called `dim_1` and `dim_2`, first row
# (a slice inside the brackets selects rows; the result is still a DataFrame)
df[['dim_1', 'dim_2']][0:1]

In [None]:
# Accessing data using .loc
# (.loc selects by label: rows first, then columns)

# Column called `dim_1`
df.loc[:, 'dim_1']

# Column called `dim_1`, first row (the row labelled 0)
df.loc[0, 'dim_1']

# Columns called `dim_1` and `dim_2`
df.loc[:, ['dim_1', 'dim_2']]

# Columns called `dim_1` and `dim_2`, first row (the row labelled 0)
df.loc[0, ['dim_1', 'dim_2']]

In [None]:
# Accessing data using .iloc
# (.iloc selects by integer position, counted from 0: rows first, then columns)

# Second column
df.iloc[:, 1]

# First row, second column
df.iloc[0, 1]

# Second and third column
df.iloc[:, [1, 2]]

# First row, second and third column
df.iloc[0, [1, 2]]

### Boolean indexing and filtering

In [None]:
# Conditions return a boolean mask (True/False for every cell)...

df > 0

In [None]:
# ...which can be further used to filter out values
# (cells that do not meet the condition become NaN, the shape stays the same)

df[df > 0]

In [None]:
# Filtering with a condition on a single column completely drops the rows that do not meet it

df[df['dim_0'] > 0].head()

In [None]:
# Boolean indexing can also be used for setting a value
# (this modifies `df` in place: every cell greater than 0 is overwritten with 10)

df[df > 0] = 10
df.head()

In [None]:
# We can also check whether a given column contains a value from a list...

# Draw one random label per row; sizing by `len(df)` keeps this working for any number of rows
df['text'] = np.random.choice(['one', 'two', 'three', 'four', 'five'], len(df))
df[df['text'].isin(['one', 'two'])].head()

In [None]:
# ...or perform string operations on the whole column
# (here: keep the rows whose `text` value starts with the letter t)

df[df['text'].str.startswith('t')].head()

In [None]:
# And we can also filter by multiple conditions
# (each condition is a boolean mask; `&` combines the masks element-wise)

text_is_one_or_two = df['text'].isin(['one', 'two'])
dim_0_above_minus_one = df['dim_0'] > -1
dim_5_below_minus_one = df['dim_5'] < -1

df[text_is_one_or_two & dim_0_above_minus_one & dim_5_below_minus_one].head()

### Missing data

In [None]:
# Reload the example data
df = pd.read_csv('df_example.csv')

# Values that are not below 2 are replaced with NaN, which gives us missing data to work with
df = df[df < 2]

In [None]:
# Preview of the data that now contains missing values
df.head()

In [None]:
# Check which cells contain a missing value (True marks a missing one)

df.isna().head()

In [None]:
# View all rows that contain at least one missing value
# (`any(axis=1)` is True for a row when any of its cells is missing)

df[df.isna().any(axis=1)].head()

In [None]:
# Drop rows that contain a cell with missing value
# (returns a new DataFrame; `df` itself is left unchanged)

df.dropna(axis=0).head()

In [None]:
# Drop columns that contain a cell with missing value
# (returns a new DataFrame; `df` itself is left unchanged)

df.dropna(axis=1).head()

In [None]:
# Fill all missing values with a zero
# (returns a new DataFrame; `df` itself is left unchanged)

df.fillna(0).head()

In [None]:
# Fill each missing value with the previous valid value in the same column (forward fill)
# (`DataFrame.ffill` replaces the deprecated `fillna(method='ffill')`)

df.ffill(axis=0).head()

In [None]:
# Linearly interpolate missing values
# (returns a new DataFrame; `df` itself is left unchanged)

df.interpolate().head()

In [None]:
# Dropping elements that are of an undesired data type

# Plant a string in the `dim_1` column so that there is something to drop
# NOTE(review): recent pandas versions may warn when a str is written into a float column - confirm
df.loc[1, 'dim_1'] = 'some_text'

def drop_undesired_types(df, dtype_to_drop, axis=0):
    """Drop the rows (or columns) of `df` that contain a value of an undesired type.

    Args:
        df: DataFrame to clean; it is not modified.
        dtype_to_drop: A single type, or a list/tuple/set of types, whose values
            should be removed.
        axis: 0 drops every row containing such a value, 1 drops every column.

    Returns:
        A new DataFrame without the offending rows/columns. The undesired values are
        first replaced with a missing value and then removed with `dropna`, so
        rows/columns that already contained missing values are dropped as well.
    """
    if isinstance(dtype_to_drop, (list, tuple, set)):
        undesired = tuple(dtype_to_drop)
    else:
        undesired = (dtype_to_drop,)
    # `DataFrame.applymap` was renamed to `DataFrame.map` in pandas 2.1; use whichever exists
    # (checked on the class so that a column named "map" cannot be mistaken for the method)
    method_name = 'map' if hasattr(type(df), 'map') else 'applymap'
    elementwise = getattr(df, method_name)
    # Exact type comparison (not `isinstance`), so e.g. dropping `int` keeps booleans
    return elementwise(lambda x: None if type(x) in undesired else x).dropna(axis=axis)

# Drop every row that contains a string value
drop_undesired_types(df, str)

In [None]:
# Processing large amount of data using chunksize
# (the file is read 10 rows at a time instead of being loaded as a whole)

# The chunked reader is a context manager: the `with` block closes the file when we are done
with pd.read_csv('df_example.csv', chunksize=10) as reader:
    # `enumerate(..., start=1)` numbers the chunks from 1 without a manual counter
    for i, chunk in enumerate(reader, start=1):
        print(f"Chunk {i}: {chunk['dim_1'].mean()}")

### Grouping and aggregating

In [None]:
# Reload the example data and add two categorical columns to group by
df = pd.read_csv('df_example.csv')
# Draw one random value per row; sizing by `len(df)` keeps this working for any number of rows
df['text'] = np.random.choice(['one', 'two', 'three', 'four', 'five'], len(df))
df['binary_category'] = np.random.choice([0, 1], len(df))

In [None]:
# Calculate the mean of each group
# (rows are grouped by their value in the `text` column)

df.groupby('text').agg('mean')

In [None]:
# Groupby over multiple columns and calculate multiple stats
# (one group per combination of `binary_category` and `text`; mean and std for each column)

df.groupby(['binary_category', 'text']).agg(['mean', 'std'])

In [None]:
# Provide custom function to `agg`
# (the dictionary maps a column name to the aggregation applied to that column)

df.groupby(['binary_category', 'text']).agg({
    # Round each value, raise it to the third power and sum the results
    'dim_1': lambda x: np.sum(np.round(x)**3),
    'dim_2': 'sum'
})

### Merging and joining

In [None]:
# Concatenate rows and reset index
df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
df2 = pd.DataFrame([[7, 8, 9]])

# Both frames start their row labels at 0; `reset_index(drop=True)` replaces the
# repeated labels with a fresh 0..n-1 index and discards the old one
pd.concat((df1, df2)).reset_index(drop=True)

In [None]:
# Join two dataframes based on key
df1 = pd.DataFrame([[1, 2, "one"], [4, 5, "two"]])
df2 = pd.DataFrame([[7, 8, "one"]])

# `on=2` refers to the column labelled 2 (the one holding the strings);
# `how='inner'` keeps only the keys that are present in both frames
pd.merge(df1, df2, on=2, how='inner')

## sqlite3

In [None]:
import sqlite3

In [None]:
# Create (or open) the database file and fill a small table with random numbers
conn = sqlite3.connect('example.db')
try:
    c = conn.cursor()

    # Start from a clean table so that the cell can be re-run
    c.execute('DROP TABLE IF EXISTS example_table')
    c.execute('CREATE TABLE example_table (dim_1 real, dim_2 real)')
    # The `?` placeholders let sqlite3 bind the values safely;
    # `.tolist()` turns the NumPy array into plain Python lists of floats
    c.executemany('INSERT INTO example_table VALUES (?, ?)', np.random.randn(100, 2).tolist())
    # Persist the inserted rows before reading them back
    conn.commit()

    # Print the first column in descending order, rounded to two decimals
    for row in c.execute('SELECT * FROM example_table ORDER BY dim_1 DESC'):
        print(f'{row[0]:.2f}', end=' ')
finally:
    # Close the connection even if one of the statements above fails
    conn.close()

## OOP

In [None]:
# A class for a rectangle
# Requires two attributes to construct an object: lengths of the two sides
class Rectangle:
    """Rectangle described by the lengths of its two sides, `a` and `b`."""

    # Class-level name of the shape; used in the message printed on construction
    shape = "rectangle"

    def __init__(self, a, b):
        """Announce the new object on stdout, then store both side lengths."""
        print(f"You created a {self.shape} with sides a = {a} and b = {b}")
        self.a, self.b = a, b

    def calc_area(self):
        """Return the area, i.e. the product of the two sides."""
        side_a, side_b = self.a, self.b
        return side_a * side_b

    def calc_perimeter(self):
        """Return the perimeter, i.e. twice `a` plus twice `b`."""
        double_a = 2 * self.a
        double_b = 2 * self.b
        return double_a + double_b

    # Uncomment to give the object a readable textual representation
    #def __str__(self):
    #    return f"{self.shape}(a = {self.a}, b = {self.b})"

In [None]:
# Create a rectangle and read its attributes and derived values
r1 = Rectangle(4, 5)
print(f'a = {r1.a} and b = {r1.b}')
print(f'perimeter = {r1.calc_perimeter()}')
print(f'area = {r1.calc_area()}')

In [None]:
# The textual representation of an object is not very nice, but that can be adjusted using the __str__ method
# (as long as the class defines no __str__, print falls back to the default `<... object at 0x...>` text)
print(r1)

In [None]:
# Square inherits the attributes of rectangle
# We don't have to implement the methods again
class Square(Rectangle):
    """A square, modelled as a Rectangle whose two sides are equal."""

    # Overrides the class-level shape name of the parent class
    shape = "square"

    def __init__(self, a):
        """Create a square with side length `a`."""
        # Square is a rectangle with both sides equal
        # Therefore we just construct this as a Rectangle with the same argument twice
        # (`__init__` must not return a value, so the parent initializer is simply called)
        super().__init__(a, a)

    def __str__(self):
        """Return the textual representation provided by the parent class."""
        return super().__str__()

In [None]:
# `calc_perimeter` is not defined on Square - it is inherited from Rectangle
s1 = Square(4)
s1.calc_perimeter()

In [None]:
# Square.__str__ simply forwards to the parent class, so this prints whatever the parent provides
print(s1)

In [None]:
# We can also override methods to be able to perform mathematical operations
class Rectangle:
    """Rectangle with sides `a` and `b` that supports `+`, `-` and `*`.

    The arithmetic operators work side by side: the result is a new Rectangle
    whose `a` (and `b`) is the sum/difference/product of both operands' `a` (and `b`).
    """

    shape = "rectangle"

    def __init__(self, a, b):
        self.a = a
        self.b = b

    def calc_perimeter(self):
        """Return the perimeter, i.e. twice `a` plus twice `b`."""
        return 2*self.a+2*self.b

    def calc_area(self):
        """Return the area, i.e. the product of the two sides."""
        return self.a*self.b

    @staticmethod
    def _has_sides(other):
        """Tell whether `other` exposes the `a` and `b` attributes the operators need."""
        return hasattr(other, 'a') and hasattr(other, 'b')

    def __add__(self, other):
        # Returning NotImplemented lets Python raise a clear TypeError for unsupported operands
        if not self._has_sides(other):
            return NotImplemented
        return Rectangle(self.a + other.a, self.b + other.b)

    def __sub__(self, other):
        if not self._has_sides(other):
            return NotImplemented
        return Rectangle(self.a - other.a, self.b - other.b)

    def __mul__(self, other):
        if not self._has_sides(other):
            return NotImplemented
        return Rectangle(self.a * other.a, self.b * other.b)

    def __str__(self):
        return f"{self.shape}(a = {self.a}, b = {self.b})"

r1 = Rectangle(5, 9)
r2 = Rectangle(3, 5)

print(r1 + r2)
print(r1 - r2)
print(r1 * r2)

In [None]:
# By default, two distinct objects are never equal, even when their attributes match
# (without a custom `__eq__`, `==` only checks whether both sides are the very same object)
r1 = Rectangle(0, 0)
r2 = Rectangle(0, 0)
r1 == r2

In [None]:
# However, we can override the default function that checks for equality
class Rectangle:
    """Rectangle with sides `a` and `b`; two rectangles are equal when both sides match."""

    # Needed by `__str__`, which reads `self.shape`
    shape = "rectangle"

    def __init__(self, a, b):
        self.a = a
        self.b = b

    def calc_perimeter(self):
        """Return the perimeter, i.e. twice `a` plus twice `b`."""
        return 2*self.a+2*self.b

    def calc_area(self):
        """Return the area, i.e. the product of the two sides."""
        return self.a*self.b

    # NOTE: a class that defines `__eq__` without `__hash__` is unhashable,
    # so these objects cannot be put into a set or used as dictionary keys
    def __eq__(self, other):
        # For objects without sides, let Python fall back to its default comparison
        # (which yields False) instead of raising an AttributeError
        if not (hasattr(other, 'a') and hasattr(other, 'b')):
            return NotImplemented
        return self.a == other.a and self.b == other.b

    def __str__(self):
        return f"{self.shape}(a = {self.a}, b = {self.b})"

r1 = Rectangle(0, 0)
r2 = Rectangle(0, 0)
r1 == r2