In below,

Parallel processing

1. multiprocessing

In [None]:
from multiprocessing import Pool, cpu_count

def main_multiprocessing_func(func, list_of_things_to_be_processed, num_processes=None):
    
    # If num_processes is not specified, default to minimum(#things_to_be_processed, #machine-cores)
    if num_processes==None:
        num_processes = min(len(list_of_things_to_be_processed), cpu_count())
    
    # 'with' context manager takes care of pool.close() and pool.join() for us
    with Pool(num_processes) as pool:
        
        # we need a list to be passed to pool.map
        # pool.map returns results as a list
        results_list = pool.map(func, list_of_things_to_be_processed)
        
        # return list of processed columns, concatenated together as a new dataframe
        return pd.concat(results_list)

2. joblib

    See examples here: https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html

In [None]:
from math import sqrt
from joblib import Parallel, delayed
Parallel(n_jobs=1)(delayed(sqrt)(i**2) for i in range(10))

3. Dask

    For manipulating large datasets, when those datasets don’t fit in memory
    
    For accelerating long computations by using many cores
    
    See examples here: https://docs.dask.org/en/latest/index.html and https://docs.dask.org/en/latest/dataframe.html


<br>
<br>

In below,

If a dataframe's column names are `["s_apple", "s_banana", "s_orange"]`, and want to change them to `["apple", "banana", "orange"]`

In [None]:
df.rename(columns=lambda x: int(x[2:]), inplace=True)

<br>
<br>

In below,

SQL sampling

In [None]:
...order by random() limit 10000

In the SQL below,

1. md5 is hashing
2. || is the String Concatenation Operator

In [None]:
seed = 42
md5({seed} || customer_id) as random_customer_id

The usage of SQL,

`COALESCE` returns the first non-null value in a list.

`NULLIF` returns NULL if two expressions are equal, otherwise it returns the first expression.

`group by 1, 2, 3` groups by output columns (position 1, 2, 3)

<br>
<br>

In below,

turn values into ranks

In [10]:
col_names =["apple", "banana", "pear", "orange"]
data = pd.DataFrame(np.random.rand(6, 4), columns=col_names)
data

Unnamed: 0,apple,banana,pear,orange
0,0.301489,0.98308,0.437496,0.911782
1,0.663863,0.278423,0.194561,0.809135
2,0.003051,0.654187,0.01232,0.080039
3,0.962843,0.251235,0.021869,0.892796
4,0.01476,0.08786,0.113315,0.207766
5,0.585164,0.62074,0.466001,0.954973


In [12]:
rank = np.empty((len(data), len(col_names)))
o = data[col_names].values.argsort(1)
rank[np.arange(len(data))[:, None], o] = np.arange(len(col_names))[None]
rank

array([[0., 3., 1., 2.],
       [2., 1., 0., 3.],
       [0., 3., 1., 2.],
       [3., 1., 0., 2.],
       [0., 1., 2., 3.],
       [1., 2., 0., 3.]])

<br>
<br>

In below,

use double `argsort()` to convert numbers into rank

In [1]:
import numpy as np
x = np.array([0.3, 0.1, 0.2])

# Default is axis=-1 (the last axis)
x.argsort(axis=-1)

array([1, 2, 0])

In [2]:
x.argsort(axis=-1).argsort(axis=-1)

array([2, 0, 1])

One more `argsort` would inverse it back

In [3]:
x.argsort(axis=-1).argsort(axis=-1).argsort(axis=-1)

array([1, 2, 0])

In [4]:
x = np.array([
    [
        [0.3, 0.1, 0.2],
        [0.2, 0.7, 0.3]
    ],
    [
        [0.5, 0.7, 0.2],
        [0.2, 0.8, 0.3]
    ],
    [
        [0.6, 0.5, 0.5],
        [0.8, 0.4, 0.6]
    ],
    [
        [0.4, 0.5, 0.2],
        [0.1, 0.2, 0.6]
    ]
])

x.argsort(axis=-1).argsort(axis=-1)

array([[[2, 0, 1],
        [0, 2, 1]],

       [[1, 2, 0],
        [0, 2, 1]],

       [[2, 0, 1],
        [2, 0, 1]],

       [[1, 2, 0],
        [0, 1, 2]]])

Different direction

In [5]:
x.argsort(axis=1).argsort(axis=1)

array([[[1, 0, 0],
        [0, 1, 1]],

       [[1, 0, 0],
        [0, 1, 1]],

       [[0, 1, 0],
        [1, 0, 1]],

       [[1, 1, 0],
        [0, 0, 1]]])

<br>
<br>

In below,

a good way to insert a break in a for-loop or a big function; press Enter to resume

In [None]:
input()

<br>
<br>

In below,

empty dictionary or list is a dangerous default value in Python

In [1]:
def f(value, key, hash={}):
    hash[value] = key
    return hash

print(f('a', 1))
print(f('b', 2))

{'a': 1}
{'a': 1, 'b': 2}


<br>
<br>

In below,

`locals()` inside of a function will return a dict of local variables

In [1]:
def localsNotPresent():
    return locals()

def localsPresent():
    present = True
    return locals()

print('localsNotPresent:', localsNotPresent())
print('localsPresent:', localsPresent())

localsNotPresent: {}
localsPresent: {'present': True}


In [36]:
import re
import math

def get_db_friendly_col_name(col_name, tbl_prefix='dim_table_1_'):
    """Assume database table names must be at most 30 characters;
    reduce column name cleverly into a descriptive shorthand"""
    # clean the column name to use for the database table name
    col_name = re.sub(r'[\&\/\s]+', '_', col_name).lower()
    print(col_name)

    col_name = re.sub(r'and\_', '', col_name) # remove "and_"
    print(col_name)

    # summarise words in the column name if further reduction is needed
    col_name_words = re.findall(r'[^\_]+', col_name)
    print(col_name_words)

    if len(col_name) > 30 - len(tbl_prefix):
        # remove vowels if they do not start the word
        col_name = '_'.join([re.sub(r'(?<=.)[aeiou]', '', c) for c in col_name_words])
        print(col_name)

        if len(col_name) > 30 - len(tbl_prefix):
            # truncate words as little as possible
            col_name_words = re.findall(r'[^\_]+', col_name)
            print(col_name_words)

            col_name_len_excess = len(col_name) - len(tbl_prefix)
            print(col_name_len_excess)

            col_name_word_trunc = int(math.ceil(col_name_len_excess / len(col_name_words)))
            print(col_name_word_trunc)

            col_name = '_'.join([c[:-col_name_word_trunc] for c in col_name_words])
    return col_name

In [38]:
get_db_friendly_col_name("asda&a/adf dfasfa_aaaband_bbbeex")

asda_a_adf_dfasfa_aaaband_bbbeex
asda_a_adf_dfasfa_aaabbbbeex
['asda', 'a', 'adf', 'dfasfa', 'aaabbbbeex']
asd_a_adf_dfsf_abbbbx
['asd', 'a', 'adf', 'dfsf', 'abbbbx']
9
2


'a__a_df_abbb'

<br>
<br>

In below,

in a notebook on the Kaggle platform, create a link to download the dataframe which was saved with .to_csv method

In [None]:
from IPython.display import HTML

def create_download_link(title = "Download CSV file", filename = "data.csv"):  
    html = '<a href={filename}>{title}</a>'
    html = html.format(title=title,filename=filename)
    return HTML(html)

# create a link to download the dataframe which was saved with .to_csv method
create_download_link(filename='predictions.csv')

<br>
<br>

In below,

@classmethod

The class method is always attached to a class, with the first argument as the class itself `cls`.

In [2]:
from datetime import date

# random Person
class Person:
    def __init__(self, name, age):
        self.name = name
        self.age = age

    @classmethod
    def fromBirthYear(cls, name, birthYear):
        return cls(name, date.today().year - birthYear)

    def display(self):
        print(self.name + "'s age is: " + str(self.age))

person = Person('Adam', 19)
person.display()

person1 = Person.fromBirthYear('John',  1985)
person1.display()

Adam's age is: 19
John's age is: 35


<br>

@staticmethod

In [7]:
class Student(object):

    @staticmethod
    def is_full_name(name_str):
        names = name_str.split(' ')
        return len(names) > 1

print(Student.is_full_name('Scott Robinson'))   # True
print(Student.is_full_name('Scott'))            # False

True
False


<br>
<br>

In below,

Set the cell type to `Raw NBConvert` to reserve code not to be run

<br>
<br>

In below,

An instances of a class that has a `__call__` function behaves like a function and can be called like a function.

From: https://discuss.pytorch.org/t/is-model-forward-x-the-same-as-model-call-x/33460

    forward function is called in the .__call__ function.
    __call__ is already defined in nn.Module, will register all hooks and call your forward function. 
    That’s also the reason to call the module directly (output = model(data)) instead of model.forward(data).

From: https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input

    All the hooks are dispatched in the __call__ function, so if you call .forward and have hooks in your model, the hooks won’t have any effect

In [1]:
class Product: 
    def __init__(self): 
        print("Instance Created") 
  
    # Defining __call__ method 
    def __call__(self, a, b): 
        print(a * b) 
  
# Instance created
ans = Product() 
  
# __call__ method will be called 
ans(10, 20)

Instance Created
200


<br>
<br>

A function that returns `self` will allow method cascading. 

See https://stackoverflow.com/questions/43380042/purpose-of-return-self-python 

An example is having it in a `fit()` function.

<br>
<br>

In below,

use `setattr` to assist method chaining

In [1]:
import pandas as pd

df = pd.DataFrame({
    'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
    'Price': [22000,25000,27000,35000]
})

def user_defined_function(df, arg_1, arg_2):
    print(df[arg_1].max())
    print(df.sort_values(arg_2))

setattr(pd.DataFrame, 'user_defined_function', user_defined_function)

df

Unnamed: 0,Brand,Price
0,Honda Civic,22000
1,Toyota Corolla,25000
2,Ford Focus,27000
3,Audi A4,35000


In [2]:
df.user_defined_function(arg_1="Price", arg_2="Brand")

35000
            Brand  Price
3         Audi A4  35000
2      Ford Focus  27000
0     Honda Civic  22000
1  Toyota Corolla  25000


<br>
<br>

Good source of built-in data https://scikit-learn.org/stable/datasets/index.html

make_classification( ) is a good starting point of making dummy data https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html

<br>
<br>

In below

pandas.explode

In [3]:
df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
df

Unnamed: 0,A,B
0,"[1, 2, 3]",1
1,foo,1
2,[],1
3,"[3, 4]",1


In [4]:
df.explode('A')

Unnamed: 0,A,B
0,1,1
0,2,1
0,3,1
1,foo,1
2,,1
3,3,1
3,4,1


<br>
<br>

In below,

how to add a percentile (of a column) for each row

In [8]:
df = pd.DataFrame({'A': [1, 2, 3, 3, 4, 10, 13, 17, 18, 18, 20]})
df.assign(percentile_rank=df["A"].rank(pct=True))

Unnamed: 0,A,percentile_rank
0,1,0.090909
1,2,0.181818
2,3,0.318182
3,3,0.318182
4,4,0.454545
5,10,0.545455
6,13,0.636364
7,17,0.727273
8,18,0.863636
9,18,0.863636


<br>
<br>

In below,

`groupby and agg` can work with `lambda`

In [11]:
df = pd.DataFrame({
    "a": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
    "b": ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"]
})
df

Unnamed: 0,a,b
0,foo,one
1,foo,one
2,foo,one
3,foo,two
4,bar,one
5,bar,one
6,bar,one
7,bar,two
8,foo,two
9,foo,two


In [14]:
(
    df
    .groupby("a")
    .agg({'b': lambda x: x.str.cat(sep=' ')})
    .reset_index()
)

Unnamed: 0,a,b
0,bar,one one one two
1,foo,one one one two two two one


<br>
<br>

In below,

`+` vs `append`

In [1]:
import numpy as np
a = []
a += [[1,2]]
a += [[8,9,9]] 
a

[[1, 2], [8, 9, 9]]

In [2]:
np.concatenate(a, axis=0)

array([1, 2, 8, 9, 9])

In [3]:
a = []
a += [1,2]
a += [8,9,9]
a

[1, 2, 8, 9, 9]

In [4]:
a = []
a.append([[1,2]])
a.append([[8,9,9]])
a

[[[1, 2]], [[8, 9, 9]]]

In [5]:
a = []
a.append([1,2])
a.append([8,9,9])
a

[[1, 2], [8, 9, 9]]