# Generators

In [1]:
# There is always a top-level syntax and a corresponding underscore (dunder) method:
#    x()        -->       __call__

def add1(x, y):
    """Return ``x + y`` (the plain-function version of the adder)."""
    total = x + y
    return total

class Adder:
    """Callable object: calling an instance adds its two arguments."""

    def __call__(self, x, y):
        # invoked by the call syntax ``instance(x, y)``
        total = x + y
        return total

# an instance that is used exactly like a function
add2 = Adder()

**What is the difference between `add1` and `add2`?**

In [4]:
# seen from the outside, both are called in exactly the same way:
for label, adder in (('add1:', add1), ('add2:', add2)):
    print(label, adder(1, 2))

add1: 3
add2: 3


From the outside there's no difference. 

In [11]:
from time import time

# timer decorator
def timer(func):
    """Decorator that prints how long each call to ``func`` takes.

    Parameters
    ----------
    func : callable
        Function to be timed.

    Returns
    -------
    callable
        Wrapper that calls ``func`` with the same arguments, prints the
        elapsed time in seconds and returns ``func``'s return value.
    """
    # imported locally so the cell stays self-contained
    from functools import wraps

    @wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrapper(*a, **kw):
        b4 = time()
        rv = func(*a, **kw)  # rv : return value
        aft = time()
        print('elapsed:', aft - b4, 's')
        return rv
    return wrapper

In [12]:
from time import sleep

# We have some code that sleeps for half a second before producing each value.
# The return value is just the numbers from 0 to 9.
@timer
def compute():
    """Eagerly build and return the list 0..9, sleeping 0.5 s before each item."""
    results = []
    for number in range(10):
        sleep(.5)
        results.append(number)
    return results

In [13]:
# How long does it take to run if we call compute()?
# (the @timer decorator prints the elapsed time once the call has finished)
compute()

elapsed: 5.005658388137817 s


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

How long does it take to run if we only care about the first value?
**Also 5 seconds.** This function always takes the same amount of time and
memory, regardless of which elements we'd like to inspect. Let's write the same computation as a class:

In [18]:
class Compute:
    """Hand-written iterator that lazily produces 0, 1, ..., limit - 1.

    Parameters
    ----------
    limit : int
        Number of values produced per iteration pass (default 10).
    delay : float
        Seconds slept before each value is returned (default 1).
    """

    def __init__(self, limit=10, delay=1):
        self.limit = limit
        self.delay = delay
        # initialised here as well, so next() works without a prior iter()
        self.last = 0

    def __iter__(self):
        # every new pass (e.g. each for loop) restarts from zero
        self.last = 0
        return self

    def __next__(self):
        # check first, so an exhausted iterator neither sleeps nor keeps counting
        if self.last >= self.limit:
            raise StopIteration
        sleep(self.delay)
        rv = self.last
        self.last += 1
        return rv

In [19]:
# the for loop calls __iter__ once, then __next__ until StopIteration is raised
for val in Compute():
    print(val)

0
1
2
3
4
5
6
7
8
9


This is essentially a hand-written generator (an iterator class), but it's hard to read and ugly. **Generator syntax in Python:**

In [27]:
def compute():
    """Lazily yield 0..9, sleeping half a second before each value."""
    for number in range(10):
        sleep(.5)
        yield number

In [28]:
# calling compute() returns a generator; each value is printed as soon as it is yielded
for val in compute():
    print(val)

0
1
2
3
4
5
6
7
8
9


In [30]:
# let's say we have an API (application programming interface):
class Api:
    """Example API whose steps are exposed as independent methods.

    Nothing here enforces the intended order: a caller may invoke the three
    methods in any sequence. ``first``, ``second`` and ``last`` are not
    defined in this cell; they are presumably placeholders for the real work.
    """

    def run_this_first(self):
        first()

    def run_this_second(self):  # parameter was misspelled 'slef'
        second()

    def run_this_last(self):
        last()

In the case above, the user can call the methods in any order, since they are defined on the class with no restrictions. But if we use a generator, each `yield` hands control back to the user, and the function does not continue until the user asks for the next value:

In [31]:
def api():
    """Run ``first``, ``second`` and ``last`` in a fixed order, pausing in between.

    This is a generator: each ``yield`` suspends execution until the caller
    asks for the next item, so ``second()`` cannot run before ``first()`` and
    ``last()`` cannot run before ``second()``.
    """
    first()  # runs when the caller requests the first item
    yield  # hand control back to the caller
    second()  # runs only when the caller resumes the generator
    yield
    last()  # final step; afterwards the generator is exhausted

# The generator's yields force the sequence on the user.
# You can guarantee that the last step will never be called before the first
# and second steps.

## Generators for EDA
There are datasets which have 50+ features. Each feature has to be inspected to determine its properties:<br>
* type
* distribution
* usefulness
* other insights

It is very tedious to do this manually, and a plain <code>for loop</code> is not a good fit for it. One of the possibilities is to use a <code>generator</code> function that plots each feature as a:
* histogram (+ additional e.g. pdf/cdf) - if it is continuous numerical
* boxplot - if it is a discrete (categorical) feature.

In [2]:
import pandas as pd

# sample data: read the training CSV, taking the column names from its first row
house = pd.read_csv('data/iowa_house_prices/train.csv', header=0)
# preview the first two rows
house.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


In [None]:
#clears the output
from IPython.display import clear_output

#plots boxplots if feature is categorical and 
#histogram if it is continuous
def inspect_feature(dfs, feature, target, bins=50):
    '''Feature distribution inspector.

    A continuous feature is plotted as a histogram; a discrete feature is
    plotted as a boxplot vs the target feature.

    Parameters
    ----------
    dfs : sequence of DataFrame
        ``dfs[0]`` is the training DataFrame (it must contain ``target``) and
        ``dfs[1]`` is the test DataFrame.
    feature : str
        Feature name to be investigated.
    target : str
        Target name.
    bins : int
        Number of histogram bins, used when the feature is continuous.

    Returns
    -------
    None
    '''
    clear_output()  # clear the output

    # The body used to reference an undefined name ``df``. The NaN count,
    # value counts and box plots describe the training frame, which is the
    # first element of ``dfs``.
    df = dfs[0]
    column = df[feature]

    # info about NaN values:
    print("'{}' NaN-s: {}".format(feature, column.isna().sum()))

    # Based on the number of unique values determine if the feature is
    # continuous or discrete. The threshold for uniques is 20. Such an
    # implementation is never 100% accurate but works most of the time.
    n_uniques = len(column.unique())

    # feature is considered to be discrete if it is type
    # 'object' or there are less than 20 unique values
    if column.dtype == 'O' or n_uniques < 20:
        # output value counts of the feature
        print(column.value_counts(normalize=True, dropna=False))

        # output training df box plots vs target
        plot_box(df, feature, target)
    else:
        # if feature is continuous a histogram of the training and
        # test set data is plotted on the same canvas
        data = pd.concat([df.drop(target, axis='columns'), dfs[1]], axis='rows')
        plot_hist(df=data, feature=feature, bins=bins)