In [1]:
## Section A: Multiprocessing

In [2]:
#1. Write a function which accepts a positive integer n and returns the Collatz length of n.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import multiprocessing
from multiprocessing import Pool
from sklearn.model_selection import KFold, train_test_split
from sklearn.tree import DecisionTreeRegressor
from itertools import product
import numpy as np
from os import cpu_count

In [2]:
def collatz_length(n):
    """Count the Collatz steps needed to bring ``n`` down to 1.

    A step halves an even value and maps an odd value ``v`` to ``3*v + 1``.
    The result is the number of steps taken, so ``collatz_length(1)`` is 0.

    Parameters
    ----------
    n : int
        Starting value; expected to be a positive integer.

    Returns
    -------
    int or str
        The number of steps, or the message string
        ``"Input must be positive!"`` when ``n <= 0``.
    """
    if n <= 0:
        return "Input must be positive!"

    steps = 0
    current = n
    while current != 1:
        # Even -> halve, odd -> 3x + 1.
        current = current // 2 if current % 2 == 0 else 3 * current + 1
        steps += 1
    return steps

In [3]:
# Number of CPUs in the system, as reported by multiprocessing.cpu_count().
no_cores = multiprocessing.cpu_count()

In [4]:
#Personal computer has 8 cores.
#n = 10**5
#with Pool(processes = no_cores-1) as p:
#    collatz_results = p.map(collatz_length,range(1,n+1))

In [5]:
# Section B: Pruning

In [6]:
# Load the Titanic passenger table from the datasciencedojo datasets
# repository on GitHub (requires network access).
df = pd.read_csv(
    'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
)

In [7]:
# Represent the 'Sex' column as integers: 1 for 'male', 0 for anything else.
# The membership test is vectorised (no per-row Python lambda). Accepting an
# existing 1 alongside 'male' keeps this cell idempotent: re-running it on the
# already-encoded column leaves the values unchanged instead of zeroing them.
df['Sex'] = df['Sex'].isin(['male', 1]).astype(int)

In [8]:
# Columns used as predictors, and the column to be predicted ('Survived').
# Both are kept as lists so they can be concatenated and used for column selection.
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
target = ['Survived']

In [9]:
# 60/40 train/validation split.
# Rows with a missing value in any feature or target column are dropped first.
df = df.dropna(subset=features + target)
# test_size is the held-out fraction: 0.4 gives the 60/40 split stated above
# (the previous value of 0.9 left only 10% of the rows for training).
x_tr, x_va, y_tr, y_va = train_test_split(
    df[features], df[target], test_size=0.4, random_state=0
)

In [58]:
#Finding optimal value of alpha using 3-fold cross validation:
def fit_tree(inpt):
    """Fit one cost-complexity-pruned regression tree for a single work item.

    Takes a single packed argument so it can be used directly with
    ``Pool.map``.

    Parameters
    ----------
    inpt : tuple
        A 3-item sequence ``(x_tra, y_tra, alpha)``: training features,
        training targets, and the ``ccp_alpha`` value passed to the tree.

    Returns
    -------
    DecisionTreeRegressor
        The estimator after ``fit`` has been called on the training data.
    """
    train_features, train_targets, alpha = inpt
    model = DecisionTreeRegressor(ccp_alpha=alpha)
    model.fit(train_features, train_targets)
    return model

In [None]:
# 3-fold cross-validation over the training split: for every fold, fit one
# pruned tree per candidate ccp_alpha (in parallel) and record which alpha
# scores best on the held-out part of that fold.
kf = KFold(n_splits=3)

# The worker count is the same for every fold, so compute it once.
# One CPU is left free, but never request fewer than one worker:
# cpu_count() may return None, and Pool rejects processes < 1, which is what
# `cores - 1` evaluates to on a single-CPU machine.
cores = cpu_count()  # get the number of CPUs
n_workers = max(1, (cores or 1) - 1)

dfs_acc = []  # a list to store our df_acc dataframe for each split

optimum_alphas = []  # a list to store the best alpha value for each split.
for train_index, val_index in kf.split(x_tr):  # this loop is over the cross-val splits
    x_tra = x_tr.iloc[train_index]
    x_val = x_tr.iloc[val_index]
    y_tra = y_tr.iloc[train_index]
    y_val = y_tr.iloc[val_index]

    # The candidate alphas depend on the training data, so the pruning path
    # must be recomputed for each split.
    path = DecisionTreeRegressor().cost_complexity_pruning_path(x_tra, y_tra)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities

    # One (x_tra, y_tra, alpha) work item per candidate alpha, in the packed
    # form that fit_tree expects.
    inpt = product([x_tra], [y_tra], ccp_alphas)

    with Pool(processes=n_workers) as p:
        trees = p.map(fit_tree, inpt)

    data = []  # data for a dataframe showing the scores and attributes of each tree
    for tr in trees:
        alpha = tr.ccp_alpha
        # score() on a regressor is the R^2 coefficient; the 'acc_*' column
        # names are kept unchanged for anything that reads df_acc later.
        acc_tr = tr.score(x_tra, y_tra)
        acc_va = tr.score(x_val, y_val)
        n_leaves = tr.get_n_leaves()
        depth = tr.get_depth()
        data.append({'alpha': alpha, 'depth': depth, 'n_leaves': n_leaves,
                     'acc_tr': acc_tr, 'acc_va': acc_va})
    df_acc = pd.DataFrame(data)
    dfs_acc.append(df_acc)
    best_idx = df_acc['acc_va'].idxmax()  # row with the best score on the validation part
    best_row = df_acc.loc[best_idx]
    optimum_alphas.append(best_row['alpha'])