In [198]:
import numpy as np
import pandas as pd
from scipy.stats import binned_statistic

In [199]:
def get_mean(series):
    """Return the arithmetic mean of ``series``.

    The computation is delegated to the object's own ``mean()`` method.
    """
    average = series.mean()
    return average

In [200]:
def get_midrange(series):
    """Return the midrange of ``series``: the average of its smallest and largest values."""
    lowest = series.min()
    highest = series.max()
    return (lowest + highest) / 2

In [201]:
def get_mode(series):
    """Return the mode(s) of ``series`` as the array behind ``series.mode()``.

    More than one value is returned when several values tie for the highest
    frequency.
    """
    most_frequent = series.mode()
    return most_frequent.values

In [202]:
def get_modality(mode_arr):
    """Describe how many modes a distribution has.

    Parameters
    ----------
    mode_arr : array-like exposing ``.size``
        The mode values, e.g. the array returned by ``get_mode``.

    Returns
    -------
    str
        ``"No mode"`` for an empty array, ``"Unimodal"``, ``"Bimodal"`` or
        ``"Trimodal"`` for one, two or three modes, and ``"Multimodal"`` for
        four or more.
    """
    count = mode_arr.size
    # An empty array has no modes at all; calling that "Multimodal" would be
    # misleading, so report it with the same label used for mode-less columns.
    if count == 0:
        return "No mode"

    labels = {1: "Unimodal", 2: "Bimodal", 3: "Trimodal"}
    return labels.get(count, "Multimodal")

In [203]:
def five_num_summ(series):
    """Return the five-number summary of ``series`` as a float Series.

    The result is indexed by ``'min'``, ``'lowq'``, ``'med'``, ``'upq'`` and
    ``'max'`` (minimum, 25th percentile, median, 75th percentile, maximum),
    in that order.
    """
    labels = ['min', 'lowq', 'med', 'upq', 'max']
    stats = [
        series.min(),
        series.quantile(0.25),
        series.median(),
        series.quantile(0.75),
        series.max(),
    ]
    return pd.Series(stats, index=labels, dtype='float64')

In [204]:
def eq_depth_bin(series, depth, by='none'):
    """Partition ``series`` into equal-depth (equal-frequency) bins.

    The values are sorted ascending and split into consecutive bins of
    ``depth`` values each; optionally the values in every bin are smoothed.

    Parameters
    ----------
    series : pandas.Series
        Numeric data to bin.
    depth : int
        Number of values per bin. Must be positive.
    by : str, optional
        Smoothing method:

        * ``'mean'`` - every value in a bin is replaced by the bin mean.
        * ``'boundary'`` - every value is replaced by the nearer of the bin's
          first and last value; ties go to the first (lower) boundary.
        * anything else (default ``'none'``) - the sorted values are returned
          unsmoothed.

    Returns
    -------
    pandas.DataFrame or None
        A float frame with one row per bin and ``depth`` columns, or ``None``
        when ``len(series)`` is not a multiple of ``depth``.

    Raises
    ------
    ValueError
        If ``depth`` is not positive.
    """
    if depth < 1:
        raise ValueError("depth must be a positive integer")
    if len(series) % depth != 0:
        return None

    num_bins = len(series) // depth
    # One row per bin: sort once, then reshape instead of writing the frame
    # cell by cell.
    bins = np.asarray(series.sort_values().values, dtype=float)
    bins = bins.reshape(num_bins, depth)

    if by == 'mean':
        # Row means via pandas so the averaging semantics match Series.mean().
        bin_means = pd.DataFrame(bins).mean(axis=1).values
        bins = np.repeat(bin_means[:, np.newaxis], depth, axis=1)
    elif by == 'boundary':
        low = bins[:, :1]    # first value of each bin, kept 2-D for broadcasting
        high = bins[:, -1:]  # last value of each bin
        # "<=" sends values equidistant from both boundaries to the lower one.
        nearer_low = np.abs(bins - low) <= np.abs(bins - high)
        bins = np.where(nearer_low, low, high)

    return pd.DataFrame(bins)

In [None]:
def eq_width_by_med_bin(series):
    """Unfinished stub; always raises ``NotImplementedError``.

    NOTE(review): judging by the name this was presumably meant to do
    equal-width binning smoothed by bin medians - confirm the intended
    behaviour (including how the number of bins is chosen) before
    implementing.
    """
    raise NotImplementedError("eq_width_by_med_bin is not implemented yet")

In [205]:
def q1_data_statistics(data, columns=('C', 'D', 'E', 'F')):
    """Print descriptive statistics for selected columns of ``data``.

    For every column the mean, midrange, mode(s), modality and five-number
    summary are printed, formatted to four decimal places.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to summarise.
    columns : iterable of str, optional
        Column labels to report on; defaults to ``'C'``, ``'D'``, ``'E'``
        and ``'F'``.
    """
    print("Q1. Data Statistics")
    for col in data.loc[:, list(columns)]:
        series = data[col]
        mean = get_mean(series)
        midrange = get_midrange(series)
        modes = get_mode(series)
        # When every value is reported as a mode, all values are equally
        # frequent, so the column is treated as having no mode.
        if modes.size == series.size:
            modality = "No mode"
            shown_modes = []
        else:
            modality = get_modality(modes)
            shown_modes = modes

        summary = five_num_summ(series)
        print("Column: {}".format(col))
        print("    Mean: {:.4f}".format(mean))
        print("    Midrange: {:.4f}".format(midrange))
        mode_text = "".join("{:.4f} ".format(val) for val in shown_modes)
        print("    Mode: [ " + mode_text + "]")
        print("    Modality: {}".format(modality))
        print("    Five Number Summary:")
        # Series.iteritems() was removed in pandas 2.0; items() is the
        # supported equivalent.
        for (ind, item) in summary.items():
            print("        {}: {:.4f}".format(ind, item))
        print("\n")

In [206]:
def q2_smoothing(data):
    """Print column ``'F'`` of ``data`` smoothed by equal-depth binning.

    A copy of the column is binned with a depth of 100 twice, first smoothed
    by bin means and then by bin boundaries, and both results are printed in
    that order.
    """
    f_values = data.loc[:, 'F'].copy()
    # Compute both results before printing anything, then print them in order.
    smoothed = [eq_depth_bin(f_values, 100, method) for method in ('mean', 'boundary')]
    for result in smoothed:
        print(result)

In [207]:
# Interactive prompt left disabled; the input file name is hard-coded instead.
#file_name = input("Enter name of file containing data: ")
file_name = "hwk01.csv"
# Load the CSV into the DataFrame used by the analysis cells below.
df = pd.read_csv(file_name)

In [208]:
# Q1: print the descriptive statistics for the loaded data.
q1_data_statistics(df)
# Q2: print column F after equal-depth binning, smoothed by mean and by boundary.
q2_smoothing(df)

Q1. Data Statistics
Column: C
    Mean: 5184.6630
    Midrange: 5037.0000
    Mode: [ 589.0000 6930.0000 ]
    Modality: Bimodal
    Five Number Summary:
        min: 78.0000
        lowq: 2795.5000
        med: 5180.0000
        upq: 7557.2500
        max: 9996.0000


Column: D
    Mean: -0.0348
    Midrange: -0.0138
    Mode: [ ]
    Modality: No mode
    Five Number Summary:
        min: -3.2984
        lowq: -0.7186
        med: -0.0454
        upq: 0.6769
        max: 3.2708


Column: E
    Mean: 15.4567
    Midrange: 19.1057
    Mode: [ ]
    Modality: No mode
    Five Number Summary:
        min: -16.4951
        lowq: 8.5649
        med: 15.4411
        upq: 22.4839
        max: 54.7065


Column: F
    Mean: 5.9200
    Midrange: 6.0000
    Mode: [ 6.0000 ]
    Modality: Unimodal
    Five Number Summary:
        min: 1.0000
        lowq: 3.0000
        med: 6.0000
        upq: 9.0000
        max: 11.0000


      0      1      2      3      4      5      6      7      8      9   