In [3]:
from IPython import get_ipython
import math
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
import scipy.stats as st
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import paired_distances
from sklearn.preprocessing import MinMaxScaler
import sys

#pd.set_eng_float_format(accuracy=4, use_eng_prefix=True)

In [4]:
# Ask interactively for the CSV path, then load it into `df`, the DataFrame
# that the q1..q6 calls further down receive as their argument.
file_name = input("Enter name of file containing data: ")
df = pd.read_csv(file_name)

Enter name of file containing data: hwk01.csv


In [5]:
def get_mean(series):
    """Return the mean of ``series`` as computed by its own ``mean`` method."""
    average = series.mean()
    return average

In [6]:
def get_midrange(series):
    """Return the midrange of ``series``: the average of its minimum and maximum."""
    lowest = series.min()
    highest = series.max()
    return (lowest + highest) / 2

In [7]:
def get_mode(series):
    """Return every modal value of ``series`` as an array (``series.mode().values``)."""
    modes = series.mode()
    return modes.values

In [8]:
def get_modality(mode_arr):
    """Name the modality implied by the number of modes in ``mode_arr``.

    ``mode_arr`` must expose a ``size`` attribute. Sizes 1, 2 and 3 map to
    "Unimodal", "Bimodal" and "Trimodal"; any other size yields "Multimodal".
    """
    names_by_count = {1: "Unimodal", 2: "Bimodal", 3: "Trimodal"}
    return names_by_count.get(mode_arr.size, "Multimodal")

In [9]:
def five_num_summ(series):
    """Return the five-number summary of ``series`` as a float Series.

    The result is indexed by 'min', 'lowq', 'med', 'upq' and 'max', holding the
    minimum, 0.25 quantile, median, 0.75 quantile and maximum respectively.
    """
    labels = ['min', 'lowq', 'med', 'upq', 'max']
    values = [
        series.min(),
        series.quantile(0.25),
        series.median(),
        series.quantile(0.75),
        series.max(),
    ]
    return pd.Series(values, index=labels, dtype=float)

In [10]:
def q1_data_statistics(data):
    """Print mean, midrange, mode(s), modality and five-number summary per column.

    Reports on columns 'C', 'D', 'E' and 'F' of ``data``. When every value in a
    column is its own mode (the mode array is as long as the column), the column
    is reported as having no mode and an empty mode list is printed.

    Args:
        data: DataFrame containing the columns 'C', 'D', 'E' and 'F'.

    Returns:
        None. All results are written to standard output.
    """
    print("Q1. Data Statistics")
    for col in data.loc[:, ['C', 'D', 'E', 'F']]:
        series = data[col]
        mean = get_mean(series)
        midrange = get_midrange(series)
        mode = get_mode(series)
        if mode.size == series.size:
            # Every value ties for most frequent, so no value stands out as a mode.
            modality = "No mode"
            mode = []
        else:
            modality = get_modality(mode)

        summary = five_num_summ(series)
        print("Column: {}".format(col))
        print("    Mean: {:.4f}".format(mean))
        print("    Midrange: {:.4f}".format(midrange))
        print("    Mode: [ ", end='')
        for val in mode:
            print("{:.4f} ".format(val), end='')
        print("]")
        print("    Modality: {}".format(modality))
        print("    Five Number Summary:")
        # Series.iteritems() was removed in pandas 2.0; items() is the
        # equivalent that works on old and new versions alike.
        for (ind, item) in summary.items():
            print("        {}: {:.4f}".format(ind, item))

In [11]:
# Print the Q1 statistics report for the DataFrame loaded from the CSV file.
q1_data_statistics(df)

Q1. Data Statistics
Column: C
    Mean: 5184.6630
    Midrange: 5037.0000
    Mode: [ 589.0000 6930.0000 ]
    Modality: Bimodal
    Five Number Summary:
        min: 78.0000
        lowq: 2795.5000
        med: 5180.0000
        upq: 7557.2500
        max: 9996.0000
Column: D
    Mean: -0.0348
    Midrange: -0.0138
    Mode: [ ]
    Modality: No mode
    Five Number Summary:
        min: -3.2984
        lowq: -0.7186
        med: -0.0454
        upq: 0.6769
        max: 3.2708
Column: E
    Mean: 15.4567
    Midrange: 19.1057
    Mode: [ ]
    Modality: No mode
    Five Number Summary:
        min: -16.4951
        lowq: 8.5649
        med: 15.4411
        upq: 22.4839
        max: 54.7065
Column: F
    Mean: 5.9200
    Midrange: 6.0000
    Mode: [ 6.0000 ]
    Modality: Unimodal
    Five Number Summary:
        min: 1.0000
        lowq: 3.0000
        med: 6.0000
        upq: 9.0000
        max: 11.0000


In [12]:
def eq_depth_bin(series, depth, by='none'):
    """Smooth ``series`` with equal-depth (equal-frequency) binning.

    The values are sorted and cut into consecutive bins of ``depth`` elements.
    The whole operation is done on a reshaped numpy array rather than by
    writing one cell at a time into a scratch DataFrame.

    Args:
        series: pandas Series of numeric values.
        depth: number of values per bin; must divide ``len(series)`` evenly.
        by: smoothing rule. 'mean' replaces each value with its bin mean;
            'boundary' replaces each value with the nearer of the bin's first
            and last value (ties go to the first); anything else leaves the
            sorted values unchanged.

    Returns:
        A float Series of the smoothed values in sorted order with a fresh
        0..n-1 index, or None when ``len(series)`` is not a multiple of
        ``depth``.
    """
    if len(series) % depth != 0:
        return None

    num_bins = len(series) // depth
    # One row per bin, values ascending within and across rows.
    bins = series.sort_values().values.reshape(num_bins, depth)

    if by == 'boundary':
        low = bins[:, :1]
        high = bins[:, -1:]
        # "<=" sends values equidistant from both boundaries to the lower one.
        nearer_low = np.abs(bins - low) <= np.abs(bins - high)
        smoothed = np.where(nearer_low, low, high)
    elif by == 'mean':
        # Use the pandas row mean so NaN values are skipped.
        bin_means = pd.DataFrame(bins.astype(float)).mean(axis=1).values
        smoothed = np.repeat(bin_means, depth)
    else:
        smoothed = bins

    return pd.Series(np.asarray(smoothed, dtype=float).reshape(len(series)))

In [13]:
def eq_width_by_med_bin(series, bins=10):
    """Smooth ``series`` by equal-width binning, replacing values by bin medians.

    The sorted values are binned with ``scipy.stats.binned_statistic`` and each
    value is replaced by the median of the bin it falls into. The result is
    returned in ascending order with a fresh 0..n-1 index.
    """
    ordered = series.sort_values().values
    medians, _edges, bin_ids = st.binned_statistic(ordered, ordered, 'median', bins)
    # Bin ids are shifted down by one to index into the medians array.
    smoothed = medians[bin_ids - 1].astype(float)
    return pd.Series(smoothed).sort_values().reset_index(drop=True)

In [14]:
def q2_smoothing(data):
    """Print column 'F' of ``data`` next to three smoothed versions of it.

    The smoothed versions are equal-depth binning (depth 100) by bin mean,
    equal-depth binning (depth 100) by bin boundary, and equal-width binning
    (10 bins) by bin median. Nothing is returned.
    """
    print("Q2. Smoothing on column F")
    col = data.loc[:, 'F'].copy()

    # Compute every result before printing any of them.
    sections = [
        ("Column F:", col),
        ("Equal-depth binning by mean:", eq_depth_bin(col, 100, 'mean')),
        ("Equal-depth binning by boundary:", eq_depth_bin(col, 100, 'boundary')),
        ("Equal-width binning by median:", eq_width_by_med_bin(col, 10)),
    ]

    for heading, values in sections:
        print(heading)
        print(values)

In [15]:
# Print the Q2 smoothing results for column F of the loaded DataFrame.
q2_smoothing(df)

Q2. Smoothing on column F
Column F:
0       6
1       7
2       3
3       1
4       2
5       9
6       3
7       9
8       8
9       4
10     10
11      3
12      4
13      5
14      7
15      5
16      6
17      3
18     10
19      1
20      1
21      4
22      2
23      9
24      3
25      1
26      4
27      7
28      2
29      6
       ..
970     7
971     4
972     3
973    10
974     5
975     3
976     1
977     5
978     2
979     1
980     3
981     7
982     3
983     8
984     1
985     1
986     3
987     8
988     4
989     4
990     9
991     9
992    10
993     5
994    10
995     1
996     7
997     5
998     7
999     5
Name: F, Length: 1000, dtype: int64
Equal-depth binning by mean:
0       1.06
1       1.06
2       1.06
3       1.06
4       1.06
5       1.06
6       1.06
7       1.06
8       1.06
9       1.06
10      1.06
11      1.06
12      1.06
13      1.06
14      1.06
15      1.06
16      1.06
17      1.06
18      1.06
19      1.06
20      1.06
21      1.06
22 

In [16]:
def minmax_norm(series, scale_range):
    """Min-max normalise ``series`` into ``scale_range``.

    Args:
        series: the values to scale. A 1-D Series, list or array is treated as
            a single feature and turned into a column, because MinMaxScaler
            only accepts 2-D input. A 2-D array is passed through unchanged,
            so callers that already hand in an (n, 1) column keep working.
        scale_range: ``(low, high)`` tuple given to MinMaxScaler as
            ``feature_range``.

    Returns:
        A Series holding the scaled values, flattened to one dimension.
    """
    values = np.asarray(series, dtype=float)
    if values.ndim == 1:
        values = values.reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=scale_range)
    normalized = scaler.fit_transform(values)
    return pd.Series(normalized.reshape(-1))

In [17]:
def zscore_norm(series):
    """Return the z-scores of ``series`` (via ``scipy.stats.zscore``) as a Series."""
    scores = st.zscore(series)
    return pd.Series(scores)

In [18]:
def dec_scale_norm(series):
    """Normalise ``series`` by decimal scaling.

    Every value is divided by ``10 ** j``, where ``j`` is the smallest integer
    for which the largest absolute scaled value is strictly below 1.

    The exponent is derived from the largest *absolute* value, so negative data
    is handled. ``math.floor`` is used rather than ``int`` so that maxima below
    1 are treated the same way as maxima above 1, and ``math.log10`` replaces
    ``math.log(x, 10)``, which can land just under an exact power of ten.

    Args:
        series: numeric pandas Series.

    Returns:
        The scaled Series. An all-zero Series is returned divided by 1.
    """
    max_abs = series.abs().max()
    if max_abs == 0:
        # log10(0) is undefined; zeros already satisfy |v| < 1.
        dec_num = 0
    else:
        dec_num = math.floor(math.log10(max_abs)) + 1
        # Safety net in case log10 still rounds below an exact power of ten.
        if max_abs / 10 ** dec_num >= 1:
            dec_num += 1
    return series / 10 ** dec_num

In [19]:
def q3_normalization(data):
    """Print column 'C' of ``data`` and three normalised versions of it.

    The normalisations are min-max scaling into (-1, 1), z-score, and decimal
    scaling; each normalised result is rounded to 4 decimals for display.
    Nothing is returned.
    """
    print("Q3. Normalization on column C")
    col_c = data.loc[:, 'C']

    # minmax_norm is handed an (n, 1) column array rather than the Series.
    minmax = minmax_norm(col_c.values.reshape(-1, 1), (-1, 1))
    zscore = zscore_norm(col_c)
    decnorm = dec_scale_norm(col_c)

    sections = [
        ("Column C:", col_c),
        ("Min-Max normalization:", minmax.round(4)),
        ("Z-Score normalization:", zscore.round(4)),
        ("Decimal scaling normalization:", decnorm.round(4)),
    ]
    for heading, values in sections:
        print(heading)
        print(values)

In [20]:
# Print the Q3 normalization results for column C of the loaded DataFrame.
q3_normalization(df)

Q3. Normalization on column C
Column C:
0       964
1      5342
2      7265
3      5787
4      2889
5      8005
6      7867
7      9957
8       876
9      3433
10     6208
11     1282
12     7901
13      413
14     5785
15     8454
16     4361
17     3582
18     2653
19      598
20     9996
21     3041
22      996
23     3712
24      649
25      758
26     7216
27     5914
28     3931
29     2338
       ... 
970    7145
971    9677
972    2477
973    5591
974     715
975    1013
976    6110
977    8180
978    3827
979     589
980     168
981    3597
982    3842
983    9973
984    2426
985    6363
986    9733
987    4482
988    2935
989    1037
990    4457
991    5942
992    4389
993    1635
994    7237
995     462
996    7285
997    8728
998    2143
999    1301
Name: C, Length: 1000, dtype: int64
Min-Max normalization:
0     -0.8213
1      0.0615
2      0.4493
3      0.1512
4     -0.4332
5      0.5985
6      0.5707
7      0.9921
8     -0.8391
9     -0.3235
10     0.2361
11    -0.7572
1

In [21]:
def pca_reduce(df, cols, p):
    """Project the columns ``cols`` of ``df`` onto ``p`` principal components.

    Returns a new DataFrame whose columns are named 'p1' .. 'p<p>' and which
    carries a default integer index.
    """
    projected = PCA(n_components=p).fit_transform(df.loc[:, cols])
    component_names = ["p{}".format(number) for number in range(1, p + 1)]
    return pd.DataFrame(projected, columns=component_names)

In [22]:
def q4_reduction(data):
    """Print columns C, D, E, F of ``data`` and their 2-component PCA reduction.

    Both tables are rounded to 4 decimals for display. Nothing is returned.
    """
    print("Q4. Data reduction on columns C, D, E, and F")
    numeric_cols = ['C', 'D', 'E', 'F']
    reduced = pca_reduce(data, numeric_cols, 2)

    sections = (
        ("Columns C, D, E, and F:", data.loc[:, numeric_cols]),
        ("PCA reduced to 2 columns:", reduced),
    )
    for heading, table in sections:
        print(heading)
        print(table.round(4))

In [23]:
# Print the Q4 PCA reduction for columns C, D, E and F of the loaded DataFrame.
q4_reduction(df)

Q4. Data reduction on columns C, D, E, and F
Columns C, D, E, and F:
        C       D        E   F
0     964 -0.5886   9.7552   6
1    5342 -1.3554  31.3374   7
2    7265 -0.3449   0.6031   3
3    5787 -0.7290  10.1490   1
4    2889  1.1961  21.8565   2
5    8005  1.2350  21.4052   9
6    7867 -1.5103  18.2359   3
7    9957 -0.6635   2.3715   9
8     876  1.3090  13.3939   8
9    3433  0.8349   7.0054   4
10   6208 -1.2269  28.4236  10
11   1282  0.6342  30.1564   3
12   7901 -1.0383   3.1018   4
13    413 -1.3286  12.7588   5
14   5785  0.7806  20.7937   7
15   8454 -0.2354  18.4600   5
16   4361 -1.7060  13.3747   6
17   3582 -0.5630   8.6505   3
18   2653  0.3862  12.2411  10
19    598 -0.5315  21.2430   1
20   9996 -1.9680  28.2145   1
21   3041 -0.5483  30.6519   4
22    996  1.2087   9.0080   2
23   3712  0.6086   6.3613   9
24    649  1.7471   2.7456   3
25    758  0.0977  20.3141   1
26   7216 -1.5288  -5.1068   4
27   5914  1.0135  24.7603   7
28   3931 -0.6481  31.9692   2
2

In [35]:
def min_dist(df, tup, version="euclidean", norm=False):
    """Find the row of ``df`` closest to the query point ``tup``.

    Each row is compared using ``row[4:]`` of its ``itertuples()`` tuple. That
    tuple starts with the index, so the slice drops the index and the first
    three columns and keeps whatever columns follow.

    Args:
        df: DataFrame to search.
        tup: query point; any sequence or array, reshaped once to a (1, n) row.
        version: "supremum" selects scipy's Chebyshev distance; any other value
            is passed to ``paired_distances`` as its ``metric``.
        norm: when True, each row's values go through ``norm_point`` before the
            comparison. The query is used as given, so it should already be
            normalised by the caller.

    Returns:
        ``(distance, row)`` where ``row`` is the closest row as a Series. On
        ties the earliest row wins.
    """
    # Loop-invariant: shape the query once, whether or not norm is set.
    query = np.asarray(tup).reshape(1, -1)
    # Infinity is a safe "no candidate yet" sentinel for any finite distance.
    best_dist = float("inf")
    best_pos = 0
    for pos, row in enumerate(df.itertuples()):
        features = row[4:]
        if norm:
            features = norm_point(features)
        candidate = np.array(features).reshape(1, -1)

        if version == "supremum":
            curr_dist = cdist(query, candidate, "chebyshev")[0][0]
        else:
            curr_dist = paired_distances(query, candidate, metric=version)[0]

        if curr_dist < best_dist:
            best_dist = curr_dist
            best_pos = pos

    # Remember the row *position* rather than its index label, so iloc returns
    # the right row even when df does not have a default 0..n-1 index.
    return (best_dist, df.iloc[best_pos])

In [25]:
def norm_point(tup):
    """Return each squared component of ``tup`` divided by the sum of squares.

    The returned values therefore add up to 1 for any non-zero input.

    Args:
        tup: iterable of numbers (tuple, list, numpy array or pandas Series).
            Values are read by iteration, never by integer subscripting, so a
            Series with non-integer labels is handled without depending on
            positional-fallback indexing.

    Returns:
        A list with one normalised value per input component.
    """
    values = list(tup)
    total = sum(value ** 2 for value in values)
    return [value ** 2 / total for value in values]

In [38]:
def q5_similarity_distance(data):
    """Prompt for a 6-tuple and report the closest row of ``data`` per metric.

    The six whitespace-separated fields are read from standard input. The first
    two are ignored; the last four are parsed as int, float, float and int and
    form the query point. The closest row is reported for the euclidean,
    manhattan, cosine and supremum distances, followed by a euclidean search on
    ``norm_point``-normalised values. That final section prints the normalised
    closest row and, for comparison, the normalised cosine winner.

    Args:
        data: DataFrame searched by ``min_dist``.

    Returns:
        None. All results are written to standard output.

    Raises:
        ValueError: if the input does not contain exactly six fields or a
            numeric field cannot be parsed.
    """
    print("Q5. Similarity and distance on columns C, D, E, and F")
    # split() with no argument tolerates repeated, leading and trailing
    # whitespace, which split(' ') would turn into empty fields.
    a, b, c, d, e, f = input("Enter a 6-tuple separated by spaces: ").split()
    print()
    # Only the four numeric fields take part in the distance computations.
    tup = np.array((int(c), float(d), float(e), int(f))).reshape(1, -1)

    wrong_norm = None
    wrong_name = None
    for metric in ("euclidean", "manhattan", "cosine", "supremum"):
        dist, min_row = min_dist(data, tup, metric)
        print("Min row for {} distance:".format(metric))
        print("Distance: {:.4f}".format(dist))
        print(min_row)
        print()
        if metric == "cosine":
            # Keep the cosine winner so it can be shown next to the
            # normalised-euclidean winner at the end of the report.
            wrong_norm = norm_point(min_row[3:])
            wrong_name = min_row.name

    tup = tup.reshape(4)
    norm = norm_point(tup)

    dist, min_row = min_dist(data, norm, norm=True)
    print("Normalized point:")
    print("[{p[0]:.4f} {p[1]:.4f} {p[2]:.4f} {p[3]:.4f}]".format(p=norm))
    print("Min row for normalized euclidean distance:")
    print("Distance: {:.4f}".format(dist))
    print("{} [{p[0]:.4f} {p[1]:.4f} {p[2]:.4f} {p[3]:.4f}]".format(min_row.name, p=norm_point(min_row[3:])))
    print("{} [{p[0]:.4f} {p[1]:.4f} {p[2]:.4f} {p[3]:.4f}]".format(wrong_name, p=wrong_norm))

In [39]:
# Run the interactive Q5 nearest-row report against the loaded DataFrame.
q5_similarity_distance(df)

Q5. Similarity and distance on columns C, D, E, and F
Enter a 6-tuple separated by spaces: a1 b2 420 1.3980 13.3939 15

Min row for euclidean distance:
Distance: 7.4597
Unnamed: 0         621
A                   a3
B                   b1
C                  424
D             0.461535
E              17.1048
F                   10
Name: 621, dtype: object

Min row for manhattan distance:
Distance: 13.6474
Unnamed: 0         621
A                   a3
B                   b1
C                  424
D             0.461535
E              17.1048
F                   10
Name: 621, dtype: object

Min row for cosine distance:
Distance: 0.0001
Unnamed: 0        448
A                  a2
B                  b1
C                 274
D             1.16647
E             11.5122
F                  10
Name: 448, dtype: object

Min row for supremum distance:
Distance: 5.0000
Unnamed: 0         621
A                   a3
B                   b1
C                  424
D             0.461535
E              17.

In [28]:
def q6_correlation(data):
    """Print covariance, correlation and a chi-square independence test.

    Covariance and correlation matrices are printed for the numeric columns
    'C', 'D', 'E' and 'F'. A contingency table of columns 'A' and 'B' (with
    row and column totals) is printed, followed by the chi-square statistic,
    the critical value at the 0.001 significance level, and the test verdict.

    Args:
        data: DataFrame containing the columns 'A' .. 'F'.

    Returns:
        None. All results are written to standard output.
    """
    numeric = data[['C', 'D', 'E', 'F']]

    print("Q6. Correlation")
    print("Covariance of numeric columns:")
    print(numeric.cov().round(4))
    print()

    print("Correlation coefficient of numeric columns:")
    print(numeric.corr().round(4))
    print()

    contin_tab = pd.crosstab(data['A'], data['B'])
    # Run the test on the raw counts, before the totals are appended.
    chi2_stat, _p_value, dof, _expected = st.chi2_contingency(contin_tab)

    contin_tab['total'] = contin_tab.sum(axis=1)
    # DataFrame.append was removed in pandas 2.0. Build the totals row as a
    # one-row frame and concatenate it, keeping the index name as append did.
    total_row = contin_tab.sum(axis=0).rename("total").to_frame().T
    total_row.index.name = contin_tab.index.name
    contin_tab = pd.concat([contin_tab, total_row])
    print("Contingency table:")
    print(contin_tab)
    print()

    # Significance level of the test; the critical value is its upper quantile.
    alpha = 0.001
    critical = st.chi2.ppf(q=(1 - alpha), df=dof)

    print("Observed: {:.4f}".format(chi2_stat))
    print("Critical: {:.4f}".format(critical))
    if chi2_stat <= critical:
        print("Null hypothesis not rejected (values not correlated)")
    else:
        print("Null hypothesis rejected (values are correlated)")

In [29]:
# Print the Q6 covariance, correlation and chi-square report for the loaded DataFrame.
q6_correlation(df)

Q6. Correlation
Covariance of numeric columns:
              C        D         E        F
C  8.145881e+06  50.3941 -421.3179  99.6507
D  5.039410e+01   1.0032   -0.6001   0.0524
E -4.213179e+02  -0.6001  101.0113  -0.2313
F  9.965070e+01   0.0524   -0.2313   9.7473

Correlation coefficient of numeric columns:
        C       D       E       F
C  1.0000  0.0176 -0.0147  0.0112
D  0.0176  1.0000 -0.0596  0.0167
E -0.0147 -0.0596  1.0000 -0.0074
F  0.0112  0.0167 -0.0074  1.0000

Contingency table:
B       b1   b2  total
A                     
a1     174  132    306
a2     170  151    321
a3     247  126    373
total  591  409   1000

Observed: 13.4633
Critical: 13.8155
Null hypothesis not rejected (values not correlated)
