Kaito Minami

## Problem 4 [Matrix transpose and inverse] 20 points
\[C](C) Generate at random 3 matrices of size 3 × 3 and fill each entry with a random integer chosen from -10 to 10. Use an existing package to compute the inverse of each matrix if it exists.

Include in your report:
 - Each of the 3 matrices
 - The inverses computed with the package
 - Compute the product of each matrix with its inverse to check that you obtain the identity matrix.

In [1]:
import numpy as np
import random as rand

In [2]:
mat1 = np.zeros((3,3))
mat2 = np.zeros((3,3))
mat3 = np.zeros((3,3))

for mat in [mat1, mat2, mat3]:
    for i in range(mat.shape[0]):
        for j in range(mat.shape[1]):
            mat[i][j] = rand.randint(-10,10)
        
print(mat1)
print(mat2)
print(mat3)

[[ 3. -2.  6.]
 [ 4. -1.  6.]
 [ 2. -3.  4.]]
[[-8.  4. 10.]
 [-5.  6. -6.]
 [-3. -8.  8.]]
[[-10.   2.  -2.]
 [ -6.   8.  -2.]
 [ -5.   8. -10.]]


In [3]:
""" 
A = [[a,b],[c,d]]
A^-1 = 1/(ad-bc) [[d, -b],[-c, a]]
A^-1 = (1/|A|)Adj A
"""

def find_inverse(mat):
    """ find the inverse matrix of given matrix
    mat (Numpy array): input matrix
    returns:: inverse matrix of input matrix
    """
    # for convenience, assume 3x3 shape
    iv_mat = np.zeros(mat.shape)
    det = 0.0
    odd = True
    
    for i in range(mat.shape[0]):
        col = list(range(mat.shape[0]))
        col.remove(i)
        if i%2 == 0:
            det += mat[0][i] * ((mat[1][col[0]] * mat[2][col[1]]) - (mat[1][col[1]] * mat[2][col[0]]))
        else:
            det -= mat[0][i] * ((mat[1][col[0]] * mat[2][col[1]]) - (mat[1][col[1]] * mat[2][col[0]]))
            
    if det == 0:
        print('This matrix does not have inverse')
        return 0

    for i in range(iv_mat.shape[0]):
        col = list(range(mat.shape[0]))
        col.remove(i)
        for j in range(iv_mat.shape[1]):
            row = list(range(mat.shape[1]))
            row.remove(j)
            if odd:
                sign = 1.0
            else:
                sign = -1.0
            
            iv_mat[i][j] = (sign/det) * ((mat[row[0]][col[0]] * mat[row[1]][col[1]]) - (mat[row[0]][col[1]] * mat[row[1]][col[0]]))
            odd = not odd

    print(mat)
    return iv_mat

def multiply_matrix(mat1, mat2):
    """ Multiplies and returns the result of two matrices multiplication
    mat1: Matrix 1
    mat2: Matrix 2
    result:: Result of Matrix 1 and Matrix 2 multiplication
    """
    # for convenience, assume 3x3
    mul = np.zeros((mat2.shape[0], mat1.shape[1]))
    idx = 0
    jdx = 0
    val = 0
    count = 0
    
    while count < mat2.shape[0] * mat1.shape[1]:
        for j in range(mat2.shape[1]):
            val += mat1[idx][j] * mat2[j][jdx]
        mul[idx][jdx] = val
        val = 0
        jdx += 1
        count += 1
        
        if count%mat2.shape[0] == 0:
            idx += 1
        if jdx == mat1.shape[1]:
            jdx = 0
        
    return mul

def scaling_matrix(mat):
    """ Scaling matrix to achieve identity or ranked matrix
    mat (Numpy array): input matrix
    returns:: scaled input matrix
    """
    # for convenience, assume 3x3
    for i in range(mat.shape[0]):
        mat[0][i] /= mat[0][0]
        mat[1][i] -= mat[0][i] * mat[1][i]
        mat[2][i] -= mat[0][i] * mat[2][i]
        
    for i in range(mat.shape[0]):
        mat[1][i] /= mat[1][1]
        mat[0][i] -= mat[1][i] * mat[0][i]
        mat[2][i] -= mat[1][i] * mat[2][i]
        
    for i in range(mat.shape[0]):
        mat[2][i] /= mat[2][2]
        mat[0][i] -= mat[2][i] * mat[0][i]
        mat[1][i] -= mat[2][i] * mat[1][i]
        
    return mat

def computational_result(mat):
    return multiply_matrix(mat, find_inverse(mat))

In [4]:
find_inverse(mat1)

[[ 3. -2.  6.]
 [ 4. -1.  6.]
 [ 2. -3.  4.]]


array([[-1.4,  1. ,  0.6],
       [ 0.4, -0. , -0.6],
       [ 1. , -0.5, -0.5]])

In [6]:
find_inverse(mat2)

[[-8.  4. 10.]
 [-5.  6. -6.]
 [-3. -8.  8.]]


array([[ 0.        , -0.13793103, -0.10344828],
       [ 0.07142857, -0.04187192, -0.12068966],
       [ 0.07142857, -0.09359606, -0.03448276]])

In [8]:
find_inverse(mat3)

[[-10.   2.  -2.]
 [ -6.   8.  -2.]
 [ -5.   8. -10.]]


array([[-0.11510791,  0.00719424,  0.02158273],
       [-0.08992806,  0.1618705 , -0.01438849],
       [-0.01438849,  0.12589928, -0.12230216]])

In [10]:
computational_result(mat1)

[[ 3. -2.  6.]
 [ 4. -1.  6.]
 [ 2. -3.  4.]]


array([[ 1.0000000e+00,  0.0000000e+00,  4.4408921e-16],
       [-8.8817842e-16,  1.0000000e+00,  4.4408921e-16],
       [ 0.0000000e+00,  0.0000000e+00,  1.0000000e+00]])

In [11]:
scaling_matrix(computational_result(mat1))

[[ 3. -2.  6.]
 [ 4. -1.  6.]
 [ 2. -3.  4.]]


array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [12]:
computational_result(mat2)

[[-8.  4. 10.]
 [-5.  6. -6.]
 [-3. -8.  8.]]


array([[ 1.00000000e+00,  0.00000000e+00, -5.55111512e-17],
       [ 0.00000000e+00,  1.00000000e+00,  2.77555756e-17],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00]])

In [13]:
scaling_matrix(computational_result(mat2))

[[-8.  4. 10.]
 [-5.  6. -6.]
 [-3. -8.  8.]]


array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [14]:
computational_result(mat3)

[[-10.   2.  -2.]
 [ -6.   8.  -2.]
 [ -5.   8. -10.]]


array([[ 1.00000000e+00,  5.55111512e-17, -2.77555756e-17],
       [-4.16333634e-17,  1.00000000e+00,  0.00000000e+00],
       [-1.11022302e-16,  2.22044605e-16,  1.00000000e+00]])

In [15]:
scaling_matrix(computational_result(mat3))

[[-10.   2.  -2.]
 [ -6.   8.  -2.]
 [ -5.   8. -10.]]


array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

## [C] Problem 5 [Average, variance, and correlation] 20 points
Dataset: The dataset for this assignment is available here. The prediction task is to predict the
price of a house (column price) given the other features. Please ignore the columns id and date, as well as the categorical column zipcode. File “kc house data.csv” includes all the records in the dataset. You should use the entire dataset for the assignment. You can also find a Word document including the feature description in the same folder.

In this problem, we will perform some exploratory data analysis using the house price dataset.

(a) For each feature, write code to compute the average value, the min and max values, as well as its variance.
Which features have the lowest and the highest average? Include the feature name and their average values for the features with the lowest and highest average.
Which features have the lowest and the highest variance? Include the feature name and their variance values for the features with the lowest and highest variance.

In [16]:
import pandas as pd

In [17]:
kc_house = pd.read_csv('hw1_data/kc_house_data.csv')
# print(kc_house.head())

# Create your own mean, max, min, variation, correlation
# print(kc_house.describe())
# print('Variables: \n', kc_house.var(numeric_only = True))

In [18]:
def get_avg(df):
    """ get average value stats for each columns in input DataFrame
    df (DataFrame): data
    returns:: dictionary of average value stats for each columns
    """
    col = list(df.columns)
    avg_dic = dict()
    summed = 0
    for c in col:
        for i in range(df.shape[0]):
            if isinstance(df.loc[i, c], str):
                summed = 'null'
            else:
                summed += df.at[i, c]
        if summed != 'null':
            avg_dic[c] = summed / df.shape[0]
        summed = 0
    return avg_dic

def get_min(df):
    """ get minimum value stats for each columns in input DataFrame
    df (DataFrame): data
    returns:: dictionary of minimum value stats for each columns
    """
    col = list(df.columns)
    min_dic = dict()
    
    for c in col:
        min_place = df.at[0, c]
        for i in range(df.shape[0]):
            if min_place > df.at[i, c]:
                min_place = df.at[i, c]
        if isinstance(min_place, str) == False:
            min_dic[c] = min_place
    
    return min_dic

def get_max(df):
    """ get maximum value stats for each columns in input DataFrame
    df (DataFrame): data
    returns:: dictionary of maximum value stats for each columns
    """
    col = list(df.columns)
    max_dic = dict()
    
    for c in col:
        max_place = df.at[0, c]
        for i in range(df.shape[0]):
            if max_place < df.at[i, c]:
                max_place = df.at[i, c]
        if isinstance(max_place, str) == False:
            max_dic[c] = max_place
    
    return max_dic

def get_var(df):
    """ get variance value stats for each columns in input DataFrame
    df (DataFrame): data
    returns:: dictionary of variance value stats for each columns
    """
    col = list(df.columns)
    avg = get_avg(df)
    var_dic = dict()
    summed = 0
    
    for c in col:
        if c in avg.keys():
            for i in range(df.shape[0]):
                summed += (df.at[i,c] - avg[c])**2
            var_dic[c] = summed / (df.shape[0] - 1)
            summed = 0
    
    return var_dic

def find_highlow(dic):
    """ get the highest and lowest value index out of dictionary
    dic (dict): dictionary of key (str) and value (int or float)
    returns:: list with highest and lowest value index [highest, lowest]
    """
    val = list(dic.values())
    min_place = val[0]
    max_place = val[0]
    i = 0
    j = 0
    k = 0
    for v in val:
        if v > max_place:
            max_place = v
            i = k
        if v < min_place:
            min_place = v
            j = k
        k += 1
    return [i,j]

In [19]:
get_avg(kc_house)

{'id': 4580301520.864988,
 'price': 540088.1417665294,
 'bedrooms': 3.37084162309721,
 'bathrooms': 2.1147573219821405,
 'sqft_living': 2079.8997362698374,
 'sqft_lot': 15106.967565816869,
 'floors': 1.4943089807060566,
 'waterfront': 0.007541757275713691,
 'view': 0.23430342849211122,
 'condition': 3.4094295100171195,
 'grade': 7.656873178179799,
 'sqft_above': 1788.3906907879516,
 'sqft_basement': 291.5090454818859,
 'yr_built': 1971.0051357978994,
 'yr_renovated': 84.40225790033776,
 'zipcode': 98077.93980474715,
 'lat': 47.56005251931704,
 'long': -122.21389640494083,
 'sqft_living15': 1986.552491556008,
 'sqft_lot15': 12768.455651691113}

In [20]:
get_min(kc_house)

{'id': 1000102,
 'price': 75000.0,
 'bedrooms': 0,
 'bathrooms': 0.0,
 'sqft_living': 290,
 'sqft_lot': 520,
 'floors': 1.0,
 'waterfront': 0,
 'view': 0,
 'condition': 1,
 'grade': 1,
 'sqft_above': 290,
 'sqft_basement': 0,
 'yr_built': 1900,
 'yr_renovated': 0,
 'zipcode': 98001,
 'lat': 47.1559,
 'long': -122.519,
 'sqft_living15': 399,
 'sqft_lot15': 651}

In [21]:
get_max(kc_house)

{'id': 9900000190,
 'price': 7700000.0,
 'bedrooms': 33,
 'bathrooms': 8.0,
 'sqft_living': 13540,
 'sqft_lot': 1651359,
 'floors': 3.5,
 'waterfront': 1,
 'view': 4,
 'condition': 5,
 'grade': 13,
 'sqft_above': 9410,
 'sqft_basement': 4820,
 'yr_built': 2015,
 'yr_renovated': 2015,
 'zipcode': 98199,
 'lat': 47.7776,
 'long': -121.315,
 'sqft_living15': 6210,
 'sqft_lot15': 871200}

In [22]:
get_var(kc_house)

{'id': 8.274629486057814e+18,
 'price': 134782378397.24681,
 'bedrooms': 0.8650150097573724,
 'bathrooms': 0.5931512887355798,
 'sqft_living': 843533.6813681519,
 'sqft_lot': 1715658774.1754541,
 'floors': 0.29158800687709074,
 'waterfront': 0.007485225502689098,
 'view': 0.5872426169774596,
 'condition': 0.42346651239404876,
 'grade': 1.3817032893475767,
 'sqft_above': 685734.6672685045,
 'sqft_basement': 195872.66840094145,
 'yr_built': 862.7972621659717,
 'yr_renovated': 161346.2118623043,
 'zipcode': 2862.7878348129493,
 'lat': 0.01919990179600803,
 'long': 0.019832622017890593,
 'sqft_living15': 469761.23994532274,
 'sqft_lot15': 745518225.3404043}

In [23]:
avg = get_avg(kc_house)
highlow = find_highlow(avg)

print('Highest average is: ', list(avg.keys())[highlow[0]], list(avg.values())[highlow[0]])
print('Lowest average is: ', list(avg.keys())[highlow[1]], list(avg.values())[highlow[1]])

Highest average is:  id 4580301520.864988
Lowest average is:  long -122.21389640494083


In [24]:
var = get_var(kc_house)
highlow = find_highlow(var)

print('Highest variance is: ', list(var.keys())[highlow[0]], list(var.values())[highlow[0]])
print('Lowest variance is: ', list(var.keys())[highlow[1]], list(var.values())[highlow[1]])

Highest variance is:  id 8.274629486057814e+18
Lowest variance is:  waterfront 0.007485225502689098


(b) Compute the correlation coefficient of each feature with the response. Include a table with the correlation coefficient of each feature with the response. Which features are positively correlated (i.e., have positive correlation coefficient) with the response? Which feature has the highest positive correlation with the response?

In [25]:
def get_corr(df, response):
    """ get correlation value stats for each columns relative to response column in input DataFrame
    df (DataFrame): data
    response (str): a column name for response/target variable
    returns:: dictionary of correlation value stats for each columns relative to response column
    """
    col = list(df.columns)
    corr_dic = dict()
    
    avg = get_avg(df)
    y_hat = avg[response]
    y_hats = sum([(y - y_hat)**2 for y in list(df[response])])
    
    for c in col:
        sigma_xy = 0
        if c in avg.keys() and c != response:
            x_hat = avg[c]
            x_hats = sum([(x - x_hat)**2 for x in list(df[c])])
            
            for i in range(df.shape[0]):
                sigma_xy += (df.at[i, c] - x_hat) * (df.at[i, response] - y_hat)
            
            corr_dic[c] = sigma_xy / ((x_hats * y_hats)**(1/2))
    
    return corr_dic

def find_pos(dic):
    """ get the positive value index out of dictionary
    dic (dict): dictionary of key(str) and value(int or float)
    returns:: list of positive value index
    """
    val = list(dic.values())
    pos = []
    k = 0
    
    for v in val:
        if v > 0:
            pos.append(k)
        k += 1
    
    return pos

In [26]:
get_corr(kc_house, 'price')

{'id': -0.016762196614445113,
 'bedrooms': 0.3083495981456364,
 'bathrooms': 0.5251375054139724,
 'sqft_living': 0.7020350546118009,
 'sqft_lot': 0.08966086058710003,
 'floors': 0.25679388755070176,
 'waterfront': 0.26636943403055346,
 'view': 0.3972934882944871,
 'condition': 0.03636178912899409,
 'grade': 0.667434256020255,
 'sqft_above': 0.6055672983560842,
 'sqft_basement': 0.323816020712004,
 'yr_built': 0.05401153149478604,
 'yr_renovated': 0.12643379344092243,
 'zipcode': -0.05320285429832495,
 'lat': 0.3070034799952177,
 'long': 0.021626241039307104,
 'sqft_living15': 0.5853789035795697,
 'sqft_lot15': 0.08244715251948594}

In [27]:
corr = get_corr(kc_house, 'price')
pos = find_pos(corr)

print('Positive correlation:\n')
print({list(corr.keys())[i]: list(corr.values())[i] for i in pos})

Positive correlation:

{'bedrooms': 0.3083495981456364, 'bathrooms': 0.5251375054139724, 'sqft_living': 0.7020350546118009, 'sqft_lot': 0.08966086058710003, 'floors': 0.25679388755070176, 'waterfront': 0.26636943403055346, 'view': 0.3972934882944871, 'condition': 0.03636178912899409, 'grade': 0.667434256020255, 'sqft_above': 0.6055672983560842, 'sqft_basement': 0.323816020712004, 'yr_built': 0.05401153149478604, 'yr_renovated': 0.12643379344092243, 'lat': 0.3070034799952177, 'long': 0.021626241039307104, 'sqft_living15': 0.5853789035795697, 'sqft_lot15': 0.08244715251948594}


In [28]:
pos_corr = {list(corr.keys())[i]: list(corr.values())[i] for i in pos}
high = find_highlow(pos_corr)[0]

print('Highest positive correlation: ', list(pos_corr.keys())[high], list(pos_corr.values())[high])

Highest positive correlation:  sqft_living 0.7020350546118009


(c) Were you able to find any features with a negative correlation coefficient with the response?

In [29]:
neg = list(range(len(list(corr.values()))))
[neg.remove(i) for i in pos][0]

print('Negative correlation:\n')
print({list(corr.keys())[i]: list(corr.values())[i] for i in neg})

Negative correlation:

{'id': -0.016762196614445113, 'zipcode': -0.05320285429832495}
