In [6]:
import pandas as pd
import numpy as np

In [7]:
# Euclidean distance between the points p and q, worked out by hand
# (no packaged distance function is used).
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

# Same formula in any number of dimensions:
# square the per-coordinate differences, add them up, take the square root.
coordinate_gaps = p - q
sum_of_squares = coordinate_gaps.pow(2).sum()
distance = np.sqrt(sum_of_squares)
print(distance)

18.16590212458495


In [11]:
# Change the order of columns of a dataframe. Interchange columns 'a' and 'c'
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
print(df)
# Selecting with a reordered label list returns a new frame; df itself keeps its column order.
interchanged_df = df[['c', 'b', 'a', 'd', 'e']]
# Show the reordered frame (printing df again would only repeat the original order).
print(interchanged_df)

    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
    c   b   a   d   e
0   2   1   0   3   4
1   7   6   5   8   9
2  12  11  10  13  14
3  17  16  15  18  19


In [22]:
# Change the order of columns of a dataframe. Create a generic function to interchange two columns, without
# hardcoding column names

# swap columns takes in three parameters, the dataframe, then the two columns to be switched.
def swap_columns(df, col1, col2):
    """Return df with the positions of columns col1 and col2 exchanged.

    Parameters:
        df: the DataFrame whose columns are to be reordered.
        col1, col2: labels of the two columns that trade places.

    Returns:
        The result of selecting df with the reordered label list; all other
        columns keep their positions.

    Raises:
        ValueError: if col1 or col2 is not among df's column labels
            (raised by the list lookup below).
    """
    # Work on a plain list copy of the labels so positions can be reassigned.
    order = [*df.columns]

    # Positions of the two labels that trade places.
    pos_first = order.index(col1)
    pos_second = order.index(col2)

    # Classic three-step exchange through a temporary.
    held = order[pos_first]
    order[pos_first] = order[pos_second]
    order[pos_second] = held

    # Selecting with the reordered labels yields the columns in the new order.
    return df[order]

# Demo frame: the integers 0..19 laid out in rows of five, columns labelled a-e.
df = pd.DataFrame(
    data=np.arange(20).reshape(-1, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)

# Exchange the first and third columns and show the reordered frame.
print(swap_columns(df, col1='a', col2='c'))

    c   b   a   d   e
0   2   1   0   3   4
1   7   6   5   8   9
2  12  11  10  13  14
3  17  16  15  18  19


In [43]:
"""
    Format or suppress scientific notation in a pandas dataframe. Suppress scientific notation such as 'e-03' in df and
    print up to 4 digits after the decimal point.
    df
    #> random
    #> 0 3.474280e-03
    #> 1 3.951517e-05
    #> 2 7.469702e-02
    #> 3 5.541282e-28
    Desired Output
    #> random
    #> 0 0.0035
"""
# Render floats in fixed-point form with four decimals instead of scientific notation.
# This is a pandas-wide display option, so it also applies to frames printed afterwards.
pd.set_option('display.float_format', '{:.4f}'.format)

# Tenth powers of four uniform samples from [0, 1): tiny values that pandas would
# otherwise print as e.g. 3.474280e-03.
df = pd.DataFrame({'random': np.random.random(4) ** 10})
print(df)

   random
0  0.0011
1  0.0182
2  0.0000
3  0.0013


In [50]:
# Build the sample frame: 40 random integers drawn from [1, 100), arranged as
# ten rows (labelled a-j) by four columns (labelled p-s).
df = pd.DataFrame(
    data=np.random.randint(1, 100, 40).reshape(10, -1),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'],
    columns=['p', 'q', 'r', 's'],
)

def find_nearest_row(df):
    """Annotate every row with its nearest other row by Euclidean distance.

    Each row is treated as a point whose coordinates are the frame's columns.
    For every row, the closest *other* row is located and two columns are
    written to df:

    * 'nearest_row' -- index label of the closest other row
    * 'dist'        -- Euclidean distance to that row

    Columns named 'nearest_row' or 'dist' that already exist (for example from
    an earlier call) are not used as coordinates and are overwritten, so the
    function can safely be re-run on its own output.

    Parameters:
        df: DataFrame whose remaining columns are all convertible to float.

    Returns:
        The same DataFrame object, modified in place with the two columns
        added. When df has fewer than two rows there is no other row to
        compare against, so 'nearest_row' is None and 'dist' is NaN.
        When several rows are equally close, the first one in row order wins.

    Raises:
        ValueError: if a coordinate column cannot be converted to float.
    """
    result_columns = ['nearest_row', 'dist']

    # Convert once, outside the loop, and leave out any result columns left by a
    # previous call (the string labels in 'nearest_row' cannot be subtracted).
    points = df.drop(columns=result_columns, errors='ignore').to_numpy(dtype=float)
    n_rows = len(points)

    nearest_rows = []
    distances = []

    if n_rows < 2:
        # No other row exists, so there is no meaningful neighbour to report.
        nearest_rows = [None] * n_rows
        distances = [np.nan] * n_rows
    else:
        for i in range(n_rows):
            # Distance from row i to every row of the frame.
            distance_to_rows = np.linalg.norm(points - points[i], axis=1)

            # A row is at distance 0 from itself; infinity rules it out without
            # relying on an arbitrary "large enough" constant.
            distance_to_rows[i] = np.inf

            nearest_index = np.argmin(distance_to_rows)
            nearest_rows.append(df.index[nearest_index])
            distances.append(distance_to_rows[nearest_index])

    df['nearest_row'] = nearest_rows
    df['dist'] = distances
    return df

# Apply the function: it adds the 'nearest_row' and 'dist' columns and returns the frame,
# which is bound back to df.
df = find_nearest_row(df)

# Display the updated DataFrame (original columns plus the two new ones)
print(df)

    p   q   r   s nearest_row    dist
a  58  41  88  21           c 17.6918
b  35   8  70   1           a 48.3942
c  60  57  90  28           a 17.6918
d  20  93  58  71           j 52.9056
e  12  50  15  19           h 34.0147
f  32  46  92  88           d 61.6279
g  83   4  27  12           b 65.4981
h  30  42  40  31           e 34.0147
i  93  44  91  54           c 43.9886
j   5  67  21  48           e 34.8569


In [52]:
"""
Question 6 (15 Points)
Correlation is a statistical technique that shows how two variables are related. The pandas DataFrame.corr() method
creates the correlation matrix: it computes the pairwise correlation of all columns in the dataframe.
Any NA values are automatically excluded, and columns with a non-numeric data type are ignored.

This isn't a question, what are we supposed to do?

"""

# Five numeric columns with five observations each.
data = dict(
    A=[45, 37, 0, 42, 50],
    B=[38, 31, 1, 26, 90],
    C=[10, 15, -10, 17, 100],
    D=[60, 99, 15, 23, 56],
    E=[76, 98, -0.03, 78, 90],
)
df = pd.DataFrame(data)

# Pairwise correlation of every column with every other column.
correlation_matrix = df.corr()
print(correlation_matrix)

       A      B      C      D      E
A 1.0000 0.7560 0.6244 0.4938 0.9286
B 0.7560 1.0000 0.9667 0.3636 0.6496
C 0.6244 0.9667 1.0000 0.2189 0.5281
D 0.4938 0.3636 0.2189 1.0000 0.7202
E 0.9286 0.6496 0.5281 0.7202 1.0000
