# Numpy & Pandas basics

## Numpy
1. Numpy arrays
2. Basic operations

## Pandas
1. Loading tabular data into dataframe
2. Previewing data
3. Selecting & Querying
4. Modifying data frames
5. Group by

# Numpy

In [None]:
import numpy as np

## 1. Numpy arrays

In [None]:
# A 1-D array has shape (3,): it is neither a row vector nor a column vector.
a_1d = np.array([1, 2, 3])
print(a_1d, type(a_1d), a_1d.shape, sep='\n')
print(a_1d[0], a_1d[-1])  # first and last element

# Arrays are mutable: assigning through an index changes the element in place.
a_1d[2] = 9
print(a_1d)

# A 2-D array holding a single row can be regarded as a row vector, shape (1, 3).
a_2d = np.array([[1, 2, 3]])
print(a_2d, type(a_2d), a_2d.shape, sep='\n')

# One element per row: can be regarded as a column vector, shape (3, 1).
b_2d = np.array([[1], [2], [3]])
print(b_2d, b_2d.shape, sep='\n')

# Two rows of three elements: can be regarded as a matrix, shape (2, 3).
c_2d = np.array([[1, 2, 3], [5, 6, 7]])
print(c_2d, c_2d.shape, sep='\n')
print(c_2d[1, 2])  # element at row 1, column 2

In [None]:
# Row vector: the original, its transpose, and the same data reshaped to (3, 1).
print(
    a_2d,
    '-----------------',
    a_2d.T,
    '-----------------',
    a_2d.reshape(3, 1),
    '-----------------',
    sep='\n',
)
# Matrix: the original, its transpose, flattened with reshape(-1), and reshaped to (3, 2).
print(
    '-----------------',
    c_2d,
    '-----------------',
    c_2d.T,
    '-----------------',
    c_2d.reshape(-1),
    '-----------------',
    c_2d.reshape(3, 2),
    sep='\n',
)

In [None]:
# A 3 x 4 matrix holding the values 1..12.
d_2d = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
])

print(d_2d)
print('-----------------')
# axis=0 sums down each column; axis=1 sums across each row.
print(d_2d.sum(axis=0), d_2d.sum(axis=1), sep='\n')
print('-----------------')
print(d_2d[:2, 3])  # rows 0 and 1 of column 3
print('-----------------')
print(d_2d > 6)  # element-wise comparison gives a boolean array
print('-----------------')
print(d_2d[d_2d > 6])  # boolean indexing returns the selected elements as a 1-D array
print('-----------------')
# Assigning through a boolean mask overwrites only the selected elements.
d_2d[d_2d > 6] = 100
print(d_2d)

## 2. Basic operations

In [None]:
# Element-wise operations: each arithmetic operator has an equivalent NumPy
# function, and both forms are printed side by side for comparison.

x = np.array([[1, 2], [3, 4]])
y = np.array([[5, 6], [7, 8]])
print(x, y, sep='\n')
print('add -----------------')
print(x + y, np.add(x, y), sep='\n')
print('subtract -----------------')
print(x - y, np.subtract(x, y), sep='\n')
print('multiply -----------------')
print(x * y, np.multiply(x, y), sep='\n')
print('divide -----------------')
print(x / y, np.divide(x, y), sep='\n')
print('power&sqrt -----------------')
print(x ** y, np.sqrt(x), sep='\n')

In [None]:
# Matrix multiplication (not element-wise). The @ operator is the infix form
# of np.matmul; the two products below differ because the operand order differs.
X = np.array([[1, 2], [3, 4]])
Y = np.array([[5, 6], [7, 8]])
print(X, Y, sep='\n')

print('XY -----------------')
print(X @ Y)
print('YX -----------------')
print(Y @ X)

In [None]:
# (Optional) Broadcasting: arrays of different shapes are combined by
# stretching the smaller one across the larger one.
d_2d = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
])

v = np.array([4, 4, 4, 4])                 # shape (4,)
z = np.array([3, 3, 3]).reshape(3, 1)      # shape (3, 1)
print(d_2d)
print('--------------')
print(v, z, sep='\n')
print('d_2d + v --------------')
print(d_2d + v)  # v is added to every row
print('d_2d + z --------------')
print(d_2d + z)  # z is added to every column

In [None]:
# Converting a NumPy array into a plain Python list with ndarray.tolist().
x = np.array([1, 2, 3, 4, 5, 6, 7, 8])
print(x, type(x), sep='\n')
print(' array-->list ')
z = x.tolist()
print(z, type(z), sep='\n')


# Pandas

In [None]:
import pandas as pd

## 1. Loading tabular data into dataframe

In [None]:
# Location of the CSV file; pandas can read directly from a URL.
url = 'https://raw.githubusercontent.com/RayleighKim/Example_datasets/master/Graduate_apply.csv'

# Parse the CSV into a DataFrame.
df = pd.read_csv(url)

# Display the first rows to check that the data loaded as expected.
df.head()

## 2. Previewing data

In [None]:
# Show the first 5 rows (5 is also the default when n is omitted)
df.head(n=5)

In [None]:
# Show the last 3 rows
df.tail(n=3)

In [None]:
# Show the shape of the dataframe as a (rows, columns) tuple
df.shape

In [None]:
# Show the column names of the dataframe
df.columns

In [None]:
# Print a concise summary of the dataframe
df.info()

In [None]:
# Show basic descriptive statistics of the dataframe
df.describe()

In [None]:
# DataFrame --> NumPy array
# to_numpy() is the accessor the pandas documentation recommends over the
# older .values attribute; it returns the frame's data as an ndarray.
df.to_numpy()

## 3. Selecting & Querying

In [None]:
# Select specific columns.
# df['admit'] (single brackets) would return a pandas Series; the double
# brackets below pass a list of column names and return a DataFrame instead.
df[['admit']]

In [None]:
# Select several columns at once by listing their names; the result is a DataFrame.
df[['admit', 'gre']]

In [None]:
# Integer-location indexing: iloc selects by position, and slices exclude the stop.
print(
    df.iloc[1],           # the row at position 1
    '-----------------',
    df.iloc[1:3],         # rows at positions 1 and 2
    '-----------------',
    df.iloc[1:3, 1],      # same rows, column at position 1
    '-----------------',
    df.iloc[1:3, 1:3],    # same rows, columns at positions 1 and 2
    sep='\n',
)

In [None]:
# location I: label-based selection with loc.
# Rows are chosen by index label and columns by name; unlike iloc, a label
# slice such as 1:3 includes both endpoints.
df.loc[1:3, ['gre', 'rank']]

In [None]:
# location II: filter rows with a boolean condition.
# The mask is True for rows whose 'gre' value is greater than 780.
condition = (df['gre'] > 780)
df.loc[condition, ['gre', 'rank']].head(n=5)

In [None]:
# location III: filter rows by membership in a set of values.
# The mask is True for rows whose 'gre' value is one of the listed scores.
condition = df['gre'].isin([
    710,
    720,
    730,
    740,
])
df.loc[condition, ['gre', 'rank']].head(n=5)

## 4. Modifying data frames

In [None]:
# Keep an independent (deep) copy of the loaded data as a backup.
df_origin = df.copy(deep=True)

In [None]:
# First row before the change.
print(df.head(n=1))

# Overwrite a single cell: row label 0, column 'gre'.
df.loc[0, 'gre'] = 780

# First row after the change.
print(df.head(n=1))

In [None]:
# Drop specific columns. drop() returns a new dataframe and leaves
# df_origin itself untouched.
df = df_origin.drop(columns=['gre', 'gpa'])
df.head()

In [None]:
# Start from a copy of the backup. A plain `df = df_origin` only binds a second
# name to the same object, so adding a column below would also add it to
# df_origin and leak into every later cell that resets from it.
df = df_origin.copy()
# Make a new column: 'gre' divided by 800 plus 'gpa' divided by 4
# (presumably the maximum possible scores -- confirm against the dataset).
df['gregpa'] = df['gre'] / 800 + df['gpa'] / 4
df.head()

In [None]:
# Dummy variables: replace the 'rank' column with indicator columns.
# get_dummies() returns a new dataframe, so the backup can be passed directly.
df = pd.get_dummies(df_origin, columns=['rank'])
print(df.head())

# Same encoding with drop_first=True, which leaves out the first level's indicator column.
df = pd.get_dummies(df_origin, columns=['rank'], drop_first=True)
print(df.head())

## 5. Group by

In [None]:
# Reload the data from the URL so this section starts from a freshly read dataframe.
df = pd.read_csv(url)
df.head()

In [None]:
# Group-by: mean of 'gre' within each 'rank'.
# as_index=False keeps 'rank' as an ordinary column instead of the index.
temp = df.groupby('rank', as_index=False)['gre'].mean()
print(temp)

# Relabel the columns so the aggregate is self-describing.
temp.columns = ['rank', 'avg(gre)']
print(temp)

In [None]:
# Group-by: mean of 'gre' and 'gpa' within each 'rank'.
# Several columns must be selected with a list (double brackets); the tuple
# form ['gre', 'gpa'] was deprecated and raises an error in pandas 2.x.
temp = df.groupby(by=['rank'], as_index=False)[['gre', 'gpa']].mean()

# Relabel the columns so the aggregates are self-describing.
temp.columns = ['rank', 'avg(gre)', 'avg(gpa)']

temp