## Introduction to NumPy

In [1]:
import numpy as np

In [2]:
# Our first array: a one-dimensional ndarray built from a list literal
array1 = np.array([1, 2, 3, 4, 5])

In [3]:
array1

array([1, 2, 3, 4, 5])

In [4]:
2*array1

array([ 2,  4,  6,  8, 10])

In [5]:
sum([1,2,3,4,5])

15

In [6]:
# The built-in sum() also accepts an ndarray and gives the same total as for the list
sum(array1)

15

In [7]:
# With plain Python lists, + concatenates rather than adding element-wise
[1,2,3,4,5]+[ 2,  4,  6,  8, 10]

[1, 2, 3, 4, 5, 2, 4, 6, 8, 10]

In [8]:
# With arrays, + adds element-wise: array1 plus its doubled copy
doubled = 2 * array1
array1 + doubled

array([ 3,  6,  9, 12, 15])

In [9]:
type(array1)

numpy.ndarray

In [10]:
array1

array([1, 2, 3, 4, 5])

In [11]:
# Conditional selection with a boolean mask: keep only the even entries
even_mask = (array1 % 2) == 0
array1[even_mask]

array([2, 4])

In [15]:
# An ndarray can also be built from an existing Python list
mylist = list(range(1, 6))
array1 = np.array(mylist)

In [16]:
array1

array([1, 2, 3, 4, 5])

In [19]:
# Two-dimensional array: each inner list becomes one row
mdarray = np.array([
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
])

In [20]:
# indexing: [row, column], both zero-based
print(mdarray[1,1])  # second row, second column
print(mdarray[0,0])  # first row, first column

7
1


In [21]:
# Reshape into a single row. Passing -1 lets NumPy infer the column count
# from the array's size, so the cell keeps working if mdarray changes shape.
mdarray_reshaped = mdarray.reshape(1, -1)

In [22]:
mdarray

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [23]:
mdarray_reshaped

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]])

### NPV Example

In [24]:
# Cash flow per period (the final entry is 100 + 2500) and the matching
# period numbers, bundled as [flows, periods]
net_cash_flows = [-2000, 100, 100, 100 + 2500]
periods = list(range(4))
pv_list = [net_cash_flows, periods]

In [25]:
pv_list

[[-2000, 100, 100, 2600], [0, 1, 2, 3]]

In [26]:
# Discount each cash flow by (1 + r) ** period and add up the present values
r = 0.10
sum(
    cash_flow / (1 + r) ** period
    for cash_flow, period in zip(*pv_list)
)

126.97220135236626

In [27]:
# Same inputs as NumPy arrays so the discounting can be done element-wise
net_cash_flows = np.array([-2000, 100, 100, 100 + 2500])
periods = np.arange(4)

In [28]:
# Vectorized NPV: discount every cash flow at once, then reduce with the
# array's own sum() instead of the Python built-in, which iterates the
# array one element at a time.
(net_cash_flows / (1 + r) ** periods).sum()

126.97220135236626

In [29]:
# Combine the two 1-D arrays into one 2-D array: row 0 = cash flows, row 1 = periods
pv_array = np.array([net_cash_flows,periods])

In [30]:
pv_array

array([[-2000,   100,   100,  2600],
       [    0,     1,     2,     3]])

In [31]:
# Row 0 holds the cash flows and row 1 the periods; discount element-wise
# and reduce with ndarray.sum() rather than the Python built-in sum().
(pv_array[0] / (1 + r) ** pv_array[1]).sum()

126.97220135236626

In [32]:
type(pv_array)

numpy.ndarray

In [33]:
# Software services industry example: three index-aligned arrays giving,
# for each role, its daily rate and the man-days budgeted
roles = np.array([
    "Programmer",
    "Project Manager",
    "Business Analyst",
    "QA Analyst",
    "UX Designer",
])
daily_rates = np.array([8500, 14000, 10000, 8500, 8500])
man_days = np.array([20, 20, 10, 10, 10])

In [34]:
# Element-wise product: each role's daily rate times its man-days
project_cost = daily_rates * man_days
project_cost

array([170000, 280000, 100000,  85000,  85000])

In [35]:
project_cost.sum()

720000

In [36]:
roles[1]

'Project Manager'

In [37]:
# Report the role at position 1 together with its computed cost
print(f"{roles[1]}: {project_cost[1]}")

Project Manager: 280000


In [38]:
# numpy.select(condlist, choicelist, default=0)
# Builds an array by taking, at each position, the value from the first
# choice whose condition is True; positions matching no condition get `default`.
x = np.arange(10)

conditions = [x % 2 == 0, x % 3 == 0]
choices = [x ** 2, x // 3]
np.select(conditions, choices)

array([ 0,  0,  4,  1, 16,  0, 36,  0, 64,  3])

### Example: Sales Forecast

In [39]:
# Open sales opportunities and their amounts, index-aligned
opportunities = [
    "TNVS System Rearchitecture",
    "OTT-ABC Corporation",
]
amounts = [600_000, 1_500_000]

In [40]:
# Rebind both names from Python lists to ndarrays; the array form allows
# boolean-mask filtering
opportunities = np.array(opportunities)
amounts = np.array(amounts)

In [41]:
opportunities

array(['TNVS System Rearchitecture', 'OTT-ABC Corporation'], dtype='<U26')

In [42]:
amounts

array([ 600000, 1500000])

In [43]:
# Amounts above one million, returned as a plain list
over_one_million = amounts > 1_000_000
list(amounts[over_one_million])

[1500000]

### Introduction to Pandas

DataFrames are the workhorse of Pandas and are directly inspired by the R programming language.

In [44]:
import pandas as pd

In [45]:
from numpy.random import randn
# Seed NumPy's global random generator so the randn() draws used to build
# the DataFrame are the same on every run
np.random.seed(101)

In [46]:
# 5x4 frame of random draws with rows labelled A-E and columns labelled W-Z
row_labels = list("ABCDE")
column_labels = list("WXYZ")
df = pd.DataFrame(randn(5, 4), index=row_labels, columns=column_labels)

In [47]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [48]:
# Selection and Indexing

Here are several ways to select data from a DataFrame.

In [50]:
df["W"]

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [51]:
# You can also pass a list of column names
df[["W","Z"]]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [53]:
# Dataframe columns are just series
type(df["W"])

pandas.core.series.Series

In [54]:
# Add a column holding the element-wise sum of W and Y (this modifies df itself)
df['new'] = df['W'] + df['Y']

In [55]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [56]:
# Remove a column by name; the result is a new frame and df itself is unchanged
df.drop(columns='new')

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [57]:
# Not inplace unless specified; still has new
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [58]:
# drop() does not change df on its own, so rebind the name to keep the result.
# Reassignment is preferred over inplace=True, and errors='ignore' lets this
# cell be re-run after 'new' has already been removed.
df = df.drop(columns='new', errors='ignore')

In [59]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [60]:
# Drop a row by its index label; as before, the result is a new frame
df.drop(index='E')

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [62]:
# Plain bracket indexing with a label selects a column (here X), not a row;
# rows are selected with .loc / .iloc
df['X']

A    0.628133
B   -0.319318
C    0.740122
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [63]:
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [64]:
df.iloc[2] # Row C, index starts with 0

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [65]:
df.loc['B','Y'] # Row B, Column Y

-0.8480769834036315

### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to NumPy:

In [69]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [70]:
df > 0 # end up with a Boolean DataFrame

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [71]:
# Use the Boolean DataFrame as a filter
# Values that fail the condition (anything <= 0) come back as NaN

df[df>0] 

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [72]:
# Can also use on particular columns

df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [73]:
# Column Y for the rows where W is positive. A single .loc[rows, column]
# call replaces chained indexing (df[mask]['Y']), which builds an
# intermediate frame and is unsafe if later used for assignment.
df.loc[df['W'] > 0, 'Y']

A    0.907969
B   -0.848077
D   -0.933237
E    2.605967
Name: Y, dtype: float64