# Module 8: A Slice of NumPy and Advanced Data Wrangling

### NumPy and 1D Arrays

In [30]:
import numpy as np
import pandas as pd

In [3]:
my_list = [1, 2, 3, 4]
my_list

[1, 2, 3, 4]

In [4]:
my_array = np.array((1, 2, 3, 4))
my_array

array([1, 2, 3, 4])

In [5]:
my_list = [1, "hi"]

In [7]:
my_array = np.array((1, "hi"))
my_array

array(['1', 'hi'], dtype='<U21')

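A NumPy array holds a single element type: mixed inputs are coerced to one common dtype (above, the integer `1` became the string `'1'`), whereas a list can hold elements of different types.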

In [9]:
my_array = np.array([1, 2, 3, 4])
my_array

array([1, 2, 3, 4])

In [10]:
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [11]:
np.arange(5)

array([0, 1, 2, 3, 4])

In [12]:
np.arange(0, 10, 2)

array([0, 2, 4, 6, 8])

In [13]:
np.linspace(1,5,10)

array([1.        , 1.44444444, 1.88888889, 2.33333333, 2.77777778,
       3.22222222, 3.66666667, 4.11111111, 4.55555556, 5.        ])

In [14]:
# Five random floats drawn uniformly from [0, 1). No seed is set before this
# call, so the values shown below will differ each time the cell is run.
np.random.rand(5)

array([0.77089068, 0.9736534 , 0.76169731, 0.76910574, 0.04696362])

In [15]:
array1 = np.ones(4)
array1

array([1., 1., 1., 1.])

In [17]:
array2 = array1 + 1
array2

array([2., 2., 2., 2.])

In [18]:
array1 + array2

array([3., 3., 3., 3.])

In [19]:
array1 * array2

array([2., 2., 2., 2.])

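Unlike arrays, lists do not support element-wise arithmetic: `+` concatenates two lists instead of adding their elements, and `*` between two lists raises a `TypeError`.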

In [20]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [21]:
arr[7]

7

In [22]:
arr[2:6]

array([2, 3, 4, 5])

In [23]:
arr[-1]

9

In [24]:
grade_array = np.array([98,87,103,92,67,107,78,104,85,105])
grade_array

array([ 98,  87, 103,  92,  67, 107,  78, 104,  85, 105])

In [25]:
# Build the boolean mask from grade_array itself instead of re-typing the grade
# literal, so the mask always lines up with the array it is later used to index.
# True marks every grade above 100.
threshold = grade_array > 100
threshold

array([False, False,  True, False, False,  True, False,  True, False,
        True])

In [26]:
# Cap the flagged grades: every position where the boolean mask is True is set
# to 100. The assignment modifies grade_array in place.
grade_array[threshold] = 100
grade_array

array([ 98,  87, 100,  92,  67, 100,  78, 100,  85, 100])

In [27]:
new_grade_array = np.array([98,87,103,92,67,107,78,104,85,105])

In [28]:
new_grade_array[new_grade_array > 100] = 100
new_grade_array

array([ 98,  87, 100,  92,  67, 100,  78, 100,  85, 100])

In [31]:
# nrows=5 reads only the first five data rows of the file, so `cereal` is a
# 5-row sample here and .head() displays all of it.
cereal = pd.read_csv('https://raw.githubusercontent.com/UBC-MDS/programming-in-python-for-data-science/master/data/cereal.csv', nrows=5)
cereal.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,Cold,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973
1,100% Natural Bran,Q,Cold,120,3,5,15,2,8,8,135,0,3,1,1.0,33.983679
2,All-Bran,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505
3,All-Bran with Extra Fiber,K,Cold,50,4,0,140,14,8,0,330,25,3,1,0.5,93.704912
4,Almond Delight,R,Cold,110,2,2,200,1,14,8,1,25,3,1,0.75,34.384843


In [32]:
type(cereal.loc[3, 'calories'])

numpy.int64

In [33]:
cereal['calories'].to_numpy()

array([ 70, 120,  70,  50, 110])

In [34]:
np.pi

3.141592653589793

In [35]:
np.inf

inf

In [36]:
np.prod([2,3,1])

6

In [37]:
np.diff([2,5,20])

array([ 3, 15])

In [38]:
np.log10(100)

2.0

### Multi Dimensional Arrays

In [39]:
# Each inner list is one row, so three 2-item lists give a 3x2 array.
list_2d = [
    [1, 2],
    [3, 4],
    [5, 6],
]
array_2d = np.array(list_2d)
array_2d

array([[1, 2],
       [3, 4],
       [5, 6]])

In [40]:
np.zeros((3,4))

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [41]:
# 4 rows x 2 columns of random floats in [0, 1). No seed is set, so the
# recorded values below change on every run.
np.random.rand(4,2)

array([[0.70141283, 0.14460961],
       [0.58015794, 0.2757587 ],
       [0.96054629, 0.00972301],
       [0.84955202, 0.25142074]])

In [42]:
np.arange(0,12).reshape(3,4)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [46]:
array1 = np.ones(4)
array1

array([1., 1., 1., 1.])

In [44]:
array1.ndim

1

In [45]:
array1.shape

(4,)

In [47]:
array1.size

4

In [48]:
array_2d = np.ones((3,2))
array_2d

array([[1., 1.],
       [1., 1.],
       [1., 1.]])

In [50]:
array_2d.ndim

2

In [51]:
array_2d.size

6

In [52]:
array_2d.shape

(3, 2)

In [53]:
len(array_2d.shape)

2

In [54]:
np.prod(array_2d.shape)

6

In [55]:
arr2 = np.arange(0,12).reshape(3,4)
arr2

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [56]:
arr2[1,2]

6

In [57]:
arr2[1][2]

6

In [58]:
arr2[2]

array([ 8,  9, 10, 11])

In [59]:
arr2[:,2]

array([ 2,  6, 10])

In [60]:
arr2[:2,1:]

array([[1, 2, 3],
       [5, 6, 7]])

In [61]:
arr2.T

array([[ 0,  4,  8],
       [ 1,  5,  9],
       [ 2,  6, 10],
       [ 3,  7, 11]])

In [62]:
# Assigning to an index writes into the array in place: the element at
# row 1, column 1 of arr2 becomes 77777.
arr2[1,1] = 77777
arr2

array([[    0,     1,     2,     3],
       [    4, 77777,     6,     7],
       [    8,     9,    10,    11]])

### Working with Null Values

In [63]:
np.nan

nan

In [64]:
cereal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      5 non-null      object 
 1   mfr       5 non-null      object 
 2   type      5 non-null      object 
 3   calories  5 non-null      int64  
 4   protein   5 non-null      int64  
 5   fat       5 non-null      int64  
 6   sodium    5 non-null      int64  
 7   fiber     5 non-null      int64  
 8   carbo     5 non-null      int64  
 9   sugars    5 non-null      int64  
 10  potass    5 non-null      int64  
 11  vitamins  5 non-null      int64  
 12  shelf     5 non-null      int64  
 13  weight    5 non-null      int64  
 14  cups      5 non-null      float64
 15  rating    5 non-null      float64
dtypes: float64(2), int64(11), object(3)
memory usage: 768.0+ bytes


In [65]:
cereal['cups'].isnull()

0    False
1    False
2    False
3    False
4    False
Name: cups, dtype: bool

In [66]:
cereal[cereal['cups'].isnull()]

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating


In [67]:
cereal[cereal.isnull().any(axis=1)]

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating


In [69]:
na_removed = cereal.dropna()
na_removed

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,Cold,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973
1,100% Natural Bran,Q,Cold,120,3,5,15,2,8,8,135,0,3,1,1.0,33.983679
2,All-Bran,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505
3,All-Bran with Extra Fiber,K,Cold,50,4,0,140,14,8,0,330,25,3,1,0.5,93.704912
4,Almond Delight,R,Cold,110,2,2,200,1,14,8,1,25,3,1,0.75,34.384843


In [77]:
cereal.dropna(subset=['cups']); # drop na from specific column

In [78]:
cereal_zero_fill = cereal.fillna(value=0) # replace nan with 0

In [73]:
# Impute only the 'cups' column with its own mean (rounded to 2 decimals).
# Passing a bare scalar to fillna would write the cups mean into every column
# that has a NaN, including text columns such as 'name', so the fill value is
# keyed by column name instead.
cups_mean = cereal['cups'].mean().round(2)
cereal_mean_fill = cereal.fillna(value={'cups': cups_mean})

In [75]:
# Back-fill: each NaN takes the value of the next valid observation below it.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in recent pandas releases. The trailing ';' suppresses the output.
cereal.bfill();

In [79]:
# Forward-fill: each NaN adopts the same value as the row that precedes it.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases. The trailing ';' suppresses the output.
cereal.ffill();

### Working with Dates & Time

In [95]:
ubers = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/uber-rides-data3.csv", nrows=50)
ubers.head()

Unnamed: 0,Date/Time,Lat,Lon
0,2014-08-08 15:45:00,40.7141,-73.9827
1,2014-08-08 15:46:00,40.6916,-73.9845
2,2014-08-08 15:46:00,40.6463,-73.7895
3,2014-08-08 15:46:00,40.6632,-73.7223
4,2014-08-08 15:46:00,40.7556,-73.991


In [89]:
# Split each 'Date/Time' string at the space into two columns, then give the
# positional columns 0 and 1 readable names.
date_time_parts = ubers['Date/Time'].str.split(' ', expand=True)
dates = date_time_parts.rename(columns={0: 'Date', 1: 'Time'})
dates.head()

Unnamed: 0,Date,Time
0,2014-08-08,15:45:00
1,2014-08-08,15:46:00
2,2014-08-08,15:46:00
3,2014-08-08,15:46:00
4,2014-08-08,15:46:00


In [90]:
# Derive the date parts straight from the raw 'Date/Time' column rather than
# from the previous value of `dates`. The earlier form read dates['Date'] and
# then overwrote `dates`, so running the cell a second time failed because the
# 'Date' column no longer existed.
date_text = ubers['Date/Time'].str.split(' ', expand=True)[0]
dates = (date_text.str.split('-', expand=True)
                  .rename(columns={0: 'Year', 1: 'Month', 2: 'Day'}))
dates.head()

Unnamed: 0,Year,Month,Day
0,2014,8,8
1,2014,8,8
2,2014,8,8
3,2014,8,8
4,2014,8,8


In [91]:
dates.iloc[0,1]

'08'

In [92]:
# Attach the date parts to the ride data as integers. All three parts are cast:
# the split produces strings (e.g. '08'), and leaving Month as text would make
# later sorting and comparisons on it lexicographic instead of numeric.
ubers_dates = (ubers.assign(Year = dates['Year'].astype(int),
                            Month = dates['Month'].astype(int),
                            Day = dates['Day'].astype(int)))
ubers_dates.head(3)

Unnamed: 0,Date/Time,Lat,Lon,Year,Month,Day
0,2014-08-08 15:45:00,40.7141,-73.9827,2014,8,8
1,2014-08-08 15:46:00,40.6916,-73.9845,2014,8,8
2,2014-08-08 15:46:00,40.6463,-73.7895,2014,8,8


In [93]:
# Keep only the date-part and coordinate columns, in this order.
# Note that the result `uber_dates` differs from its input `ubers_dates` by a single letter.
kept_columns = ['Year', 'Month', 'Day', 'Lat', 'Lon']
uber_dates = ubers_dates.loc[:, kept_columns]
uber_dates.head(3)

Unnamed: 0,Year,Month,Day,Lat,Lon
0,2014,8,8,40.7141,-73.9827
1,2014,8,8,40.6916,-73.9845
2,2014,8,8,40.6463,-73.7895


In [94]:
# Order the rides chronologically by year, then month, then day.
# NOTE(review): the recorded output below shows index labels far above 49 and
# September dates, although `ubers` was read with nrows=50 — it appears to come
# from an earlier run on the full file; re-run the cell to refresh it.
uber_dates.sort_values(['Year', 'Month', 'Day'])

Unnamed: 0,Year,Month,Day,Lat,Lon
261435,2014,08,1,40.7325,-73.9876
261436,2014,08,1,40.6754,-74.0170
261437,2014,08,1,40.7303,-74.0029
261438,2014,08,1,40.7218,-73.9973
261439,2014,08,1,40.7134,-74.0091
...,...,...,...,...,...
1511437,2014,09,30,40.7668,-73.9845
1511438,2014,09,30,40.6911,-74.1773
1511439,2014,09,30,40.8519,-73.9319
1511440,2014,09,30,40.7081,-74.0066


In [97]:
# Re-read the same 50 rows, this time asking read_csv to parse the 'Date/Time'
# column as dates while loading instead of leaving it as text.
uber = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/uber-rides-data3.csv", nrows = 50,
                   parse_dates = ['Date/Time'])

In [98]:
uber.head(3)

Unnamed: 0,Date/Time,Lat,Lon
0,2014-08-08 15:45:00,40.7141,-73.9827
1,2014-08-08 15:46:00,40.6916,-73.9845
2,2014-08-08 15:46:00,40.6463,-73.7895


In [101]:
uber.dtypes

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
dtype: object

In [103]:
uber.sort_values('Date/Time').head()

Unnamed: 0,Date/Time,Lat,Lon
0,2014-08-08 15:45:00,40.7141,-73.9827
10,2014-08-08 15:46:00,40.7439,-73.9836
9,2014-08-08 15:46:00,40.7125,-73.9388
7,2014-08-08 15:46:00,40.7452,-73.9821
6,2014-08-08 15:46:00,40.7389,-73.9829


To combine month, day, year into one column:

In [104]:
# pd.read_csv("",
#            parse_dates={'Date': ['Year', 'Month', 'Day', 'Clock']})

**To convert date column to date type:**

new_cycling = cycling.assign(Date = pd.to_datetime(cycling['Date']))


**Find weekdays:**

new_cycling['Date'].dt.day_name().head()
new_cycling.assign(weekday = new_cycling['Date'].dt.day_name())

**Find days:**

new_cycling['Date'].dt.day.head()

new_cycling.assign(day = new_cycling['Date'].dt.day)


**Timestamp examples:**

timestamp_ex = new_cycling.loc[1,'Date']

timestamp_ex.month_name()

timestamp_ex.day

timestamp_ex.hour

**.diff():**

cycling_intervals = new_cycling['Date'].sort_values().diff()

**timedelta**

cycling_intervals[1]

cycling_intervals[1].seconds

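cycling_intervals[1].seconds / 3600

Note: `.seconds` is only the seconds component of the interval (whole days are stored separately in `.days`), so for intervals of a day or longer use `cycling_intervals[1].total_seconds() / 3600` to get the full duration in hours.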

cycling_intervals.max()

cycling_intervals.min()

interval_range = cycling_intervals.max() - cycling_intervals.min()

### Working with Strings

In [105]:
instrument = 'Violin'
instrument

'Violin'

In [106]:
instrument.upper()

'VIOLIN'

In [107]:
instrument.count('i')

2

In [108]:
instrument.split('i')

['V', 'ol', 'n']

In [109]:
cereal.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,Cold,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973
1,100% Natural Bran,Q,Cold,120,3,5,15,2,8,8,135,0,3,1,1.0,33.983679
2,All-Bran,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505
3,All-Bran with Extra Fiber,K,Cold,50,4,0,140,14,8,0,330,25,3,1,0.5,93.704912
4,Almond Delight,R,Cold,110,2,2,200,1,14,8,1,25,3,1,0.75,34.384843


In [110]:
# Upper-case every cereal name; all other columns are carried over unchanged
# into the new frame.
names_upper = cereal['name'].str.upper()
upper_cereal = cereal.assign(name=names_upper)
upper_cereal.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% BRAN,N,Cold,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973
1,100% NATURAL BRAN,Q,Cold,120,3,5,15,2,8,8,135,0,3,1,1.0,33.983679
2,ALL-BRAN,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505
3,ALL-BRAN WITH EXTRA FIBER,K,Cold,50,4,0,140,14,8,0,330,25,3,1,0.5,93.704912
4,ALMOND DELIGHT,R,Cold,110,2,2,200,1,14,8,1,25,3,1,0.75,34.384843


In [111]:
type_cereal = upper_cereal.assign(bran = upper_cereal['name'].str.count('BRAN'))
type_cereal.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,bran
0,100% BRAN,N,Cold,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973,1
1,100% NATURAL BRAN,Q,Cold,120,3,5,15,2,8,8,135,0,3,1,1.0,33.983679,1
2,ALL-BRAN,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505,1
3,ALL-BRAN WITH EXTRA FIBER,K,Cold,50,4,0,140,14,8,0,330,25,3,1,0.5,93.704912,1
4,ALMOND DELIGHT,R,Cold,110,2,2,200,1,14,8,1,25,3,1,0.75,34.384843,0


In [112]:
upper_cereal['name'].str.split(expand=True)

Unnamed: 0,0,1,2,3
0,100%,BRAN,,
1,100%,NATURAL,BRAN,
2,ALL-BRAN,,,
3,ALL-BRAN,WITH,EXTRA,FIBER
4,ALMOND,DELIGHT,,


In [114]:
# Turn the numeric calories into text and append the unit, e.g. 70 -> '70 cal',
# then store the labels in a new 'Calories_str' column.
calorie_labels = cereal['calories'].astype('str') + ' cal'
combined_cereal = cereal.assign(Calories_str=calorie_labels)
combined_cereal.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,Calories_str
0,100% Bran,N,Cold,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973,70 cal
1,100% Natural Bran,Q,Cold,120,3,5,15,2,8,8,135,0,3,1,1.0,33.983679,120 cal
2,All-Bran,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505,70 cal
3,All-Bran with Extra Fiber,K,Cold,50,4,0,140,14,8,0,330,25,3,1,0.5,93.704912,50 cal
4,Almond Delight,R,Cold,110,2,2,200,1,14,8,1,25,3,1,0.75,34.384843,110 cal


In [116]:
cap_cereal = upper_cereal.assign(name = upper_cereal['name'].str.capitalize())
cap_cereal.head(3)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% bran,N,Cold,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973
1,100% natural bran,Q,Cold,120,3,5,15,2,8,8,135,0,3,1,1.0,33.983679
2,All-bran,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505


In [117]:
cap_cereal = upper_cereal.assign(name = upper_cereal['name'].str.title())
cap_cereal.head(3)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,Cold,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973
1,100% Natural Bran,Q,Cold,120,3,5,15,2,8,8,135,0,3,1,1.0,33.983679
2,All-Bran,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505


In [118]:
"Sunshine" == " Sunshine "

False

In [119]:
string1 = " Sunshine "
new_string1 = string1.strip()
new_string1

'Sunshine'

In [120]:
"Sunshine" == new_string1

True

In [121]:
cereal[cereal['name'] == 'All-Bran']

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
2,All-Bran,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505


In [122]:
stripped_cereal = cereal.assign(name = cereal['name'].str.strip())

In [123]:
stripped_cereal[stripped_cereal['name'] == 'All-Bran']

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
2,All-Bran,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505


In [124]:
stripped_cereal['name'].str.strip("!").tail()

0                    100% Bran
1            100% Natural Bran
2                     All-Bran
3    All-Bran with Extra Fiber
4               Almond Delight
Name: name, dtype: object

### More Advanced String Processing

**Replace**

In [126]:
# Lower-case the names first so the text substitution below only has to deal
# with one casing.
lower_names = cereal['name'].str.lower()
cereal_lower = cereal.assign(name=lower_names)
# Replace 'brann' (presumably a misspelling) with 'bran' wherever it occurs in a name.
cereal_spell = cereal_lower.assign(name=lower_names.str.replace('brann', 'bran'))

**Contains**

In [127]:
cereal_lower['name'].str.contains('bran')

0     True
1     True
2     True
3     True
4    False
Name: name, dtype: bool

In [128]:
cereal_lower[cereal_lower['name'].str.contains('bran')]

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% bran,N,Cold,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973
1,100% natural bran,Q,Cold,120,3,5,15,2,8,8,135,0,3,1,1.0,33.983679
2,all-bran,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505
3,all-bran with extra fiber,K,Cold,50,4,0,140,14,8,0,330,25,3,1,0.5,93.704912


In [130]:
# Overwrite the name with 'Bran' on every row whose name contains 'bran'.
# The assignment goes through .loc, so cereal_lower itself is modified in place
# rather than a new DataFrame being created.
cereal_lower.loc[cereal_lower['name'].str.contains('bran'), 'name'] = 'Bran'
cereal_lower

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,Bran,N,Cold,70,4,1,130,10,5,6,280,25,3,1,0.33,68.402973
1,Bran,Q,Cold,120,3,5,15,2,8,8,135,0,3,1,1.0,33.983679
2,Bran,K,Cold,70,4,1,260,9,7,5,320,25,3,1,0.33,59.425505
3,Bran,K,Cold,50,4,0,140,14,8,0,330,25,3,1,0.5,93.704912
4,almond delight,R,Cold,110,2,2,200,1,14,8,1,25,3,1,0.75,34.384843
