# AIDM7330 Basic Programming for Data Science

# Pandas


## Install Pandas package

In [2]:
# Install required packages using pip package manager in the current Jupyter kernel
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install seaborn



## Import Pandas

In [3]:
# import packages

import pandas as pd

# Extra packages
import numpy as np
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for plotting and styling


## Part 1
### Simple creation and manipulation of Pandas objects
**Key Points:** Pandas has two / three main data types:
* Series (similar to numpy arrays, but with index)
* DataFrames (table or spreadsheet with Series in the columns) [important!]
* Panels (3D version of DataFrame, not as common)

### A Series object in Pandas

Like an np.array, but we can combine data types and it has its own index

Note: Every column in a DataFrame is a Series


In [None]:
s = pd.Series([1,3,5,np.nan,6,8])  # sth. like a list; np.nan = missing data ,can be number or text
print(s)#left is index
print(type(s))

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
<class 'pandas.core.series.Series'>


In [None]:
s[3] # got index

nan

### A data frame object in Pandas

We use `pd.DataFrame(**parameters**)` and can insert almost any data type as an argument

**Function:** `pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)`

Input data can be a numpy ndarray (structured or homogeneous), Dictionary, List, or DataFrame.

In [6]:
# initialize list of lists
data = [['tom', 10], ['nick', 15], ['juli', 14]]

# Create the pandas DataFrame
df0 = pd.DataFrame(data, columns = ['Name', 'Age'])
df0 #name and age are not data

Unnamed: 0,Name,Age
0,tom,10
1,nick,15
2,juli,14


In [None]:
dictionary = {'label': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'value': [1, 2, 3, 4, 5, 6]}

df1 = pd.DataFrame(dictionary)
df1

Unnamed: 0,label,value
0,A,1
1,B,2
2,C,3
3,A,4
4,B,5
5,C,6


In [None]:
df1['label']

0    A
1    B
2    C
3    A
4    B
5    C
Name: label, dtype: object

In [None]:
df1['label'].str.lower()

0    a
1    b
2    c
3    a
4    b
5    c
Name: label, dtype: object

In [None]:
df1['value'].sum()

21

In [None]:
# Apply aggregates across numerical entries:
df1.groupby('label').sum()

Unnamed: 0_level_0,value
label,Unnamed: 1_level_1
A,5
B,7
C,9


In [7]:
# more complex data types

s1 = 1
s2 = pd.Timestamp('20130102')
s3 = pd.Series(1,index=list(range(4)),dtype='float32')
s4 = np.arange(0,4) # one range
s5 = pd.Categorical(["test","train","test","train"])
s6 = 'foo'
dates = pd.date_range('20130101', periods=4)

In [8]:
dict1 = {'A' : s1,
         'B' : s2,
         'C' : s3,
         'D' : s4,
         'E' : s5,
         'F' : s6 }
print(dict1)

{'A': 1, 'B': Timestamp('2013-01-02 00:00:00'), 'C': 0    1.0
1    1.0
2    1.0
3    1.0
dtype: float32, 'D': array([0, 1, 2, 3]), 'E': ['test', 'train', 'test', 'train']
Categories (2, object): ['test', 'train'], 'F': 'foo'}


In [9]:
# Convert to a pandas dataframe
df2 = pd.DataFrame(dict1,        #the dictionary is the data
                   index=dates)  #the dates will become the elements of the first column,
                                 #i.e. the index
df2 #Something missing??

Unnamed: 0,A,B,C,D,E,F
2013-01-01,1,2013-01-02,,0,test,foo
2013-01-02,1,2013-01-02,,1,train,foo
2013-01-03,1,2013-01-02,,2,test,foo
2013-01-04,1,2013-01-02,,3,train,foo


In [10]:
df2.dtypes # "df" is the name of the dataframe;

A             int64
B    datetime64[ns]
C           float32
D             int64
E          category
F            object
dtype: object

In [None]:
df2 = df2.rename(columns = {'F':'hahahah'})
df2

In [None]:
df2.describe() # Only numerical columns

In [None]:
df2.info() #Important for data exploration

# Part 2: An example: The stock market

Data source: https://www.nasdaq.com/symbol/csv/historical


## Importing "CSV" data

In [12]:
# Install the library on your environment
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=48a68b2cc3754c809c187c9ff5494cc58ab816e10881f9d403395cc9c84fd4b1
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [13]:
# Mount and setup the Google Drive path
from google.colab import drive
drivePath = '/content/drive' #please do not change
drive.mount(drivePath)

Mounted at /content/drive


In [14]:
# Import the library
import wget

# Setup URL and path variables
baseURL = 'https://raw.githubusercontent.com/pmengoni/AIDM7330-2223S1/main/'
doc = 'HistoricalQuotes_2008-2018_googl.csv'
fullURL = baseURL + doc

dataPath = drivePath + '/MyDrive/Colab Notebooks/data'

# Download the file
fileName = wget.download(fullURL, out=dataPath)

# Print the file name including the local path
print(fileName)

/content/drive/MyDrive/Colab Notebooks/data/HistoricalQuotes_2008-2018_googl (1).csv


In [15]:
### We can download data from the web by using pd.read_csv
# A CSV file is a comma seperated file
# We can use this 'pd.read_csv' method with urls that host csv files

# source: https://www.nasdaq.com/symbol/csv/historical

#we are working in code folder
#our data is saved in data folder
#need to use ../data/ to change directory

dfg = pd.read_csv(fileName)#data frame google

In [16]:
print(type(dfg))
dfg.head(10) # show first n values convert from csv to panda,the index create automatically

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,date,close,volume,open,high,low
0,9/19/2018,22.1,142544,22.2,22.42,22.0
1,9/18/2018,22.1,140735,22.15,22.42,22.0
2,9/17/2018,22.16,54893,22.43,22.565,22.15
3,9/14/2018,22.43,59536,22.29,22.56,22.23
4,9/13/2018,22.29,61342,22.43,22.67,22.25
5,9/12/2018,22.4,102361,22.53,22.68,22.15
6,9/11/2018,22.58,108116,22.31,22.63,22.2
7,9/10/2018,22.37,89903,22.75,22.8,22.19
8,9/7/2018,22.65,104034,22.37,22.77,22.23
9,9/6/2018,22.4,87539,22.46,22.46,22.2676


In [None]:
dfg.tail(3) # last three

Unnamed: 0,date,close,volume,open,high,low
2516,9/22/2008,3.41,231344,3.36,3.46,3.3
2517,9/19/2008,3.3,251526,3.31,3.6717,3.2
2518,9/18/2008,3.3,126877,3.4,3.45,3.27


In [None]:
dfg.columns # returns columns, can be used to loop over

Index(['date', 'close', 'volume', 'open', 'high', 'low'], dtype='object')

In [None]:
#select one column at a time
for column in dfg.columns:
    print(column)
    print(dfg[column].head(2))#you can directly use column name
    #type object because there is ',' in the number

date
0    9/19/2018
1    9/18/2018
Name: date, dtype: object
close
0    22.1
1    22.1
Name: close, dtype: float64
volume
0    142,544
1     140735
Name: volume, dtype: object
open
0    22.20
1    22.15
Name: open, dtype: float64
high
0    22.42
1    22.42
Name: high, dtype: float64
low
0    22.0
1    22.0
Name: low, dtype: float64


In [17]:
dfg.index # return,step1:increase by 1

RangeIndex(start=0, stop=2519, step=1)

In [None]:
dfg.describe()#numeric

Unnamed: 0,close,open,high,low
count,2519.0,2519.0,2519.0,2519.0
mean,15.01947,15.013484,15.205141,14.826368
std,8.99902,9.003459,9.078259,8.928512
min,1.41,1.38,1.47,1.1
25%,5.46,5.44,5.53005,5.345
50%,17.77,17.75,18.03,17.5
75%,23.62,23.6,23.85,23.3888
max,28.93,29.0,29.11,28.84


## Convert the index to pandas datetime object

In [None]:
type(dfg['date'][0])

str

In [None]:
#First way
#Convert and set index
#dfg2=dfg cannot make a copy ,just re assignment
#Convert
dfg2 = dfg.copy() #let's make a copy first
dfg2['date'] = pd.to_datetime(dfg2['date']) #convert to datetime, can specify format = '%d%b%Y:%H:%M:%S.%f'
type(dfg2['date'][0])#check type
#timestamp means you can add interval/subscribe... apply multifunction in datetime

pandas._libs.tslibs.timestamps.Timestamp

In [None]:
#Set index
#Careful: no undo
dfg2.set_index('date', inplace=True) #set the index
dfg2.head()#date became new index

Unnamed: 0_level_0,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-09-19,22.1,142544,22.2,22.42,22.0
2018-09-18,22.1,140735,22.15,22.42,22.0
2018-09-17,22.16,54893,22.43,22.565,22.15
2018-09-14,22.43,59536,22.29,22.56,22.23
2018-09-13,22.29,61342,22.43,22.67,22.25


In [None]:
dfg.head(2)

Unnamed: 0,date,close,volume,open,high,low
0,9/19/2018,22.1,142544,22.2,22.42,22.0
1,9/18/2018,22.1,140735,22.15,22.42,22.0


In [None]:
print(type(dfg2.index[0]))
dfg2.index[0]#no time information ,so 00:00:00

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Timestamp('2018-09-19 00:00:00')

In [None]:
#Second way
#Convert and copy to index
dfg.index = pd.to_datetime(dfg['date'])

In [None]:
dfg.head(1)#actually make duplicate info

Unnamed: 0_level_0,date,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-09-19,9/19/2018,22.1,142544,22.2,22.42,22.0


In [None]:
#Remove duplicated column
#Careful: no undo
dfg.drop(['date'],axis=1,inplace=True)

In [None]:
dfg.head(10)

Unnamed: 0_level_0,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-09-19,22.1,142544,22.2,22.42,22.0
2018-09-18,22.1,140735,22.15,22.42,22.0
2018-09-17,22.16,54893,22.43,22.565,22.15
2018-09-14,22.43,59536,22.29,22.56,22.23
2018-09-13,22.29,61342,22.43,22.67,22.25
2018-09-12,22.4,102361,22.53,22.68,22.15
2018-09-11,22.58,108116,22.31,22.63,22.2
2018-09-10,22.37,89903,22.75,22.8,22.19
2018-09-07,22.65,104034,22.37,22.77,22.23
2018-09-06,22.4,87539,22.46,22.46,22.2676


In [18]:
print(type(dfg.index[0]))
dfg.index[0]

<class 'int'>


0

In [None]:
# We can query date indices with strings
# Only September
dfg.sort_index(inplace=True)#the index has already become date
dfg['2018-09']#extract the elements from data frame

  dfg['2018-09']


Unnamed: 0_level_0,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-09-04,22.42,61743,22.71,22.8,22.37
2018-09-05,22.38,95571,22.35,22.48,22.25
2018-09-06,22.4,87539,22.46,22.46,22.2676
2018-09-07,22.65,104034,22.37,22.77,22.23
2018-09-10,22.37,89903,22.75,22.8,22.19
2018-09-11,22.58,108116,22.31,22.63,22.2
2018-09-12,22.4,102361,22.53,22.68,22.15
2018-09-13,22.29,61342,22.43,22.67,22.25
2018-09-14,22.43,59536,22.29,22.56,22.23
2018-09-17,22.16,54893,22.43,22.565,22.15


In [None]:
dfg['2018-09-10':'2018-09-16']

Unnamed: 0_level_0,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-09-10,22.37,89903,22.75,22.8,22.19
2018-09-11,22.58,108116,22.31,22.63,22.2
2018-09-12,22.4,102361,22.53,22.68,22.15
2018-09-13,22.29,61342,22.43,22.67,22.25
2018-09-14,22.43,59536,22.29,22.56,22.23


## Attributes & general statitics of a Pandas DataFrame

In [None]:
dfg.shape # 2519 business days in the past 10 years, 5 variables

(2519, 5)

In [None]:
dfg.shape[0]

2519

In [None]:
dfg.size#how many set

12595

In [None]:
dfg.columns#how many list

Index(['close', 'volume', 'open', 'high', 'low'], dtype='object')

In [None]:
# Some general statistics

dfg.describe()

Unnamed: 0,close,open,high,low
count,2519.0,2519.0,2519.0,2519.0
mean,15.01947,15.013484,15.205141,14.826368
std,8.99902,9.003459,9.078259,8.928512
min,1.41,1.38,1.47,1.1
25%,5.46,5.44,5.53005,5.345
50%,17.77,17.75,18.03,17.5
75%,23.62,23.6,23.85,23.3888
max,28.93,29.0,29.11,28.84


In [None]:
dfg['open']>15#the value of open>15

date
2008-09-18    False
2008-09-19    False
2008-09-22    False
2008-09-23    False
2008-09-24    False
              ...  
2018-09-13     True
2018-09-14     True
2018-09-17     True
2018-09-18     True
2018-09-19     True
Name: open, Length: 2519, dtype: bool

In [None]:
# Boolean indexing
dfg['open'][dfg['open']>15]  # check what dates the opening > 15,convert boolean result to exact value

date
2013-02-14    15.09
2013-02-15    15.30
2013-02-19    15.70
2013-02-20    16.34
2013-02-21    15.38
              ...  
2018-09-13    22.43
2018-09-14    22.29
2018-09-17    22.43
2018-09-18    22.15
2018-09-19    22.20
Name: open, Length: 1410, dtype: float64

In [None]:
dfg['open'][dfg['open']>15].head(5)  # check what dates the opening with the firt 5

date
2013-02-14    15.09
2013-02-15    15.30
2013-02-19    15.70
2013-02-20    16.34
2013-02-21    15.38
Name: open, dtype: float64

In [None]:
# Check where Open, High, Low and Close where greater than a specified value
dfg[dfg.open>20]#.head(3),without head can lead all the elements

Unnamed: 0_level_0,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-03-21,20.38,216349,20.20,20.710,20.2000
2013-03-22,20.15,199252,20.39,20.600,20.1000
2013-03-25,20.76,237140,20.46,20.870,20.3015
2013-03-26,20.97,180448,20.93,21.000,20.6600
2013-03-27,21.12,217781,20.99,21.490,20.8101
...,...,...,...,...,...
2018-09-13,22.29,61342,22.43,22.670,22.2500
2018-09-14,22.43,59536,22.29,22.560,22.2300
2018-09-17,22.16,54893,22.43,22.565,22.1500
2018-09-18,22.10,140735,22.15,22.420,22.0000


In [None]:
dfg.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2519 entries, 2008-09-18 to 2018-09-19
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   close   2519 non-null   float64
 1   volume  2519 non-null   object 
 2   open    2519 non-null   float64
 3   high    2519 non-null   float64
 4   low     2519 non-null   float64
dtypes: float64(4), object(1)
memory usage: 182.6+ KB


In [None]:
# drop na - not run for now
# Comments on dropping and filling NaN values
# A view where we drop any rows with value NaN
# dfg.dropna(how='any')  # this would be used to drop rows with NaN any/all
# dfg.fillna(value=5)    # this would be used to fill NaN values with 5

In [None]:
# If you want the values in an np array
npg = dfg.values #convert some data from numpy matrix
print(type(npg))
print(npg)

<class 'numpy.ndarray'>
[[3.3 '126877' 3.4 3.45 3.27]
 [3.3 '251526' 3.31 3.6717 3.2]
 [3.41 '231344' 3.36 3.46 3.3]
 ...
 [22.16 '54893' 22.43 22.565 22.15]
 [22.1 '140735' 22.15 22.42 22.0]
 [22.1 '142,544' 22.2 22.42 22.0]]


## Selecting or Viewing Data within a DataFrame
Note: While standard Python / Numpy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code, we recommend the optimized pandas data access methods, .at, .iat, .loc, .iloc and .ix. (from 10 min guide to Pandas)


In [None]:
# Lets print the five first Close prices for Google
# This is a new Series (like a new table)
dfg['close'][0:5]#first 5 row

date
2008-09-18    3.30
2008-09-19    3.30
2008-09-22    3.41
2008-09-23    3.48
2008-09-24    3.28
Name: close, dtype: float64

In [None]:
# Lets print the 3 column
dfg[['close','open','volume']][0:7]

Unnamed: 0_level_0,close,open,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-09-18,3.3,3.4,126877
2008-09-19,3.3,3.31,251526
2008-09-22,3.41,3.36,231344
2008-09-23,3.48,3.45,264652
2008-09-24,3.28,3.46,129769
2008-09-25,3.37,3.3,126197
2008-09-26,3.42,3.46,62560


In [None]:
# A slice: by rows (row numbers)
dfg[1:5] # 2nd to 5th element

Unnamed: 0_level_0,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-09-19,3.3,251526,3.31,3.6717,3.2
2008-09-22,3.41,231344,3.36,3.46,3.3
2008-09-23,3.48,264652,3.45,3.75,3.36
2008-09-24,3.28,129769,3.46,3.58,3.26


In [None]:
# As in lists
list1 = [0,1,2,3,4]
list1[1:5]

[1, 2, 3, 4]

## .loc()

In [None]:
# Getting a cross section with .loc - BY VALUES of the index and columns
# df.loc[a:b, x:y], by rows and column location

# Note: You have to know indices and columns

dfg.loc['2017-08-21':'2017-08-31',#get the location by value
        'open':'low']

Unnamed: 0_level_0,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-08-21,24.14,24.16,23.96
2017-08-22,24.36,24.52,23.96
2017-08-23,24.13,24.51,23.77
2017-08-24,24.41,24.96,24.22
2017-08-25,24.26,24.8,24.26
2017-08-28,24.42,24.53,24.08
2017-08-29,24.24,24.52,24.19
2017-08-30,24.27,24.4,23.91
2017-08-31,24.27,24.54,24.27


## .iloc()

In [None]:
dfg.columns

Index(['close', 'volume', 'open', 'high', 'low'], dtype='object')

In [None]:
# .iloc slicing at specific location - BY POSITION in the table
# Recall:
# dfg[a:b] by rows
# dfg[[col]] or df[[col1, col2]] by columns
# df.loc[a:b, x:y], by index and column values + location
# df.iloc[3:5,0:2], numeric position in table

dfg.iloc[1:4,3:5] # 2nd to 4th row, 4th to 5th column use position of column

Unnamed: 0_level_0,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-09-19,3.6717,3.2
2008-09-22,3.46,3.3
2008-09-23,3.75,3.36


In [None]:
# Data only from row with index value '3'
print (dfg.iloc[3])

close       3.48
volume    264652
open        3.45
high        3.75
low         3.36
Name: 2008-09-23 00:00:00, dtype: object


In [None]:
# iloc will accept a range with ':', just like numpy
# : means "all"
dfg.iloc[1:3,:]#order column

Unnamed: 0_level_0,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-09-19,3.3,251526,3.31,3.6717,3.2
2008-09-22,3.41,231344,3.36,3.46,3.3


In [None]:
# Can also return specific value
print (dfg.iloc[2,1])
# same as above but faster for one single scalar value
print (dfg.iat[2,1])#I add index

231344
231344


## More Basic Statistics

In [None]:
# A quick way to get statistics
dfg.describe()

Unnamed: 0,close,open,high,low
count,2519.0,2519.0,2519.0,2519.0
mean,15.01947,15.013484,15.205141,14.826368
std,8.99902,9.003459,9.078259,8.928512
min,1.41,1.38,1.47,1.1
25%,5.46,5.44,5.53005,5.345
50%,17.77,17.75,18.03,17.5
75%,23.62,23.6,23.85,23.3888
max,28.93,29.0,29.11,28.84


The result of describe is a DataFrame, so you can access and slice like a normal DataFrame

In [None]:
dfg.describe()['open'][1]#like iloc

15.013483604605002

In [None]:
dfg.describe()[1:3]

Unnamed: 0,close,open,high,low
mean,15.01947,15.013484,15.205141,14.826368
std,8.99902,9.003459,9.078259,8.928512


In [None]:
dfg.describe().loc[['mean','std'],['high','low']]#?

Unnamed: 0,high,low
mean,15.205141,14.826368
std,9.078259,8.928512


In [None]:
dfg.head()

In [None]:
# We can change the index sorting
# Not in place, should specify inplace=True to record it
dfg.sort_index(axis=0, ascending=True).head()#ascending=False means descending

#Short for:
##dfg.sort_index(axis=0, ascending=True)
##dfg.head()

Unnamed: 0_level_0,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-09-18,3.3,126877,3.4,3.45,3.27
2008-09-19,3.3,251526,3.31,3.6717,3.2
2008-09-22,3.41,231344,3.36,3.46,3.3
2008-09-23,3.48,264652,3.45,3.75,3.36
2008-09-24,3.28,129769,3.46,3.58,3.26


In [None]:
# The dataframe didn't change
dfg.head(3)

Unnamed: 0_level_0,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-09-18,3.3,126877,3.4,3.45,3.27
2008-09-19,3.3,251526,3.31,3.6717,3.2
2008-09-22,3.41,231344,3.36,3.46,3.3


In [None]:
# sort by value
dfg.sort_values(by='open', ascending=True)[0:10]

Unnamed: 0_level_0,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-03-24,1.5,39300,1.38,1.51,1.38
2009-03-20,1.45,76175,1.41,1.47,1.4
2009-03-23,1.47,137700,1.42,1.525,1.42
2009-03-19,1.41,189240,1.43,1.48,1.365
2009-03-13,1.52,83325,1.49,1.57,1.49
2009-03-25,1.49,30926,1.49,1.52,1.49
2009-04-01,1.59,144192,1.49,1.59,1.49
2009-03-12,1.49,157672,1.5,1.5464,1.49
2009-03-16,1.51,66174,1.5,1.53,1.5
2009-03-18,1.49,26360,1.5,1.53,1.49


## Masks and Boolean Indexing

In [None]:
dfg[0:10]

Unnamed: 0_level_0,close,volume,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-09-18,3.3,126877,3.4,3.45,3.27
2008-09-19,3.3,251526,3.31,3.6717,3.2
2008-09-22,3.41,231344,3.36,3.46,3.3
2008-09-23,3.48,264652,3.45,3.75,3.36
2008-09-24,3.28,129769,3.46,3.58,3.26
2008-09-25,3.37,126197,3.3,3.48,3.28
2008-09-26,3.42,62560,3.46,3.5,3.25
2008-09-29,3.95,88871,3.43,4.134,3.2
2008-09-30,3.5,111006,4.0,4.05,3.35
2008-10-01,3.55,155579,3.59,3.67,3.41


In [None]:
# mask 1
mg1 = dfg['open'][0:10]>22.40
print (mg1)

date
2008-09-18    False
2008-09-19    False
2008-09-22    False
2008-09-23    False
2008-09-24    False
2008-09-25    False
2008-09-26    False
2008-09-29    False
2008-09-30    False
2008-10-01    False
Name: open, dtype: bool


In [None]:
dfg['open'][0:10][dfg['open']>3.40]
# shows only rows with opening price greater than 22.40

date
2008-09-23    3.45
2008-09-24    3.46
2008-09-26    3.46
2008-09-29    3.43
2008-09-30    4.00
2008-10-01    3.59
Name: open, dtype: float64

In [None]:
# mask 2 full data frame - only if the dataframe columns are allo the same type
# mg2 = dfg[0:10]>22.50
# mg2

In [None]:
#dfg[dfg>22.50].head(10)

In [None]:
# we can also drop all NaN values
#dfg[dfg>22.50].head(10).dropna()

In [None]:
# like Numpy, sometimes you need an actual copy, not a view or slice of the same data
# any operation done on dfg or dfg_same is reflected on the other
dfg_same = dfg
dfg_same is dfg

True

In [None]:
# Now we create a second copy of the dataframe, they will be independent from each other
dfg2 = dfg.copy()
dfg2 is dfg

False

# Aknowledgements

- The codes in this notebook are modified from various sources including Dr. Xinzhi Zhang Jupyter notebooks.
- All codes are for educational purposes only and released under the CC1.0.