In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data Structures 

In [2]:
# Build the grocery list as a Series directly from an item -> value mapping;
# the keys become the index labels and the mixed int/str values give it the
# generic object dtype
groceries = pd.Series({'eggs': 30, 'apples': 6, 'milk': 'Yes', 'bread': 'No'})
groceries

eggs       30
apples      6
milk      Yes
bread      No
dtype: object

In [3]:
# Report the basic geometry of the groceries Series: its shape, its number of
# dimensions and how many elements it holds
for label, value in [
    ('Groceries shape:', groceries.shape),
    ('Groceries dimension:', groceries.ndim),
    ('Groceries # of elements:', groceries.size),
]:
    print(label, value)

Groceries shape: (4,)
Groceries dimension: 1
Groceries # of elements: 4


In [4]:
# Show the two halves of the Series separately: the stored values and the
# labels used to index them
for label, value in [
    ('Data in Groceries:', groceries.values),
    ('Index of Groceries:', groceries.index),
]:
    print(label, value)

Data in Groceries: [30 6 'Yes' 'No']
Index of Groceries: Index(['eggs', 'apples', 'milk', 'bread'], dtype='object')


In [5]:
# Map each person to a labelled Series (item name -> value); the two Series
# deliberately share only some of their item labels
items = {
    'Seiji': pd.Series({'bike': 245, 'book': 25, 'violin': 500}),
    'Shizuku': pd.Series({'book': 40, 'bag': 100, 'bike': 300, 'flowers': 10}),
}

In [6]:
# Combine the per-person Series into one table: each dictionary key becomes a
# column and the rows are the union of the item labels, with NaN wherever a
# person has no entry for an item
shopping_carts = pd.DataFrame(data=items)
shopping_carts

Unnamed: 0,Seiji,Shizuku
bag,,100.0
bike,245.0,300.0
book,25.0,40.0
flowers,,10.0
violin,500.0,


In [7]:
# Same structure as before, but the Series are built from plain lists with no
# explicit labels, so each one falls back to the default integer index
data = {
    person: pd.Series(amounts)
    for person, amounts in [
        ('Seiji', [245, 25, 500]),
        ('Shizuku', [40, 100, 300, 10]),
    ]
}

In [8]:
# Turn the unlabelled Series into a table; rows are aligned on the integer
# positions, and the shorter column is padded with NaN
prices = pd.DataFrame(data=data)
prices

Unnamed: 0,Seiji,Shizuku
0,245.0,40
1,25.0,100
2,500.0,300
3,,10


In [9]:
# Summarise shopping_carts: first its geometry on consecutive lines, then the
# raw values and the two axis labels, each preceded by a blank line
for label, value in [
    ('shopping_carts shape:', shopping_carts.shape),
    ('shopping_carts dimension:', shopping_carts.ndim),
    ('shopping_carts # of elements:', shopping_carts.size),
]:
    print(label, value)

for label, value in [
    ('shopping_carts data:\n', shopping_carts.values),
    ('shopping_carts row indexes:', shopping_carts.index),
    ('shopping_carts is column indexes:', shopping_carts.columns),
]:
    print()
    print(label, value)

shopping_carts shape: (5, 2)
shopping_carts dimension: 2
shopping_carts # of elements: 10

shopping_carts data:
 [[ nan 100.]
 [245. 300.]
 [ 25.  40.]
 [ nan  10.]
 [500.  nan]]

shopping_carts row indexes: Index(['bag', 'bike', 'book', 'flowers', 'violin'], dtype='object')

shopping_carts is column indexes: Index(['Seiji', 'Shizuku'], dtype='object')


In [10]:
# Keep both people but restrict the rows to the 'flowers' and 'book' labels,
# in exactly that order
sel_shopping_cart = pd.DataFrame(data=items, index=['flowers', 'book'])
sel_shopping_cart

Unnamed: 0,Seiji,Shizuku
flowers,,10
book,25.0,40


In [11]:
# Narrow the table on both axes: only the 'Seiji' column, and only the
# 'book' and 'bike' rows
seiji_sel_shopping_cart = pd.DataFrame(
    data=items,
    index=['book', 'bike'],
    columns=['Seiji'],
)
seiji_sel_shopping_cart

Unnamed: 0,Seiji
book,25
bike,245


In [12]:
# One dictionary per store, mapping item name -> value; the first store has no
# 'glasses' entry
items2 = [
    dict(bikes=20, milk=30, flowers=35),
    dict(flowers=10, glasses=50, bikes=15, milk=5),
]
items2

[{'bikes': 20, 'milk': 30, 'flowers': 35},
 {'flowers': 10, 'glasses': 50, 'bikes': 15, 'milk': 5}]

In [13]:
# Build one row per dictionary and label the rows with the store names;
# keys missing from a dictionary show up as NaN in that row
store_items = pd.DataFrame(data=items2, index=['lolo store', 'lola store'])
store_items

Unnamed: 0,bikes,milk,flowers,glasses
lolo store,20,30,35,
lola store,15,5,10,50.0


## Dealing with NaN 

In [14]:
# One record per character; the second record intentionally has no
# 'Last Name' key, which produces a missing value in the table
males = [
    {'First Name': 'Seiji', 'Last Name': 'Amasawa', 'Age': 15},
    {'First Name': 'Haku', 'Age': 12},
    {'First Name': 'Howl', 'Last Name': 'Pendragon', 'Age': 27},
    {'First Name': 'Shun', 'Last Name': 'Kazama', 'Age': 17},
]

# Build the table from the records, labelling each row with a film title
ghibli_mc = pd.DataFrame(
    data=males,
    index=[
        'Whispers of the Heart',
        'Spirited Away',
        "Howl's Moving Castle",
        'From Up on Poppy Hill',
    ],
)

In [15]:
# Display the DataFrame built in the previous cell
ghibli_mc

Unnamed: 0,First Name,Last Name,Age
Whispers of the Heart,Seiji,Amasawa,15
Spirited Away,Haku,,12
Howl's Moving Castle,Howl,Pendragon,27
From Up on Poppy Hill,Shun,Kazama,17


In [16]:
# Total number of missing entries in ghibli_mc: flag each cell, add the flags
# up per column, then add the per-column counts together
total_nan = (
    ghibli_mc
    .isna()
    .sum()
    .sum()
)
total_nan

1

In [17]:
# Boolean mask with the same shape as ghibli_mc: True where a cell is missing,
# False everywhere else
ghibli_mc.isna()

Unnamed: 0,First Name,Last Name,Age
Whispers of the Heart,False,False,False
Spirited Away,False,True,False
Howl's Moving Castle,False,False,False
From Up on Poppy Hill,False,False,False


In [18]:
# Number of missing entries in each column (True counts as 1 when summed)
ghibli_mc.isna().sum(axis=0)

First Name    0
Last Name     1
Age           0
dtype: int64

In [19]:
# Number of non-missing entries in each column (a per-column count, not a
# single grand total)
ghibli_mc.count(axis=0)

First Name    4
Last Name     3
Age           4
dtype: int64

In [20]:
# Show a copy with every row that contains a missing value removed; the result
# is not assigned, so ghibli_mc itself is left as it was
ghibli_mc.dropna(axis='index')

Unnamed: 0,First Name,Last Name,Age
Whispers of the Heart,Seiji,Amasawa,15
Howl's Moving Castle,Howl,Pendragon,27
From Up on Poppy Hill,Shun,Kazama,17


In [21]:
# Show a copy with every column that contains a missing value removed; the
# result is not assigned, so ghibli_mc itself is left as it was
ghibli_mc.dropna(axis='columns')

Unnamed: 0,First Name,Age
Whispers of the Heart,Seiji,15
Spirited Away,Haku,12
Howl's Moving Castle,Howl,27
From Up on Poppy Hill,Shun,17


In [25]:
# Show a copy in which every missing entry, in whichever column it occurs, is
# replaced by the placeholder string 'Unknown'
ghibli_mc.fillna(value='Unknown')

Unnamed: 0,First Name,Last Name,Age
Whispers of the Heart,Seiji,Amasawa,15
Spirited Away,Haku,Unknown,12
Howl's Moving Castle,Howl,Pendragon,27
From Up on Poppy Hill,Shun,Kazama,17


## Reading Data from a CSV file 

In [26]:
# Read the Google stock CSV, located next to the notebook, into a DataFrame
google_stock = pd.read_csv(filepath_or_buffer='./goog.csv')

In [27]:
# Confirm what was loaded: the Python type of the object and its
# (rows, columns) shape
for label, value in [
    ('google_stock type:', type(google_stock)),
    ('google_stock shape:', google_stock.shape),
]:
    print(label, value)

google_stock type: <class 'pandas.core.frame.DataFrame'>
google_stock shape: (3313, 7)


In [28]:
# Display the whole DataFrame; the rendered table is abbreviated, with an
# ellipsis row standing in for everything between the first and last few rows
google_stock

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2004-08-19,49.676899,51.693783,47.669952,49.845802,49.845802,44994500
1,2004-08-20,50.178635,54.187561,49.925285,53.805050,53.805050,23005800
2,2004-08-23,55.017166,56.373344,54.172661,54.346527,54.346527,18393200
3,2004-08-24,55.260582,55.439419,51.450363,52.096165,52.096165,15361800
4,2004-08-25,52.140873,53.651051,51.604362,52.657513,52.657513,9257400
...,...,...,...,...,...,...,...
3308,2017-10-09,980.000000,985.424988,976.109985,977.000000,977.000000,891400
3309,2017-10-10,980.000000,981.570007,966.080017,972.599976,972.599976,968400
3310,2017-10-11,973.719971,990.710022,972.250000,989.250000,989.250000,1693300
3311,2017-10-12,987.450012,994.119995,985.000000,987.830017,987.830017,1262400


In [29]:
# Preview the start of the data: the first five rows
google_stock.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2004-08-19,49.676899,51.693783,47.669952,49.845802,49.845802,44994500
1,2004-08-20,50.178635,54.187561,49.925285,53.80505,53.80505,23005800
2,2004-08-23,55.017166,56.373344,54.172661,54.346527,54.346527,18393200
3,2004-08-24,55.260582,55.439419,51.450363,52.096165,52.096165,15361800
4,2004-08-25,52.140873,53.651051,51.604362,52.657513,52.657513,9257400


In [31]:
# For each column, report True if at least one of its entries is missing
google_stock.isna().any(axis=0)

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [32]:
# Summarise the stock data column by column: count, mean, standard deviation,
# minimum, the 25%/50%/75% percentiles and maximum. The Date column is not
# part of the summary shown below.
google_stock.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0
mean,380.186092,383.49374,376.519309,380.072458,380.072458,8038476.0
std,223.81865,224.974534,222.473232,223.85378,223.85378,8399521.0
min,49.274517,50.541279,47.669952,49.681866,49.681866,7900.0
25%,226.556473,228.394516,224.003082,226.40744,226.40744,2584900.0
50%,293.312286,295.433502,289.929291,293.029114,293.029114,5281300.0
75%,536.650024,540.0,532.409973,536.690002,536.690002,10653700.0
max,992.0,997.210022,989.0,989.679993,989.679993,82768100.0
