## **TODO:** Set the value of `URL` to the URL from your learning materials

In [None]:
import os

# Placeholder: replace None with the URL string from your learning materials.
URL = None

# Stop here with a clear message if URL was left unset or is not a string.
assert URL and (type(URL) is str), "Be sure to initialize URL using the value from your learning materials"

# Export the URL through the process environment so the following
# %%bash cell can read it as $URL.
os.environ['URL'] = URL

In [None]:
%%bash
# Download the archive named by $URL (exported by the previous cell).
# The variable is quoted so URLs containing '&', '?' or spaces are not
# split or interpreted by the shell.
wget -q "$URL" -O ./data.zip
mkdir -p data
# Extract every zip archive in the working directory into data/.
# unzip accepts only one archive per invocation (extra arguments are treated
# as member names to extract), so loop rather than piping all names to xargs.
for archive in *.zip; do
    unzip -o "$archive" -d data/
done

# Demo - Pandas

In [None]:
import pandas as pd

# A Series built from a plain list gets the default RangeIndex (0..n-1).
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data

In [None]:
# view raw values
data.values

In [None]:
# view index
data.index

In [None]:
# we can index, just like a standard Python list
data[1]

In [None]:
# Because the index is a RangeIndex, plain [] slicing behaves like a
# Python list slice: this returns the elements at offsets 1 and 2
# (the stop value, 3, is excluded)
data[1:3]

In [None]:
# Build a Series whose labels are the strings 'a'..'d' instead of integers
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list("abcd"),
)

In [None]:
data

In [None]:
# similar to dict indexing
data['c']

In [None]:
# Integer labels need not be sequential or sorted: here the labels are
# 2, 5, 3, 7, so a label and its position are no longer the same thing
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=[2, 5, 3, 7],
)
data

In [None]:
data.loc[3]

# Implicit and Explicit Indexing

In [None]:
# String values labelled with the odd integers 1, 3, 5 (an explicit index,
# not the default 0..n-1 range)
data = pd.Series(
    data=['a', 'b', 'c'],
    index=[1, 3, 5],
)
data

In [None]:
# index is no longer a range
# indices are like keys in a dictionary
data.index

In [None]:
data[1] # index by numeric index, not offset!

In [None]:
# but slicing still works as before: an integer slice with plain []
# is positional, so this selects offsets 1 and 2 (labels 3 and 5),
# not the labels 1 through 3
data[1:3] # offset 1...offset 2

# __`loc`__ and __`iloc`__
* .loc is a __*label*__-based indexing method
* .iloc is a __*position*__-based indexing method

In [None]:
data

In [None]:
data.loc[1] # 1 here is a label, not an offset

In [None]:
# .loc slices by label and, unlike a Python slice, includes BOTH endpoints
data.loc[1:3] # 1 and 3 are labels, not integer offsets

In [None]:
data.iloc[1] # 1 is an offset, not a label

In [None]:
data.iloc[1:3] # 1..3 is a Python slice based on offsets

# Python Dicts as Series

In [None]:
# State name -> population. When a dict is passed to pd.Series the keys
# become the index labels (in insertion order) and the values become the data.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
    'North Dakota': 10000000,
}
population = pd.Series(data=population_dict)
population

In [None]:
population[::-1]['New York':'California']

In [None]:
population['California':'New York']

# Pandas DataFrame

In [None]:
# State name -> area. Note that the keys are not the same set of states
# as in population_dict.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
    'Rhode Island': 1000,
}
area = pd.Series(data=area_dict)
area

In [None]:
# Build the frame from the two Series themselves (not their raw .values) so
# that rows are aligned on the state name. Pairing the raw arrays by position
# would drop the state labels and put North Dakota's population next to
# Rhode Island's area; with label alignment a state missing from one Series
# gets NaN in that column instead.
states = pd.DataFrame({'population': population, 'area': area})

In [None]:
states.index

In [None]:
states.columns = ['pop', 'area']
states

In [None]:
states['area']

In [None]:
states.values

In [None]:
# get descriptive statistics
states.describe()

# Sales Data

In [None]:
# Load the sales win/loss CSV into a DataFrame.
# NOTE(review): presumably this file is extracted into data/ by the earlier
# %%bash cell -- confirm the archive actually contains it.
dat = pd.read_csv("data/WA_Fn-UseC_-Sales-Win-Loss.csv")

In [None]:
dat.columns

In [None]:
dat['Opportunity Result']

# Counting Values

In [None]:
dat['Opportunity Result'].value_counts()

In [None]:
dat['Supplies Group'].value_counts()

In [None]:
dat['Elapsed Days In Sales Stage'].value_counts().sort_values(ascending = True)

# Top Five Values

In [None]:
# value_counts() sorts by descending frequency, so the first five rows
# are the five most common subgroups
dat['Supplies Subgroup'].value_counts().head(5)

# Extracting Columns

In [None]:
dat[:5].T

In [None]:
# Passing a list of column names to [] returns a new DataFrame holding
# just those columns
region_results = dat[["Region", "Opportunity Result", "Sales Stage Change Count"]]
# Summarize the extracted subset (not the full frame) so the output
# reflects the columns we just selected
region_results.describe()

In [None]:
region_results.shape

In [None]:
region_results.head()

# Setting the Index
* oftentimes the index of the DataFrame is something we don't care about, e.g., a default numeric index or sequence number

In [None]:
# One dict per row: the keys become the columns and the rows receive the
# default 0..n-1 integer index.
presidents = pd.DataFrame([
    {'name': 'James Madison', 'elect': 1808, 'born': 1751},
    {'name': 'Thomas Jefferson', 'elect': 1800, 'born': 1743},
    {'name': 'John Adams', 'elect': 1796, 'born': 1735},
    # George Washington was born in 1732 (the earlier value, 1724, was wrong)
    {'name': 'George Washington', 'elect': 1788, 'born': 1732},
])
presidents

In [None]:
# the default numeric index isn't interesting;
# promote the 'name' column to be the index instead
presidents = presidents.set_index('name')

In [None]:
presidents.index

Copyright 2021 CounterFactual.AI LLC. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.