
# The Basics

In [58]:
import numpy as np
import pandas as pd

Reading files with Pandas

In [59]:
# Reference snippets only: every call below is commented out, so running
# this cell does nothing. Uncomment one and point it at a real file to use it.
# Make sure you have pandas imported (as pd) first!

# To read from a CSV (Comma-Separated Values) file
#csv_data = pd.read_csv("path/to/file/data.csv")

# To read from a JSON (JavaScript Object Notation) file
#json_data = pd.read_json("path/to/file/data.json")

# To read from an HTML (Hypertext Markup Language) file
# NOTE(review): read_html is expected to give back a list of tables rather
# than a single DataFrame -- confirm against the pandas docs before use.
#html_data = pd.read_html("path/to/file/data.html")

Seeing our data

In [60]:
# Load the Chopped episode table; the path is relative, so the file must
# sit in the notebook's working directory
data = pd.read_csv(filepath_or_buffer="chopped.csv")

# Preview the first five rows
data.head(n=5)

Unnamed: 0,season,season_episode,series_episode,episode_name,episode_notes,air_date,judge1,judge2,judge3,appetizer,entree,dessert,contestant1,contestant1_info,contestant2,contestant2_info,contestant3,contestant3_info,contestant4,contestant4_info
0,1,1,1,"""Octopus, Duck, Animal Crackers""",This is the first episode with only three offi...,13-Jan-09,Marc Murphy,Alex Guarnaschelli,Aarón Sánchez,"baby octopus, bok choy, oyster sauce, smoked ...","duck breast, green onions, ginger, honey","prunes, animal crackers, cream cheese",Summer Kriegshauser,Private Chef and Nutrition Coach New York NY,Perry Pollaci,Private Chef and Sous chef Bar Blanc New Yo...,Katie Rosenhouse,Pastry Chef Olana Restaurant New York NY,Sandy Davis,Catering Chef Showstoppers Catering at Union...
1,1,2,2,"""Tofu, Blueberries, Oysters""",This is the first of a few episodes with five ...,20-Jan-09,Aarón Sánchez,Alex Guarnaschelli,Marc Murphy,"firm tofu, tomato paste, prosciutto","daikon, pork loin, Napa cabbage, Thai chiles,...","phyllo dough, gorgonzola cheese, pineapple ri...",Raymond Jackson,Private Caterer and Culinary Instructor West...,Klaus Kronsteiner,Chef de cuisine Liberty National Golf Course...,Christopher Jackson,Executive Chef and Owner Ted and Honey Broo...,Pippa Calland,Owner and Chef Chef for Hire LLC Newville PA
2,1,3,3,"""Avocado, Tahini, Bran Flakes""",,27-Jan-09,Aarón Sánchez,Alex Guarnaschelli,Marc Murphy,"lump crab meat, dried shiitake mushrooms, pin...","ground beef, cannellini beans, tahini paste, ...","brioche, cantaloupe, pecans, avocados",Margaritte Malfy,Executive Chef and Co-owner La Palapa New Y...,Rachelle Rodwell,Chef de cuisine SoHo Grand Hotel New York NY,Chris Burke,Private Chef New York NY,Andre Marrero,Chef tournant L’Atelier de Joël Robuchon Ne...
3,1,4,4,"""Banana, Collard Greens, Grits""","In the appetizer round, Chef Chuboda refused t...",3-Feb-09,Scott Conant,Amanda Freitag,Geoffrey Zakarian,"ground beef, wonton wrappers, cream of mushro...","scallops, collard greens, anchovies, sour cream","maple syrup, black plums, almond butter, waln...",Sean Chudoba,Executive Chef Ayza Wine Bar New York NY,Kyle Shadix,Chef Registered Dietician and Culinary Consu...,Luis Gonzales,Executive Chef Knickerbocker Bar & Grill Ne...,Einat Admony,Chef and Owner Taïm New York NY
4,1,5,5,"""Yucca, Watermelon, Tortillas""",,10-Feb-09,Geoffrey Zakarian,Alex Guarnaschelli,Marc Murphy,"watermelon, canned sardines, pepper jack chee...","beef shoulder, yucca, raisins, ancho chiles, ...","flour tortillas, prosecco, Canadian bacon, ro...",John Keller,Personal Chef New York NY,Andrea Bergquist,Executive Chef New York NY,Ed Witt,Executive Chef / Wine Director Bloomingdale ...,Josh Emett,Chef de cuisine Gordon Ramsay at The London ...


# The Series

#### Creating Series

In [61]:
# Four ways to build a Series. Every variant below carries the same
# row labels: 'a', 'b', 'c', 'd'.

# 1) From a plain Python list (passing an index is optional)
a = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))

# 2) From a NumPy ndarray (passing an index is optional)
b = pd.Series(data=np.arange(1, 5), index=list('abcd'))

# 3) From a dictionary: the keys become the labels, the values the data
c = pd.Series(dict(a=1, b=2, c=3, d=4))

# 4) From a single scalar: here the index is mandatory, and the scalar
#    is repeated once per label
d = pd.Series(5, index=list('abcd'))

# Note: a, b, and c all hold the same labels and values
print(a, d, sep="\n")



a    1
b    2
c    3
d    4
dtype: int64
a    5
b    5
c    5
d    5
dtype: int64


#### Series are ndarray-like

In [62]:
# A Series supports NumPy-style slicing, boolean masking and ufuncs.
# A notebook cell only renders the value of its final expression, so the
# first two results are printed explicitly; as bare expressions they were
# computed and then silently thrown away.

# Fetching information based off an index range (first two entries)
print(a[0:2])

# Fetching only values greater than one (boolean mask)
print(a[a > 1])

# e to the power of each value in a Series (shown as the cell's output)
np.exp(a)

a     2.718282
b     7.389056
c    20.085537
d    54.598150
dtype: float64

In [63]:
# Adding a scalar is broadcast: the 3 is "stretched" to the length of the
# Series and then added element by element
print(a, a + 3, sep="\n")

#  [1                                  [1       [3
#   2   +   3   is "stretched" into     2   +    3
#   3                                   3        3
#   4]                                  4]       3]

a    1
b    2
c    3
d    4
dtype: int64
a    4
b    5
c    6
d    7
dtype: int64


#### Series are also dictionary-like

In [64]:
# Can fetch values given a key: square brackets with an index label
# read like a dictionary lookup
a['a']

1

In [65]:
# Can also assign through a key, just like a dictionary.
# This changes `a` itself, so every later cell sees b == 5.
a['b'] = 5
print(a)

a    1
b    5
c    3
d    4
dtype: int64


In [66]:
# Can check if a key/label is in the Series.
# The assert stays silent when the check passes (hence no cell output)
# and raises AssertionError when it does not.
assert 'c' in a

In [67]:
# Stacking Series builds a DataFrame: each Series becomes one row and
# the shared labels become the column names
i = pd.DataFrame(data=[a, b, c, d])
print(i)

# Row labels can be swapped in after the fact...
my_index = "one two three four".split()
i.index = my_index
print(i)

# ...or supplied up front when the frame is created
# i = pd.DataFrame([a,b,c,d], index = my_index)

   a  b  c  d
0  1  5  3  4
1  1  2  3  4
2  1  2  3  4
3  5  5  5  5
       a  b  c  d
one    1  5  3  4
two    1  2  3  4
three  1  2  3  4
four   5  5  5  5


# The DataFrame

#### DataFrame Creation

In [68]:
# A dictionary maps column name -> column values; the optional index
# supplies one row label per entry
j = pd.DataFrame(
    data={
        "age": [64, 19, 32, 56, 22],
        "gender": ["male", "female", "female", "male", "female"],
        "fav decimal": [1.2, 3.2, 4.8, 0.8, 0.33],
    },
    index="Jordan Laurel Emily Alex Sarah".split(),
)
print(j)

        age  gender  fav decimal
Jordan   64    male         1.20
Laurel   19  female         3.20
Emily    32  female         4.80
Alex     56    male         0.80
Sarah    22  female         0.33


In [69]:
# A list of rows (a 2D array) also works; the rows carry no names of
# their own, so the column labels are passed separately
my_2D_array = [
    ["Jordan", "male", 19, 1],
    ["Laurel", "female", 19, 2],
]
k = pd.DataFrame(data=my_2D_array, columns="name gender age dogs".split())
print(k)

     name  gender  age  dogs
0  Jordan    male   19     1
1  Laurel  female   19     2


In [70]:
# Additionally we can specify one of the columns to be the index.
# set_index moves "name" out of the columns and into the row index in a
# single step, so no duplicated column is left behind to drop by hand.
k = k.set_index("name")
print(k)

        gender  age  dogs
name                     
Jordan    male   19     1
Laurel  female   19     2


#### DataFrame getting and setting

In [71]:
# Grabbing the entire age column as a series
# (square brackets with a column name select that column; the row labels
# come along as the series index)
j["age"]

Jordan    64
Laurel    19
Emily     32
Alex      56
Sarah     22
Name: age, dtype: int64

In [72]:
# Grabbing the first three entries of the age column as a series
# (the slice counts positions, not labels)
j["age"].iloc[:3]

Jordan    64
Laurel    19
Emily     32
Name: age, dtype: int64

In [73]:
# Getting a series from the row with index "Jordan"
# (.loc looks rows up by label; the resulting series is indexed by the
# column names and has dtype object because the row mixes numbers and text)
j.loc["Jordan"]

age              64
gender         male
fav decimal     1.2
Name: Jordan, dtype: object

In [74]:
# Getting the value from the row "Jordan" and column "age"
# (.loc takes the row label first, then the column label)
j.loc["Jordan", "age"]

64

In [75]:
# Setting the value of age for Jordan
# (assigning through .loc writes into j itself, so the change is
# visible in every later cell)
j.loc["Jordan", "age"] = 19
print(j)

        age  gender  fav decimal
Jordan   19    male         1.20
Laurel   19  female         3.20
Emily    32  female         4.80
Alex     56    male         0.80
Sarah    22  female         0.33


In [76]:
# Defining a potential new row for our DataFrame.
# The Series name ("Newman") becomes the row label once the row is added.
new_entry = pd.Series({"age": 19, "gender": "male","fav decimal": 0.42 },
                     name = "Newman")

# Adding it to our DataFrame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# row is attached with pd.concat instead. to_frame().T turns the Series
# into a one-row frame labelled "Newman"; infer_objects() restores the
# numeric column dtypes, since a Series holding mixed values is stored as
# object and would otherwise turn "age" and "fav decimal" into object too.
j = pd.concat([j, new_entry.to_frame().T.infer_objects()])
print(j)

        age  gender  fav decimal
Jordan   19    male         1.20
Laurel   19  female         3.20
Emily    32  female         4.80
Alex     56    male         0.80
Sarah    22  female         0.33
Newman   19    male         0.42


In [77]:
# Keep only the rows whose age is greater than 25
j.loc[j["age"] > 25]

Unnamed: 0,age,gender,fav decimal
Emily,32,female,4.8
Alex,56,male,0.8


In [78]:
# The comparison on its own yields a boolean series, one True/False per
# row; indexing with it "masks" the frame down to the rows marked True
print(j["age"].gt(25))

Jordan    False
Laurel    False
Emily      True
Alex       True
Sarah     False
Newman    False
Name: age, dtype: bool


#### DataFrame Manipulation

In [79]:
# Small sales table: one row per product, with its unit price and the
# quantity sold (k is rebound here to this new frame)
k = pd.DataFrame(
    data={"Unit Price": [300, 20, 30], "Quantity": [3, 20, 14]},
    index=["Nintendo Switch", "Charger", "Earbud"],
)
print(k)

                 Unit Price  Quantity
Nintendo Switch         300         3
Charger                  20        20
Earbud                   30        14


In [46]:
# New column: revenue per item = unit price times quantity, computed
# row by row across the two existing columns
k["Total Sales"] = k["Unit Price"].mul(k["Quantity"])
print(k)

                 Unit Price  Quantity  Total Sales
Nintendo Switch         300         3          900
Charger                  20        20          400
Earbud                   30        14          420


In [47]:
# New column: each item's share of all sales, as a fraction between
# 0 and 1 (multiply by 100 for a percentage)
k["Ratio of All Sales"] = k["Total Sales"].div(k["Total Sales"].sum())
print(k)

                 Unit Price  Quantity  Total Sales  Ratio of All Sales
Nintendo Switch         300         3          900            0.523256
Charger                  20        20          400            0.232558
Earbud                   30        14          420            0.244186
