## Intro to Dataframes

In [118]:
import pandas as pd 
import numpy as np

In [119]:
# Build a small demo frame: four labelled rows (x, y, z, zz) by three integer columns (A, B, C).
df = pd.DataFrame(
    data=[
        [11, 22, 33],
        [44, 55, 66],
        [77, 88, 99],
        [10, 20, 30],
    ],
    index=["x", "y", "z", "zz"],
    columns=list("ABC"),
)

In [120]:
df.head() # preview up to the first 5 rows (default n=5); this frame has only 4, so all are shown

Unnamed: 0,A,B,C
x,11,22,33
y,44,55,66
z,77,88,99
zz,10,20,30


In [121]:
df.index # row labels of the dataframe ('x', 'y', 'z', 'zz')

Index(['x', 'y', 'z', 'zz'], dtype='object')

In [122]:
df.describe() # per-column summary statistics: count, mean, std, min, quartiles, max

Unnamed: 0,A,B,C
count,4.0,4.0,4.0
mean,35.5,46.25,57.0
std,31.859065,32.128648,32.403703
min,10.0,20.0,30.0
25%,10.75,21.5,32.25
50%,27.5,38.5,49.5
75%,52.25,63.25,74.25
max,77.0,88.0,99.0


In [123]:
df.nunique() # number of distinct values in each column

A    4
B    4
C    4
dtype: int64

In [124]:
df["A"].unique() # array of the distinct values found in column A

array([11, 44, 77, 10], dtype=int64)

In [125]:
df.shape  # (number of rows, number of columns) as a tuple

(4, 3)

In [126]:
df.info() # print a summary: index range, column names, non-null counts, dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, x to zz
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       4 non-null      int64
 1   B       4 non-null      int64
 2   C       4 non-null      int64
dtypes: int64(3)
memory usage: 128.0+ bytes


In [127]:
df.size # total number of elements (rows x columns)

12

## Loading in Dataframes from Files

In [128]:
# Load the coffee sales data. This must actually run: later cells read `coffe`,
# and with the load commented out a fresh kernel fails with NameError.
coffe = pd.read_csv('./warmup-data/coffee.csv')

In [129]:
coffe.head()    # preview the first 5 rows of the coffee data

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [130]:
# Load the results table from parquet (pandas needs a parquet engine such as
# pyarrow or fastparquet installed). The next cell reads `result`, so this
# load has to execute on a fresh kernel.
result = pd.read_parquet("./data/results.parquet")

In [131]:
result.head()   # preview the first 5 rows of the results data

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,


In [132]:
# Load the Excel workbook; with no sheet_name given, pandas reads the first sheet.
# The next cell reads `olympics_data`, so this load has to execute on a fresh kernel.
olympics_data = pd.read_excel("./data/olympics-data.xlsx")

In [133]:
olympics_data.head()  # preview the first 5 rows of olympics_data

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


In [134]:
# Read only the "results" sheet of the workbook. This rebinds `olympics_data`,
# so any frame previously held under that name is replaced; the preview in the
# next cell depends on this load having run.
olympics_data = pd.read_excel("./data/olympics-data.xlsx", sheet_name="results")

In [135]:
olympics_data.head()  # preview the first 5 rows of olympics_data again

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,


In [136]:
# Load the athlete bios CSV so that `bios` exists on a fresh kernel.
bios = pd.read_csv("./data/bios.csv")

## Accessing Data with Pandas

In [139]:
coffe.head()  # preview the first 5 rows of the coffee data

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [140]:
print(coffe) # plain-text dump of the whole frame via print (no rich table rendering)

          Day Coffee Type  Units Sold
0      Monday    Espresso          25
1      Monday       Latte          15
2     Tuesday    Espresso          30
3     Tuesday       Latte          20
4   Wednesday    Espresso          35
5   Wednesday       Latte          25
6    Thursday    Espresso          40
7    Thursday       Latte          30
8      Friday    Espresso          45
9      Friday       Latte          35
10   Saturday    Espresso          45
11   Saturday       Latte          35
12     Sunday    Espresso          45
13     Sunday       Latte          35


In [141]:
coffe.head(10)  # first 10 rows of the dataframe

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [142]:
coffe.tail(5)  # last 5 rows of the dataframe

Unnamed: 0,Day,Coffee Type,Units Sold
9,Friday,Latte,35
10,Saturday,Espresso,45
11,Saturday,Latte,35
12,Sunday,Espresso,45
13,Sunday,Latte,35


In [143]:
coffe.sample(10, random_state=42)  # random sample of 10 rows; the fixed seed makes re-runs show the same rows

Unnamed: 0,Day,Coffee Type,Units Sold
6,Thursday,Espresso,40
10,Saturday,Espresso,45
9,Friday,Latte,35
7,Thursday,Latte,30
4,Wednesday,Espresso,35
2,Tuesday,Espresso,30
8,Friday,Espresso,45
5,Wednesday,Latte,25
1,Monday,Latte,15
12,Sunday,Espresso,45


In [144]:
coffe.loc[0]    # row whose index label is 0, returned as a Series (.loc is label-based, not positional)

Day              Monday
Coffee Type    Espresso
Units Sold           25
Name: 0, dtype: object

In [145]:
coffe.loc[[0,1,2]]  # rows with index labels 0, 1 and 2, returned as a DataFrame

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30


In [148]:
coffe.loc[:,["Day", "Units Sold"]]  # all rows, only the "Day" and "Units Sold" columns

Unnamed: 0_level_0,Day,Units Sold
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
Monday,Monday,25
Monday,Monday,15
Tuesday,Tuesday,30
Tuesday,Tuesday,20
Wednesday,Wednesday,35
Wednesday,Wednesday,25
Thursday,Thursday,40
Thursday,Thursday,30
Friday,Friday,45
Friday,Friday,35


In [153]:
coffe.index = coffe["Day"]  # use the Day values as row labels (modifies coffe in place); "Day" also remains a regular column

In [None]:
# Overwrite "Units Sold" in the second row (position 1) with 10.
# Position-based .iloc is used because an earlier cell replaces the integer row
# labels with day names; a label lookup such as .loc[1, ...] would then find no
# match and append a new row labelled 1 instead of editing an existing one.
coffe.iloc[1, coffe.columns.get_loc("Units Sold")] = 10

In [None]:
coffe.head()  # preview the first 5 rows to inspect the frame after the assignment above

In [154]:
# Use the "Day" column as the row index. Assigning the list ["Day"] to `coffe`
# would discard the DataFrame, so set_index is used instead. drop=False keeps
# "Day" as a regular column too, which lets this cell be re-run safely.
coffe = coffe.set_index("Day", drop=False)