## Intro to Dataframes

### Basics

In [185]:
import pandas as pd
import numpy as np

In [186]:
# Small 5x3 demo frame with string row labels; the last row repeats the first one.
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [10, 11, 12],
        [1, 2, 3],
    ],
    index=["1", "2", "3", "4", "5"],
    columns=["Column 1", "Column 2", "Column 3"],
)

In [187]:
df.index.tolist()

['1', '2', '3', '4', '5']

In [188]:
df.head()

Unnamed: 0,Column 1,Column 2,Column 3
1,1,2,3
2,4,5,6
3,7,8,9
4,10,11,12
5,1,2,3


In [189]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 1 to 5
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Column 1  5 non-null      int64
 1   Column 2  5 non-null      int64
 2   Column 3  5 non-null      int64
dtypes: int64(3)
memory usage: 160.0+ bytes


In [190]:
df.describe()

Unnamed: 0,Column 1,Column 2,Column 3
count,5.0,5.0,5.0
mean,4.6,5.6,6.6
std,3.911521,3.911521,3.911521
min,1.0,2.0,3.0
25%,1.0,2.0,3.0
50%,4.0,5.0,6.0
75%,7.0,8.0,9.0
max,10.0,11.0,12.0


In [191]:
df.shape

(5, 3)

In [192]:
df.nunique()

Column 1    4
Column 2    4
Column 3    4
dtype: int64

#### Unique values in a specific column

In [193]:
df['Column 1'].unique()

array([ 1,  4,  7, 10])

In [194]:
df

Unnamed: 0,Column 1,Column 2,Column 3
1,1,2,3
2,4,5,6
3,7,8,9
4,10,11,12
5,1,2,3


### Loading in Dataframes from file

In [195]:
# Load the two CSV datasets used below; both paths are relative to the notebook's working directory.
coffee = pd.read_csv('./warmup-data/coffee.csv')
results = pd.read_csv('./data/results.csv')

In [196]:
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [197]:
# Use a path relative to the notebook (like the other read_csv calls) instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
bios = pd.read_csv('./data/bios.csv')

In [198]:
coffee

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [199]:
coffee.tail(5)

Unnamed: 0,Day,Coffee Type,Units Sold
9,Friday,Latte,35
10,Saturday,Espresso,45
11,Saturday,Latte,35
12,Sunday,Espresso,45
13,Sunday,Latte,35


In [200]:
coffee.sample(3, random_state=1)  # random_state pins the seed, so the same 3 rows come back on every run

Unnamed: 0,Day,Coffee Type,Units Sold
3,Tuesday,Latte,20
7,Thursday,Latte,30
6,Thursday,Espresso,40


### Loc


In [201]:
coffee

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


In [202]:
coffee.loc[0] # single row

Day              Monday
Coffee Type    Espresso
Units Sold           25
Name: 0, dtype: object

In [203]:
coffee.loc[[0,1,2]] # multiple rows

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30


In [204]:
coffee.loc[0:4, ["Day", "Units Sold"]]  # rows labelled 0 through 4 (label slices include the end) and two named columns

Unnamed: 0,Day,Units Sold
0,Monday,25
1,Monday,15
2,Tuesday,30
3,Tuesday,20
4,Wednesday,35


In [205]:
coffee.loc[:, ["Day", "Coffee Type"]] # all rows and selected columns

Unnamed: 0,Day,Coffee Type
0,Monday,Espresso
1,Monday,Latte
2,Tuesday,Espresso
3,Tuesday,Latte
4,Wednesday,Espresso
5,Wednesday,Latte
6,Thursday,Espresso
7,Thursday,Latte
8,Friday,Espresso
9,Friday,Latte


### ILOC

In [206]:
coffee.iloc[0:4, [0,1,2]]  # integer positions: rows 0-3 (the slice end is excluded) and the first three columns

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20


In [207]:
# coffee.index = coffee["Day"]  # (disabled) would use the Day column as the index; left off because the cells below address rows by integer label

In [208]:
# coffee.loc["Monday":"Wednesday"]  # (disabled) row slice by index label; only meaningful once the Day column is the index

In [209]:
coffee.loc[1, "Units Sold"] = 10 # modify the Units Sold from 15 to 10 at index 1

In [210]:
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,10
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [211]:
coffee.loc[1,"Coffee Type"] = "Espresso" # modify the Coffee Type at index 1

In [212]:
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Espresso,10
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [213]:
coffee.Day  # attribute (dot) access works when the column name is a valid Python identifier, e.g. has no spaces

0        Monday
1        Monday
2       Tuesday
3       Tuesday
4     Wednesday
5     Wednesday
6      Thursday
7      Thursday
8        Friday
9        Friday
10     Saturday
11     Saturday
12       Sunday
13       Sunday
Name: Day, dtype: object

### Sorting Data

In [214]:
# Sort by units sold (ascending); ties are ordered by coffee type in descending order.
coffee.sort_values(by=['Units Sold', 'Coffee Type'], ascending=[True, False])

Unnamed: 0,Day,Coffee Type,Units Sold
1,Monday,Espresso,10
3,Tuesday,Latte,20
5,Wednesday,Latte,25
0,Monday,Espresso,25
7,Thursday,Latte,30
2,Tuesday,Espresso,30
9,Friday,Latte,35
11,Saturday,Latte,35
13,Sunday,Latte,35
4,Wednesday,Espresso,35


In [215]:
# Row-by-row iteration example (kept disabled):
# for index, row in coffee.iterrows():
#     print(index)
#     print(row)
#     print('\n')

### Filtering Data

In [216]:
bios.head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


In [217]:
bios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145500 entries, 0 to 145499
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   athlete_id    145500 non-null  int64  
 1   name          145500 non-null  object 
 2   born_date     143693 non-null  object 
 3   born_city     110908 non-null  object 
 4   born_region   110908 non-null  object 
 5   born_country  110908 non-null  object 
 6   NOC           145499 non-null  object 
 7   height_cm     106651 non-null  float64
 8   weight_kg     102070 non-null  float64
 9   died_date     33940 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 11.1+ MB


In [218]:
# Boolean-mask filtering: keep only athletes taller than 215 cm.
# Explicit form that also restricts the columns:
#   bios.loc[bios['height_cm'] > 215, ['name', 'born_country', 'height_cm']]
new_bios = bios.loc[bios['height_cm'].gt(215)]
new_bios.head(2)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
5089,5108,Viktor Pankrashkin,1957-06-19,Moskva (Moscow),Moskva,RUS,Soviet Union,220.0,112.0,1993-07-24
5583,5606,Paulinho Villas Boas,1963-01-26,São Paulo,São Paulo,BRA,Brazil,217.0,106.0,


#### Combining conditions

In [219]:
bios[(bios['height_cm'] > 215) & (bios['born_country'] == 'USA')]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
5781,5804,Tommy Burleson,1952-02-24,Crossnore,North Carolina,USA,United States,223.0,102.0,
6722,6755,Shaquille O'Neal,1972-03-06,Newark,New Jersey,USA,United States,216.0,137.0,
6937,6972,David Robinson,1965-08-06,Key West,Florida,USA,United States,216.0,107.0,
123850,126093,Tyson Chandler,1982-10-02,Hanford,California,USA,United States,216.0,107.0,


In [220]:
# str.contains treats the pattern as a regular expression, so '|' means "or":
# this keeps names containing either 'Lee' or 'Keith'.
# (A single-term filter would be: bios[bios['name'].str.contains('Lee')])
lee_names = bios[bios['name'].str.contains('Lee|Keith')]
lee_names.head(5)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
896,900,Lee Heung-Sun,1971-11-19,,,,Republic of Korea,,,
897,901,Lee Jeong-Im,1971-07-01,,,,Republic of Korea,,,
898,902,Lee Jeong-Myeong,1967-09-08,,,,Republic of Korea,170.0,60.0,
920,924,Lee Gwang-Jin,1970-12-05,,,,Republic of Korea,175.0,,
921,925,Lee Sang-Bok,1968-03-17,,,,Republic of Korea,,,


In [221]:
# Athletes born in the year 2000: born_date is text, so match values that start with '2000'.
# na=False makes rows with a missing born_date count as non-matches.
born_2000 = bios.loc[bios['born_date'].str.contains(r'^2000', na=False)]
born_2000.head(5)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
126497,128859,Fatima Hirech,2000-08-22,,,,Algeria,170.0,80.0,
126509,128872,Samantha Roberts,2000-04-21,Memphis,Tennessee,USA,Antigua and Barbuda,172.0,64.0,
126564,128931,Gayane Chiloyan,2000-09-27,Yerevan,Yerevan,ARM,Armenia,164.0,54.0,
126620,128993,Ellie Carpenter,2000-04-28,Cowra,New South Wales,AUS,Australia,165.0,59.0,
126676,129054,Aislin Jones,2000-02-08,Shepparton,Victoria,AUS,Australia,157.0,51.0,


In [222]:
# Rows where the birth country is GBR or USA and the name begins with 'Lee'.
isin = bios.loc[
    bios['born_country'].isin(['GBR', 'USA'])
    & bios['name'].str.startswith('Lee')
]
isin.head(5)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
18030,18151,Lee Carleton,1862-08-20,Cumberland,Maryland,USA,United States,,,1921-12-06
18054,18175,Lee Jones,1874-11-16,Chicago,Illinois,USA,United States,,,1937-08-11
23611,23795,Lee Shelley,1956-05-17,Beaumont,Texas,USA,United States,183.0,79.0,
29908,30134,Lee McDermott,1974-02-11,London,England,GBR,Great Britain,168.0,65.0,
54513,54905,Lee Case,1917-08-08,Curtis,Nebraska,USA,United States,,,1984-12-31


#### Using `query`

In [223]:
# born_date is stored as 'YYYY-MM-DD' text, so the '>' test below is a string comparison;
# it orders correctly only because that format sorts chronologically.
bios.query('born_country == "GBR" and born_city == "Manchester" and born_date > "1990-06-05"')

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
122292,124361,Rob Bale,1990-07-19,Manchester,England,GBR,Great Britain,174.0,69.0,
125424,127769,Bex Wilson,1991-03-17,Manchester,England,GBR,Great Britain,158.0,66.0,
127743,130217,Seren Bundy-Davies,1994-12-30,Manchester,England,GBR,Great Britain,175.0,63.0,
139478,142951,Aimee Pratt,1997-10-03,Manchester,England,GBR,Great Britain,,,
139502,142975,Charlotte Worthington,1996-06-26,Manchester,England,GBR,Great Britain,,,
139609,143085,Georgia Taylor-Brown,1994-03-15,Manchester,England,GBR,Great Britain,,,


### Adding/Removing Columns

In [224]:
# numpy is already imported as np in the notebook's first cell, so it is not re-imported here.
# np.where picks 3.99 for Espresso rows and 5.99 for every other coffee type.
coffee['New_Price'] = np.where(coffee['Coffee Type'] == 'Espresso', 3.99, 5.99)  # add new column named 'New_Price'

In [225]:
coffee['Revenue']= coffee['Units Sold'] * coffee['New_Price']    # add new column named 'Revenue' based on 2 other columns                

In [226]:
coffee = coffee.rename(columns={'New_Price':'Price'}) # rename Column

In [227]:
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold,Price,Revenue
0,Monday,Espresso,25,3.99,99.75
1,Monday,Espresso,10,3.99,39.9
2,Tuesday,Espresso,30,3.99,119.7
3,Tuesday,Latte,20,5.99,119.8
4,Wednesday,Espresso,35,3.99,139.65


### Merging & Concatenating Data

In [228]:
nocs = pd.read_csv('./data/noc_regions.csv')

In [229]:
bios.head()


Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


In [230]:
nocs.head()

Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,AHO,Curacao,Netherlands Antilles
2,ALB,Albania,
3,ALG,Algeria,
4,AND,Andorra,


In [231]:
# Left-join each athlete to the NOC lookup on the birth-country code; athletes with no match get NaN.
# Both frames have a column named NOC, so the result carries NOC_x (from bios) and NOC_y (from nocs).
bios_new = bios.merge(nocs, how='left', left_on='born_country', right_on='NOC')

In [232]:
bios_new

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC_x,height_cm,weight_kg,died_date,NOC_y,region,notes
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02,FRA,France,
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,,FRA,France,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17,FRA,France,
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20,FRA,France,
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25,GBR,UK,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145495,149222,Polina Luchnikova,2002-01-30,Serov,Sverdlovsk,RUS,ROC,167.0,61.0,,RUS,Russia,
145496,149223,Valeriya Merkusheva,1999-09-20,Moskva (Moscow),Moskva,RUS,ROC,168.0,65.0,,RUS,Russia,
145497,149224,Yuliya Smirnova,1998-05-08,Kotlas,Arkhangelsk,RUS,ROC,163.0,55.0,,RUS,Russia,
145498,149225,André Foussard,1899-05-19,Niort,Deux-Sèvres,FRA,France,166.0,,1986-03-18,FRA,France,


In [233]:
bios_new.head(5)


Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC_x,height_cm,weight_kg,died_date,NOC_y,region,notes
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02,FRA,France,
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,,FRA,France,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17,FRA,France,
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20,FRA,France,
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25,GBR,UK,


In [234]:
# Reassign the renamed frame instead of using inplace=True, matching the earlier coffee rename.
bios_new = bios_new.rename(columns={'region': 'born_country_full'})

In [235]:
bios_new

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC_x,height_cm,weight_kg,died_date,NOC_y,born_country_full,notes
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02,FRA,France,
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,,FRA,France,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17,FRA,France,
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20,FRA,France,
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25,GBR,UK,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145495,149222,Polina Luchnikova,2002-01-30,Serov,Sverdlovsk,RUS,ROC,167.0,61.0,,RUS,Russia,
145496,149223,Valeriya Merkusheva,1999-09-20,Moskva (Moscow),Moskva,RUS,ROC,168.0,65.0,,RUS,Russia,
145497,149224,Yuliya Smirnova,1998-05-08,Kotlas,Arkhangelsk,RUS,ROC,163.0,55.0,,RUS,Russia,
145498,149225,André Foussard,1899-05-19,Niort,Deux-Sèvres,FRA,France,166.0,,1986-03-18,FRA,France,


In [236]:
# Athletes whose NOC_x (the NOC column from bios) differs from their birth country's full name.
# Rows with a missing born_country_full are included too, because NaN never compares equal.
bios_new.loc[
    bios_new['NOC_x'] != bios_new['born_country_full'],
    ['name', 'NOC_x', 'born_country_full'],
]

Unnamed: 0,name,NOC_x,born_country_full
4,Albert Canet,France,UK
12,J. Defert,France,
13,Étienne Durand,France,
16,Guy Forget,France,Morocco
27,"Guy, Baron Lejeune",France,
...,...,...,...
145491,Matthew Wepke,Jamaica,
145493,Landysh Falyakhova,ROC,Russia
145495,Polina Luchnikova,ROC,Russia
145496,Valeriya Merkusheva,ROC,Russia


In [237]:
# Independent copies of the USA-born and GBR-born athletes, selected by birth-country code.
usa = bios.loc[bios['born_country'].eq('USA')].copy()
gbr = bios.loc[bios['born_country'].eq('GBR')].copy()

In [238]:
usa.head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
54,55,Monique Javer,1967-07-22,Burlingame,California,USA,Great Britain,177.0,64.0,
960,964,Xóchitl Escobedo,1968-09-17,West Covina,California,USA,Mexico,170.0,60.0,
961,965,Angélica Gavaldón,1973-10-03,El Centro,California,USA,Mexico,160.0,54.0,
1231,1238,Bert Schneider,1897-07-01,Cleveland,Ohio,USA,Canada,,,1986-02-20
1345,1352,Laura Berg,1975-01-06,Santa Fe Springs,California,USA,United States,168.0,61.0,


In [239]:
gbr.head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25
37,38,Helen Aitchison,1881-12-06,Sunderland,England,GBR,Great Britain,,,1947-05-26
38,39,Geraldine Beamish,1883-06-23,Forest Gate,England,GBR,Great Britain,,,1972-05-10
39,40,Dora Boothby,1881-08-02,Finchley,England,GBR,Great Britain,,,1970-02-22
40,41,Julie Bradbury,1967-02-12,Oxford,England,GBR,Great Britain,175.0,64.0,


In [240]:
born_city = bios[bios['born_city']=='Salford'].copy()
born_city

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
177,178,Demetrius Casdagli,1872-10-10,Salford,England,GBR,Greece,,,1931-07-06
528,530,Ethel Armitage,1873-06-21,Salford,England,GBR,Great Britain,,,1957-10-17
9722,9777,Paul Ratcliffe,1973-11-12,Salford,England,GBR,Great Britain,180.0,72.0,
42066,42389,Thomas Northcote,1893-11-30,Salford,England,GBR,Great Britain,,,1991-01-01
55356,55751,David Mercer,1961-04-16,Salford,England,GBR,Great Britain,170.0,89.0,
57758,58166,Matthew Clempner,1956-05-20,Salford,England,GBR,Great Britain,205.0,100.0,
60990,61432,David Bowker,1922-03-15,Salford,England,GBR,Great Britain,,,2020-03-18
68164,68680,Shelley Holroyd,1973-05-17,Salford,England,GBR,Great Britain,176.0,65.0,
68508,69025,William Eaton,1909-04-20,Salford,England,GBR,Great Britain,164.0,57.0,1938-04-01
68592,69109,Frank Handley,1910-10-31,Salford,England,GBR,Great Britain,178.0,64.0,1985-10-31


In [241]:
# Stack the two frames vertically (usa rows first, then gbr); each row keeps its original index label.
new_df = pd.concat([usa,gbr])
new_df.head()


Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
54,55,Monique Javer,1967-07-22,Burlingame,California,USA,Great Britain,177.0,64.0,
960,964,Xóchitl Escobedo,1968-09-17,West Covina,California,USA,Mexico,170.0,60.0,
961,965,Angélica Gavaldón,1973-10-03,El Centro,California,USA,Mexico,160.0,54.0,
1231,1238,Bert Schneider,1897-07-01,Cleveland,Ohio,USA,Canada,,,1986-02-20
1345,1352,Laura Berg,1975-01-06,Santa Fe Springs,California,USA,United States,168.0,61.0,


In [242]:
new_df.tail()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
144811,148512,Benjamin Alexander,1983-05-08,London,England,GBR,Jamaica,,,
144815,148517,Ashley Watson,1993-10-28,Peterborough,England,GBR,Jamaica,,,
145005,148716,Peder Kongshaug,2001-08-13,Wimbledon,England,GBR,Norway,184.0,86.0,
145319,149041,Axel Brown,1992-04-02,Harrogate,England,GBR,Trinidad and Tobago,,,
145388,149111,Jean-Luc Baker,1993-10-07,Burnley,England,GBR,United States,,,


In [243]:
results.head()

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,


In [244]:
# Attach the athlete biography columns to every result row; results with no matching bio get NaN.
combined_df = results.merge(bios, how='left', on='athlete_id')

In [245]:
combined_df.head()

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02


### Handling Null Values

In [249]:
# Blank out Units Sold for rows 0 and 1 to create missing values for this section.
# Storing NaN turns the integer column into float64; Revenue keeps its previously computed values.
coffee.loc[[0,1],'Units Sold'] = np.nan

In [252]:
coffee

Unnamed: 0,Day,Coffee Type,Units Sold,Price,Revenue
0,Monday,Espresso,,3.99,99.75
1,Monday,Espresso,,3.99,39.9
2,Tuesday,Espresso,30.0,3.99,119.7
3,Tuesday,Latte,20.0,5.99,119.8
4,Wednesday,Espresso,35.0,3.99,139.65
5,Wednesday,Latte,25.0,5.99,149.75
6,Thursday,Espresso,40.0,3.99,159.6
7,Thursday,Latte,30.0,5.99,179.7
8,Friday,Espresso,45.0,3.99,179.55
9,Friday,Latte,35.0,5.99,209.65


In [253]:
coffee.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Day          14 non-null     object 
 1   Coffee Type  14 non-null     object 
 2   Units Sold   12 non-null     float64
 3   Price        14 non-null     float64
 4   Revenue      14 non-null     float64
dtypes: float64(3), object(2)
memory usage: 692.0+ bytes


In [254]:
coffee.isna().sum()  # Count number of NA's

Day            0
Coffee Type    0
Units Sold     2
Price          0
Revenue        0
dtype: int64

In [257]:
# Fill only the Units Sold gaps with that column's mean. A bare coffee.fillna(value) would
# write the Units Sold mean into missing cells of every column, not just this one.
coffee = coffee.fillna({'Units Sold': coffee['Units Sold'].mean()})

In [258]:
coffee

Unnamed: 0,Day,Coffee Type,Units Sold,Price,Revenue
0,Monday,Espresso,35.0,3.99,99.75
1,Monday,Espresso,35.0,3.99,39.9
2,Tuesday,Espresso,30.0,3.99,119.7
3,Tuesday,Latte,20.0,5.99,119.8
4,Wednesday,Espresso,35.0,3.99,139.65
5,Wednesday,Latte,25.0,5.99,149.75
6,Thursday,Espresso,40.0,3.99,159.6
7,Thursday,Latte,30.0,5.99,179.7
8,Friday,Espresso,45.0,3.99,179.55
9,Friday,Latte,35.0,5.99,209.65
