In [1]:
import pandas as pd

# Reading the students Excel file
students = pd.read_excel("data/students.xlsx")
display(students.head())

# Reading the Excel file with specific column names
students = pd.read_excel(
    "data/students.xlsx",
    names=["student_id", "full_name", "favourite_food", "meal_plan", "age"],
    skiprows=1  # skip the first row as it may contain headers
)
display(students.head())

# Reading the Excel file with specific column names and NA values
students = pd.read_excel(
    "data/students.xlsx",
    names=["student_id", "full_name", "favourite_food", "meal_plan", "age"],
    skiprows=1,  # skip the first row as it may contain headers
    na_values=["", "N/A"]  # specify NA values
)
display(students.head())

# Reading the Excel file with specific data types
students = pd.read_excel(
    "data/students.xlsx",
    names=["student_id", "full_name", "favourite_food", "meal_plan", "age"],
    skiprows=1,  # skip the first row as it may contain headers
    na_values=["", "N/A"],  # specify NA values
    dtype={"student_id": int, "full_name": str, "favourite_food": str, "meal_plan": str, "age": str}
)
display(students.head())

# Cleaning the age column and converting strings to numbers
students['age'] = students['age'].replace('five', '5')
students['age'] = pd.to_numeric(students['age'], errors='coerce')
display(students.head())

# Reading specific sheets from the penguins Excel file
penguins_torgersen = pd.read_excel("data/penguins.xlsx", sheet_name="Torgersen Island", na_values="NA")
penguins_biscoe = pd.read_excel("data/penguins.xlsx", sheet_name="Biscoe Island", na_values="NA")
penguins_dream = pd.read_excel("data/penguins.xlsx", sheet_name="Dream Island", na_values="NA")

# Concatenating the different penguin dataframes into one
penguins = pd.concat([penguins_torgersen, penguins_biscoe, penguins_dream], ignore_index=True)

# Displaying the dimensions of each sheet and the combined dataframe
dimensions = {
    "penguins_torgersen": penguins_torgersen.shape,
    "penguins_biscoe": penguins_biscoe.shape,
    "penguins_dream": penguins_dream.shape,
    "penguins": penguins.shape
}

dimensions, penguins.head()

Unnamed: 0,Student ID,Full Name,favourite.food,mealPlan,AGE
0,1,Sunil Huffmann,Strawberry yoghurt,Lunch only,4
1,2,Barclay Lynn,French fries,Lunch only,5
2,3,Jayendra Lyne,,Breakfast and lunch,7
3,4,Leon Rossini,Anchovies,Lunch only,
4,5,Chidiegwu Dunkel,Pizza,Breakfast and lunch,five


Unnamed: 0,student_id,full_name,favourite_food,meal_plan,age
0,2,Barclay Lynn,French fries,Lunch only,5
1,3,Jayendra Lyne,,Breakfast and lunch,7
2,4,Leon Rossini,Anchovies,Lunch only,
3,5,Chidiegwu Dunkel,Pizza,Breakfast and lunch,five
4,6,Güvenç Attila,Ice cream,Lunch only,6


Unnamed: 0,student_id,full_name,favourite_food,meal_plan,age
0,2,Barclay Lynn,French fries,Lunch only,5
1,3,Jayendra Lyne,,Breakfast and lunch,7
2,4,Leon Rossini,Anchovies,Lunch only,
3,5,Chidiegwu Dunkel,Pizza,Breakfast and lunch,five
4,6,Güvenç Attila,Ice cream,Lunch only,6


Unnamed: 0,student_id,full_name,favourite_food,meal_plan,age
0,2,Barclay Lynn,French fries,Lunch only,5
1,3,Jayendra Lyne,,Breakfast and lunch,7
2,4,Leon Rossini,Anchovies,Lunch only,
3,5,Chidiegwu Dunkel,Pizza,Breakfast and lunch,five
4,6,Güvenç Attila,Ice cream,Lunch only,6


Unnamed: 0,student_id,full_name,favourite_food,meal_plan,age
0,2,Barclay Lynn,French fries,Lunch only,5.0
1,3,Jayendra Lyne,,Breakfast and lunch,7.0
2,4,Leon Rossini,Anchovies,Lunch only,
3,5,Chidiegwu Dunkel,Pizza,Breakfast and lunch,5.0
4,6,Güvenç Attila,Ice cream,Lunch only,6.0


({'penguins_torgersen': (52, 8),
  'penguins_biscoe': (168, 8),
  'penguins_dream': (124, 8),
  'penguins': (344, 8)},
   species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
 0  Adelie  Torgersen            39.1           18.7              181.0   
 1  Adelie  Torgersen            39.5           17.4              186.0   
 2  Adelie  Torgersen            40.3           18.0              195.0   
 3  Adelie  Torgersen             NaN            NaN                NaN   
 4  Adelie  Torgersen            36.7           19.3              193.0   
 
    body_mass_g     sex  year  
 0       3750.0    male  2007  
 1       3800.0  female  2007  
 2       3250.0  female  2007  
 3          NaN     NaN  2007  
 4       3450.0  female  2007  )

In [2]:
# Reading the entire Excel file into a DataFrame
deaths_path = "data/deaths.xlsx"
deaths = pd.read_excel(deaths_path)
display(deaths.head())

# Reading a specific range from the Excel file
deaths_range = pd.read_excel(deaths_path, usecols="A:F", nrows=11, skiprows=4) # Range A5:F15
deaths_range

Unnamed: 0,Lots of people,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,simply cannot resist writing,,,,,some notes
1,at,the,top,,of,their spreadsheets
2,or,merging,,,,cells
3,Name,Profession,Age,Has kids,Date of birth,Date of death
4,David Bowie,musician,69,True,1947-01-08 00:00:00,2016-01-10 00:00:00


Unnamed: 0,Name,Profession,Age,Has kids,Date of birth,Date of death
0,David Bowie,musician,69.0,1.0,1947-01-08,2016-01-10
1,Carrie Fisher,actor,60.0,1.0,1956-10-21,2016-12-27
2,Chuck Berry,musician,90.0,1.0,1926-10-18,2017-03-18
3,Bill Paxton,actor,61.0,1.0,1955-05-17,2017-02-25
4,Prince,musician,57.0,1.0,1958-06-07,2016-04-21
5,Alan Rickman,actor,69.0,0.0,1946-02-21,2016-01-14
6,Florence Henderson,actor,82.0,1.0,1934-02-14,2016-11-24
7,Harper Lee,author,89.0,0.0,1926-04-28,2016-02-19
8,Zsa Zsa Gábor,actor,99.0,1.0,1917-02-06,2016-12-18
9,George Michael,musician,53.0,0.0,1963-06-25,2016-12-25


In [3]:
import pandas as pd

# Creating the bake_sale DataFrame equivalent to the tibble in R
bake_sale = pd.DataFrame({
    'item': pd.Categorical(["brownie", "cupcake", "cookie"]),
    'quantity': [10, 5, 8]
})

# Display the bake_sale DataFrame
display(bake_sale)

# Writing the bake_sale DataFrame to an Excel file
bake_sale_path = "data/bake-sale.xlsx"
bake_sale.to_excel(bake_sale_path, index=False)

# Reading the newly written bake-sale.xlsx to confirm it's written correctly
bake_sale_read = pd.read_excel(bake_sale_path)
display(bake_sale_read)

# Reading the survey.xlsx file
survey_path = "data/survey.xlsx"
survey = pd.read_excel(survey_path)
display(survey.head())

# Reading the roster.xlsx file
roster_path = "data/roster.xlsx"
roster = pd.read_excel(roster_path)
display(roster.head())

# Reading the sales.xlsx file skipping the first 3 rows
sales_path = "data/sales.xlsx"
sales = pd.read_excel(sales_path, skiprows=3)
sales.head()

Unnamed: 0,item,quantity
0,brownie,10
1,cupcake,5
2,cookie,8


Unnamed: 0,item,quantity
0,brownie,10
1,cupcake,5
2,cookie,8


Unnamed: 0,survey_id,n_pets
0,1,0
1,2,1
2,3,
3,4,two
4,5,2


Unnamed: 0,group,subgroup,id
0,1.0,A,1
1,,,2
2,,,3
3,,B,4
4,,,5


Unnamed: 0.1,Unnamed: 0,Unnamed: 1
0,Brand 1,n
1,1234,8
2,8721,2
3,1822,3
4,Brand 2,n
