In [8]:
import pandas as pd
import re
!pip install pyarrow
from pyarrow import parquet as pq
from io import StringIO

# Load the students table directly from its published URL and display it.
students = pd.read_csv(filepath_or_buffer="https://pos.it/r4ds-students-csv")
print(students)

# Load the table again from the local copy, additionally treating the text
# "N/A" and empty cells as missing values.
students = pd.read_csv(
    "data/students.csv",
    na_values=["N/A", ""],
)
print(students)

# Give the two space-separated headers snake_case names.
students = students.rename(
    {"Student ID": "student_id", "Full Name": "full_name"},
    axis="columns",
)
print(students)

# Using a snake_case convention for column names.
#
# Lower-casing and replacing spaces alone is not enough: "mealPlan" would
# become "mealplan" and "favourite.food" would keep its dot, so the
# 'meal_plan' lookup below could never succeed. The helper therefore also
# splits camelCase boundaries and collapses every run of non-alphanumeric
# characters into a single underscore.
def _to_snake_case(name):
    """Return *name* converted to snake_case.

    "Student ID" -> "student_id", "favourite.food" -> "favourite_food",
    "mealPlan" -> "meal_plan", "AGE" -> "age". Names that are already
    snake_case are returned unchanged.
    """
    # Insert an underscore at each lower/digit -> upper boundary (camelCase).
    text = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", "_", str(name))
    # Collapse spaces, dots, colons, repeated underscores, etc. into one "_".
    text = re.sub(r"[\W_]+", "_", text)
    return text.strip("_").lower()


students.columns = [_to_snake_case(column) for column in students.columns]

# Mutating meal_plan column: store it as a categorical.
if 'meal_plan' in students.columns:
    students['meal_plan'] = students['meal_plan'].astype('category')
else:
    print("Column 'meal_plan' not found in the DataFrame.")

# Parse the age column: swap the spelled-out entry 'five' for the digit
# string '5', then cast the whole column to float.
age_as_text = students['age'].replace({'five': '5'})
students['age'] = age_as_text.astype(float)
print(students)

# Parse CSV text held in memory; StringIO wraps the string in a file-like
# object that read_csv can consume.
inline_csv = """
a,b,c
1,2,3
4,5,6
"""
df_from_str = pd.read_csv(StringIO(inline_csv))
print(df_from_str)

# Skipping rows.
#
# `skiprows` counts physical lines, blank ones included. The text must
# therefore start directly with the first metadata line (note the trailing
# backslash after the opening quotes): a leading blank line would use up one
# of the two skipped rows and promote the second metadata line to the header.
metadata_csv = """\
The first line of metadata
The second line of metadata
x,y,z
1,2,3
"""
df_with_skip = pd.read_csv(StringIO(metadata_csv), skiprows=2)
print(df_with_skip)

# Parse CSV text that contains a comment line; with comment='#' the parser
# ignores everything from '#' to the end of the line.
commented_csv = """
# A comment I want to skip
x,y,z
1,2,3
"""
df_with_comment = pd.read_csv(StringIO(commented_csv), comment='#')
print(df_with_comment)

# Parse CSV text that has no header row; header=None makes pandas label the
# columns with the default integers 0, 1, 2.
headerless_csv = """
1,2,3
4,5,6
"""
df_without_colnames = pd.read_csv(StringIO(headerless_csv), header=None)
print(df_without_colnames)

# Parse header-less CSV text and supply the column labels explicitly.
unlabelled_csv = """
1,2,3
4,5,6
"""
column_labels = ["x", "y", "z"]
df_with_colnames = pd.read_csv(StringIO(unlabelled_csv), names=column_labels)
print(df_with_colnames)

# Parse CSV text whose columns hold different kinds of values (boolean-like
# words, numbers, dates and plain strings) using pandas' default inference.
mixed_csv = """
logical,numeric,date,string
TRUE,1,2021-01-15,abc
false,4.5,2021-02-15,def
T,Inf,2021-02-16,ghi
"""
mixed_df = pd.read_csv(StringIO(mixed_csv))
print(mixed_df)

# Parse a one-column CSV in which a lone "." marks a missing value.
# The text starts with an empty line and has no trailing newline.
simple_csv = "\n".join(["", "x", "10", ".", "20", "30"])
na_markers = ["."]
df_with_na = pd.read_csv(StringIO(simple_csv), na_values=na_markers)
print(df_with_na)

# Read several CSV files and stack them into a single frame.
sales_files = [
    "https://pos.it/r4ds-01-sales",
    "https://pos.it/r4ds-02-sales",
    "https://pos.it/r4ds-03-sales",
]
dfs = list(map(pd.read_csv, sales_files))
# Passing the URLs as keys adds an outer index level, named 'file', that
# records which source each row came from.
merged_df = pd.concat(objs=dfs, keys=sales_files, names=['file'])

print(students)

# Write the students table to a CSV file, leaving the index out.
students.to_csv(path_or_buf="data/students-2.csv", index=False)

# Load the file that was just written and display it.
students_from_csv = pd.read_csv(filepath_or_buffer="data/students-2.csv")
print(students_from_csv)

# RDS round trip omitted: RDS is an R-specific format that pandas cannot read or write natively.

# Write the students table to a Parquet file, leaving the index out.
students.to_parquet(path="data/students.parquet", index=False)

# Load the Parquet file through pyarrow, then convert the Arrow table into
# a pandas DataFrame.
arrow_table = pq.read_table("data/students.parquet")
students_from_parquet = arrow_table.to_pandas()
print(students_from_parquet)

# Build a DataFrame column by column from a mapping of column name to values.
df_from_dict = pd.DataFrame(
    data=dict(
        x=[1, 2, 5],
        y=['h', 'm', 'g'],
        z=[0.08, 0.83, 0.60],
    )
)
print(df_from_dict)


   Student ID         Full Name      favourite.food             mealPlan   AGE
0           1    Sunil Huffmann  Strawberry yoghurt           Lunch only     4
1           2      Barclay Lynn        French fries           Lunch only     5
2           3     Jayendra Lyne                 NaN  Breakfast and lunch     7
3           4      Leon Rossini           Anchovies           Lunch only   NaN
4           5  Chidiegwu Dunkel               Pizza  Breakfast and lunch  five
5           6     Güvenç Attila           Ice cream           Lunch only     6
   Unnamed: 0  Student ID         Full Name      favourite.food  \
0           1           1    Sunil Huffmann  Strawberry yoghurt   
1           2           2      Barclay Lynn        French fries   
2           3           3     Jayendra Lyne                 NaN   
3           4           4      Leon Rossini           Anchovies   
4           5           5  Chidiegwu Dunkel               Pizza   
5           6           6     Güvenç Attila  