# Pandas Input/Output

Pandas can read many file formats: `.csv`, `.json`, `.xlsx`, `.parquet`, `.db`, `.hdf`, ...


https://pandas.pydata.org/pandas-docs/stable/reference/io.html

In [None]:
import json

import pandas as pd

### Our Data

- `attacks.csv` from Kaggle
- `github_pulls.json` from GitHub API
- `2023_Accidentalidad.xlsx` from Datos Abiertos Ayuntamiento de Madrid
- `test.parquet` from Kaggle

__.CSV Files__

In [None]:
# Raw "Comma"-Separated Values file (a.k.a.: CSV file), read as plain text

with open('./datasets/attacks.csv') as f:
    lines = f.readlines()

# Preview only the first few raw lines; echoing the whole list floods the output.
# The full list is still available in `lines`.
lines[:5]

In [None]:
# Load the .CSV file into a DataFrame
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html#

df_csv = pd.read_csv(filepath_or_buffer='./datasets/attacks.csv')
df_csv.head(n=5)

In [None]:
# Codecs: table whose 'Codec' column is later tried as the `encoding` for the attacks file

codecs = pd.read_csv(filepath_or_buffer='./datasets/pandas_codecs.csv')
all_codecs = codecs['Codec'].shape[0]  # how many candidate codecs there are

print(all_codecs)
codecs

In [None]:
# Searching the right codec: try every candidate encoding and keep the ones
# that read the file without raising. (A clean read does not by itself prove
# the text was decoded correctly.)

good_codec = []
for codec in codecs['Codec']:
    try:
        # The parsed frame is discarded on purpose: only success/failure matters
        # here, and `df_csv` should not end up holding whichever codec ran last.
        pd.read_csv('./datasets/attacks.csv', encoding=codec)
    except Exception:
        # Best-effort probe: any failure just means this codec is unusable.
        # Unlike a bare `except:`, this lets KeyboardInterrupt / SystemExit
        # through, so the loop can still be interrupted.
        continue
    good_codec.append(codec)

f'{len(good_codec)} out of {all_codecs} codecs can parse the information'

In [None]:
# Finally: reload the file with the first codec that parsed it

df_csv = pd.read_csv(
    filepath_or_buffer='./datasets/attacks.csv',
    encoding=good_codec[0],
)
df_csv.head(n=5)

In [None]:
# Let's explore the dataset

print(df_csv.shape)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print()
df_csv.info()

# Keep a handful of columns and drop rows with no country. The export cell
# below uses `df_csv_short`, so it has to be defined here for a top-to-bottom run.
df_csv_short = df_csv[['Date', 'Type', 'Country', 'Injury']].dropna(subset=['Country'])
print(df_csv_short.shape)
df_csv_short.info()

In [None]:
# Export the shortened frame as a new .csv file (';'-delimited, index column omitted)

df_csv_short.to_csv(
    path_or_buf='./datasets/outputs/shark_attacks_short.csv',
    index=False,
    sep=';',
)

In [None]:
# Read the exported file back. It was written with sep=';', so the same
# delimiter must be passed here or every row lands in a single column.
df_csv_s = pd.read_csv('./datasets/outputs/shark_attacks_short.csv', sep=';')

df_csv_s.head()

---

__.JSON Files__

In [None]:
# Raw JavaScript Object Notation file (a.k.a.: JSON file), parsed with the standard-library json module

with open('./datasets/github_pulls.json', encoding="utf8") as f:
    json_file = json.load(f)

json_file

In [None]:
# Load the .JSON file into a DataFrame
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.io.json.read_json.html#pandas.io.json.read_json

df_json = pd.read_json(path_or_buf='./datasets/github_pulls.json')
df_json

In [None]:
# Let's explore the dataset

print(df_json.shape)
# info() writes its summary to stdout and returns None; wrapping it in print()
# would add a stray "None" line after the report.
df_json.info()

In [None]:
# Dict flatten: build a separate DataFrame from the entries of the '_links' column
# (presumably one dict per row; verify against the JSON source)

df_json_new = pd.DataFrame(list(df_json['_links']))
df_json_new.head(n=5)

In [None]:
# Write the flattened frame out as a new .JSON file

df_json_new.to_json(path_or_buf='./datasets/outputs/github_pulls_new.json')

---

__.XLSX Files__

In [None]:
# Additional libraries for Excel files
# (pandas relies on an optional engine package to parse Excel workbooks;
#  uncomment and run once if the engine is not installed in this environment)

#!conda install -y xlrd
#!conda install -y openpyxl

In [None]:
%%time

# Import .XLSX 
# https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html

df_excel = pd.read_excel('./datasets/2024_Accidentalidad.xlsx', sheet_name='Hoja1')
df_excel.head()

In [None]:
# Let's explore the dataset

print(df_excel.shape)
# info() writes its summary to stdout and returns None; wrapping it in print()
# would add a stray "None" line after the report.
df_excel.info()

---

__.PARQUET Files__

In [None]:
# Additional library for Parquet files
# (uncomment and run once if no Parquet engine such as pyarrow is installed)

#!conda install -c conda-forge pyarrow

In [None]:
%%time

# Import .PARQUET (column-oriented data storage format with schema)
# https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html

df_parquet = pd.read_parquet('../../../data/test.parquet')
df_parquet.head()

In [None]:
# Let's explore the dataset (https://www.kaggle.com/dschettler8845/recsys-2020-ecommerce-dataset)

print(df_parquet.shape)
# info() writes its summary to stdout and returns None; wrapping it in print()
# would add a stray "None" line after the report.
df_parquet.info()

---

__SQL Files...in the next episode...__