<img src="img/python-logo-notext.svg"
     style="display:block;margin:auto;width:10%"/>
<br>
<div style="text-align:center; font-size:200%;"><b>Pandas Data Frames 1</b></div>
<br/>
<div style="text-align:center;">Dr. Matthias Hölzl</div>

# Data Frames

Data frames are the most commonly used Pandas data structure. They allow us to
easily read, process, and save table-based data.

Conceptually, a data frame consists of multiple series instances sharing a
common index.

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd

In [None]:
# NOTE: machine-specific absolute Windows path to the folder that holds the
# example CSV/Excel files read below; adjust it to where the data lives on
# your system before running the notebook.
pandas_dir_path = Path(r"C:\Users\tc\Programming\Python\Courses\Own\python-programmierer\Data\Pandas")

## Creating Data Frames

### From a NumPy Array

In [None]:
def create_data_frame():
    """Build the small demo data frame used throughout this notebook.

    The frame has rows labelled A-E and columns labelled w-z. Its values are
    drawn from a normal distribution with mean 0 and standard deviation 5,
    using a generator seeded with 42, so every call returns the same numbers.
    """
    row_labels = "A B C D E".split()
    column_labels = "w x y z".split()
    # Fixed seed: each call reproduces the same values, which lets later
    # cells reset `df` to a known state.
    generator = np.random.default_rng(42)
    values = generator.normal(
        loc=0.0,
        scale=5.0,
        size=(len(row_labels), len(column_labels)),
    )
    return pd.DataFrame(data=values, index=row_labels, columns=column_labels)

In [None]:
df = create_data_frame()
df

In [None]:
type(df)

### From a CSV File

In [None]:
# With the default settings no file column is used as the index; pandas
# numbers the rows 0..n-1 instead.
df_csv = pd.read_csv(pandas_dir_path / "example_data.csv")

In [None]:
df_csv

In [None]:
# index_col=0: use the first column of the file as the row index instead of
# a generated integer index.
df_csv = pd.read_csv(pandas_dir_path / "example_data.csv", index_col=0)

In [None]:
df_csv

### From an Excel File

In [None]:
df_excel = pd.read_excel(pandas_dir_path / "excel_data.xlsx", index_col=0)

In [None]:
df_excel

In [None]:
# sheet_name is not given, so read_excel loads the first sheet of the workbook.
df_excel2 = pd.read_excel(pandas_dir_path / "excel_other_sheet.xlsx", index_col=0)

In [None]:
df_excel2

In [None]:
# Read a specific sheet and control how its rows are interpreted.
df_excel2 = pd.read_excel(
    pandas_dir_path / "excel_other_sheet.xlsx",
    index_col=0,  # first column becomes the row index
    sheet_name="Another Sheet",  # select the sheet by name instead of the first one
    header=0,  # the first row provides the column names
    skiprows=[1]  # skip the row directly below the header (0-based row 1)
)

In [None]:
df_excel2.head()


### Other Formats:

- `pd.read_clipboard`
- `pd.read_html`
- `pd.read_json`
- `pd.read_pickle`
- `pd.read_sql` (uses SQLAlchemy to access a database)
- ...

# Plotting Data Frames

In [None]:
df_csv["Col 0"].hist(bins=15);

In [None]:
df_csv.hist(bins=20, figsize=(12, 8));

In [None]:
df_csv.plot(kind="scatter", x="Col 1", y="Col 2");

In [None]:
# Same scatter plot, but each point is coloured by its value in "Col 3",
# mapped through the "hot" colormap.
df_csv.plot(kind="scatter", x="Col 1", y="Col 2", c="Col 3", cmap="hot");

### Indices and Operations

In [None]:
df_csv.head()

In [None]:
df_csv.tail()

In [None]:
df = create_data_frame()
df["w"]

In [None]:
type(df["w"])

In [None]:
# Attribute-style access is a shortcut for df["w"]. It only works when the
# column name is a valid Python identifier that does not clash with an
# existing DataFrame attribute or method, so use it only interactively.
df.w

In [None]:
df[["w", "y"]]

In [None]:
df.index

In [None]:
df.index.is_monotonic_increasing

In [None]:
df.size

In [None]:
df.ndim

In [None]:
df.shape

### Creating, Renaming, and Deleting Columns

In [None]:
# Start again from the unmodified demo frame.
df = create_data_frame()
# Assigning to a new column label adds that column; here it holds the
# element-wise sum of columns w and y.
df["Sum of w and y"] = df["w"] + df["y"]

In [None]:
df

In [None]:
# Without inplace=True, rename returns a renamed copy (displayed here) and
# leaves df itself unchanged.
df.rename(columns={"Sum of w and y": "w + y"})

In [None]:
df

In [None]:
# inplace=True modifies df itself: the column is renamed and the row label
# E becomes Z. Nothing is displayed because the call returns None.
df.rename(columns={"Sum of w and y": "w + y"}, index={"E": "Z"}, inplace=True)

In [None]:
df

In [None]:
type(df["y"])

In [None]:
# del removes column y from df in place.
del df["y"]

In [None]:
df

In [None]:
# drop removes the row labelled A (rows are the default axis) and returns a
# new frame; df itself is unchanged.
df.drop("A")

In [None]:
df

In [None]:
df.drop("B", inplace=True)

In [None]:
df

In [None]:
# axis=1 drops a column instead of a row; without inplace=True df still
# contains column z afterwards.
df.drop("z", axis=1)

In [None]:
df

In [None]:
df.drop("z", axis=1, inplace=True)

In [None]:
df

## Selection

In [None]:
df = create_data_frame()
df

In [None]:
df["w"]

In [None]:
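# Error: indexing with [] looks up column labels, and "A" is a row label,
# so the following line raises a KeyError.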
# df['A']

In [None]:
# .loc selects by label: row B is returned as a Series indexed by the
# column names.
df.loc["B"]

In [None]:
type(df.loc["B"])

In [None]:
df

In [None]:
# .iloc selects by integer position: position 1 is the second row (label B).
df.iloc[1]

In [None]:
df.loc[["A", "C"]]

In [None]:
df.loc[["A", "C"], ["x", "y"]]

In [None]:
df.loc["B", "z"]

In [None]:
df.iloc[[1, 2], [0, 3]]

In [None]:
df.iloc[0, 0]

## Conditional Selection

In [None]:
df = create_data_frame()
df

In [None]:
df > 0  # noqa

In [None]:
# Indexing with a boolean frame keeps the original shape; entries where the
# condition is False are replaced by NaN.
df[df > 0]

In [None]:
df["w"] > 0  # noqa

In [None]:
df[df["w"] > 0]

In [None]:
# Chained selection: first keep the rows where w > 0, then pick columns x
# and y from that result. A single-step equivalent for reading is
# df.loc[df["w"] > 0, ["x", "y"]].
df[df["w"] > 0][["x", "y"]]

In [None]:
# Combine conditions with & (element-wise "and"). The parentheses are
# required because & binds more tightly than the comparison operators.
df[(df["w"] > 0) & (df["x"] < 0)]


# Information about Data Frames

In [None]:
# Build a frame with mixed column types and one missing value so that the
# describe()/info()/dtypes cells below have something to report.
df = create_data_frame()
df["txt"] = "a b c d e".split()  # string column next to the numeric ones
df.iloc[1, 1] = np.nan  # row B, column x becomes a missing value
df

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.dtypes

## Data Frame Index

In [None]:
df = create_data_frame()
df["txt"] = "a b c d e".split()
df

In [None]:
# reset_index returns a new frame whose rows are numbered 0..n-1 and whose
# old index became a regular column named "index"; df is unchanged.
df.reset_index()

In [None]:
df

In [None]:
df.reset_index(inplace=True)

In [None]:
df

In [None]:
df.rename(columns={"index": "old_index"}, inplace=True)

In [None]:
df

In [None]:
# set_index returns a new frame that uses column txt as its index; df is
# unchanged because inplace=True is not given.
df.set_index("txt")

In [None]:
df

In [None]:
df.set_index("txt", inplace=True)
df

In [None]:
# Setting a new index replaces the current one: the txt index set in the
# previous cell is discarded rather than turned back into a column.
df.set_index("old_index", inplace=True)
df

In [None]:
df.info()

In [None]:
df.index

In [None]:
# Clear the index name ("old_index") so it is no longer shown above the
# row labels.
df.index.name = None

In [None]:
df