In [None]:
import time

import pandas
import numpy
import math

from typing import List, Sized

from IPython.display import Markdown
from pandas.api.types import CategoricalDtype

# You can enrich your notebook with Markdown

In [None]:
# Read the Stata file that sits next to this notebook into a DataFrame.
mtcars = pandas.read_stata(filepath_or_buffer="./mtcars.dta")

In [None]:
# Keep only the rows with more than three gears and transmission code 1.
has_more_than_three_gears = mtcars["gear"].gt(3)
has_transmission_code_one = mtcars["am"].eq(1)
higher_gear_manual_cars = mtcars.loc[has_more_than_three_gears & has_transmission_code_one]

In [None]:
# Name and mpg of every car whose mpg equals the maximum (ties are all kept).
best_mpg = higher_gear_manual_cars["mpg"].max()
higher_gear_manual_cars.loc[higher_gear_manual_cars["mpg"].eq(best_mpg), ["name", "mpg"]]

In [None]:
# Same lookup through the row label returned by idxmax; wrapping the label in
# a list keeps the result a one-row DataFrame instead of a Series.
top_mpg_label = higher_gear_manual_cars["mpg"].idxmax()
higher_gear_manual_cars.loc[[top_mpg_label], ["name", "mpg"]]

In [None]:
# Resolve the names of the cars with the extreme mpg values first, then fill
# them into the Markdown template that is rendered as the cell output.
mpg_column = higher_gear_manual_cars["mpg"]
lowest_mpg_name = higher_gear_manual_cars.loc[mpg_column.idxmin(), "name"]
highest_mpg_name = higher_gear_manual_cars.loc[mpg_column.idxmax(), "name"]

Markdown("""
# Miles per gallon
- The {} is the car with the lowest miles per gallon value.
- The {} is the car with the highest miles per gallon value.
""".format(lowest_mpg_name, highest_mpg_name))


In [None]:
# Convert the unique gear values to plain Python numbers before formatting:
# a list of NumPy scalars is rendered as e.g. `[np.int64(3), ...]` by
# NumPy >= 2, which would leak type names into the Markdown output.
gear_values = sorted(mtcars["gear"].unique().tolist())

# Mean mpg for every (gear, am) combination; rendered via the frame's text repr.
mean_mpg_by_gear_and_am = mtcars[["gear", "am", "mpg"]].groupby(["gear", "am"]).mean()

Markdown("""Existing gear shift ratios: `{}`

Mean miles per gallon for the different gear ratios and transmissions:
```
{}
```
""".format(
    gear_values,
    mean_mpg_by_gear_and_am
)
)

In [None]:
# Storage type of the transmission column as loaded from the file.
mtcars.dtypes["am"]

In [None]:
# Interpret the numeric transmission codes as labelled categories
# (code 0 -> "automatic", code 1 -> "manual") and show the resulting dtype.
transmission_labels = ["automatic", "manual"]
transmission = pandas.Categorical.from_codes(mtcars["am"], categories=transmission_labels)
pandas.Series(transmission).dtype

# NumPy

In [None]:

# Compare NumPy's mean with a plain-Python mean on the same 101 integers.
# numpy.empty() would leave the buffer uninitialised, so the printed means
# would be arbitrary and differ between runs; use the defined values 0..100.
array = numpy.arange(101, dtype=numpy.int64)

# perf_counter() is the high-resolution clock meant for measuring short
# durations; the print calls are kept outside the timed region.
start = time.perf_counter()
numpy_mean = array.mean()
end = time.perf_counter()
print("NumPy value:", numpy_mean)
print("NumPy time:", end - start)

# The conversion to a Python list is part of the timed "vanilla" variant.
start = time.perf_counter()
python_list = [int(item) for item in array]
python_mean = sum(python_list) / len(python_list)
end = time.perf_counter()
print("Vanilla Python value:", python_mean)
print("Vanilla Python time:", end - start)


In [None]:
# Convert the whole DataFrame into a single NumPy array (defaults spelled out).
numpy_mtcars = mtcars.to_numpy(dtype=None, copy=False)

In [None]:
# Number of array dimensions.
numpy.ndim(numpy_mtcars)

In [None]:
# Total number of elements in the array.
numpy.size(numpy_mtcars)

In [None]:
# Length of the array along each dimension.
numpy.shape(numpy_mtcars)

In [None]:
# Rotate the array by 90 degrees clockwise; three counter-clockwise quarter
# turns (k=3) are the same rotation as one clockwise turn (k=-1).
numpy.rot90(numpy_mtcars, k=3)

In [None]:
def weighted_avg_and_std(values: Sized, weights: Sized) -> List[float]:
    """Return weighted mean, weighted standard deviation, N and error margin.

    Args:
        values: Observations, either one-dimensional or a single column of
            shape ``(n, 1)``.
        weights: One weight per observation.

    Returns:
        A list ``[average, sd, n, error]`` of plain Python numbers, where
        ``sd`` is the square root of the weighted mean squared deviation and
        ``error`` is ``1.96 * sd / sqrt(n)``.
    """
    # Flatten both inputs so that a column vector and a flat sequence are
    # handled alike and every intermediate result is a true scalar (passing a
    # 1-element array to math.sqrt relies on a deprecated NumPy conversion).
    flat_values = numpy.asarray(values, dtype=float).ravel()
    flat_weights = numpy.asarray(weights, dtype=float).ravel()
    count = len(flat_values)

    average = float(numpy.average(flat_values, weights=flat_weights))
    variance = float(
        numpy.average((flat_values - average) ** 2, weights=flat_weights)
    )
    sd = math.sqrt(variance)
    # 1.96 is the standard-normal quantile for a two-sided 95 % interval.
    error = 1.96 * sd / math.sqrt(count)
    return [average, sd, count, error]

In [None]:
# Synthetic panel data: one row per person id ("pid") and survey year ("syear").
ids = ["{:04d}".format(person_number) for person_number in range(1, 2001)]
years = range(1984, 2021)

# Build the frame in one step instead of appending one frame per year:
# DataFrame.append was removed in pandas 2.0, growing a frame in a loop is
# quadratic, and starting from an empty float frame distorts the column dtypes.
# Row order is year by year, with all ids inside each year.
income_table = pandas.DataFrame(
    {
        "pid": ids * len(years),
        "syear": numpy.repeat(list(years), len(ids)),
    }
)

# Fixed seed so the generated incomes and weights are reproducible.
random_generator = numpy.random.default_rng(42)
row_count = len(income_table)
income_table["income"] = random_generator.integers(low=1000, high=10000, size=(row_count,))
income_table["weight"] = random_generator.uniform(low=1, high=2, size=(row_count,))

In [None]:

# Group the income rows by survey year; only the columns used below are kept.
income_grouped_by = income_table[["syear", "income", "weight"]].groupby(["syear"])

# Calculate weighted average
aggregated = income_grouped_by.apply(
    lambda yearly_rows: pandas.Series(
        data=weighted_avg_and_std(
            yearly_rows[["income"]].to_numpy(),
            weights=yearly_rows["weight"],
        ),
        index=["mean_income", "sd", "N", "error"],
    )
)
# Turn the group key back into an ordinary "syear" column.
aggregated = aggregated.reset_index()

# Calculate confidence interval boundaries
aggregated = aggregated.assign(
    lower=aggregated["mean_income"] - aggregated["error"],
    upper=aggregated["mean_income"] + aggregated["error"],
)

In [None]:
# Example series holding two kinds of missing markers: numpy.nan and None.
raw_values = [2, 3, numpy.nan, None, 4]
series = pandas.Series(raw_values)
series

In [None]:
# Show a copy with every missing entry replaced; the result is not assigned,
# so `series` itself is left as it was.
series.fillna(value="missing")

In [None]:
# Element-wise: is the value greater than, less than, or equal to zero?
series.gt(0) | series.lt(0) | series.eq(0)

In [None]:
# Display the series once more: the fillna call above was neither assigned
# nor done in place, so this is still the series as originally created.
series