In [1]:
import time

import pandas
import numpy
import math

from typing import List, Sized

from IPython.display import Markdown
from pandas.api.types import CategoricalDtype

# You can enrich your notebook with Markdown

In [2]:
# Load the mtcars data set from a Stata file in the current working directory.
mtcars = pandas.read_stata("./mtcars.dta")

In [4]:
# Keep only the cars whose `gear` value exceeds 3 and whose `am` flag equals 1.
has_higher_gear = mtcars["gear"] > 3
is_manual = mtcars["am"] == 1
higher_gear_manual_cars = mtcars.loc[has_higher_gear & is_manual]
higher_gear_manual_cars

Unnamed: 0,name,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
17,Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
18,Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
19,Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
25,Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,1,4,1
26,Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,1,5,2
27,Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
28,Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,0,1,5,4


In [5]:
# Select every row whose mpg equals the maximum (ties would all be listed)
# and show only the name and mpg columns.
is_top_mpg = higher_gear_manual_cars["mpg"] == higher_gear_manual_cars["mpg"].max()
higher_gear_manual_cars.loc[is_top_mpg, ["name", "mpg"]]

Unnamed: 0,name,mpg
19,Toyota Corolla,33.9


In [6]:
# Same lookup via idxmax, which yields the index label of the first maximum.
# Wrapping the label in a list keeps the result a DataFrame instead of a Series.
top_mpg_label = higher_gear_manual_cars["mpg"].idxmax()
higher_gear_manual_cars.loc[[top_mpg_label], ["name", "mpg"]]

Unnamed: 0,name,mpg
19,Toyota Corolla,33.9


In [7]:
# Look up the names of the cars with the lowest and highest mpg by index label,
# then render them into a Markdown snippet.
mpg_by_car = higher_gear_manual_cars["mpg"]
lowest_mpg_car = higher_gear_manual_cars.loc[mpg_by_car.idxmin(), "name"]
highest_mpg_car = higher_gear_manual_cars.loc[mpg_by_car.idxmax(), "name"]

mpg_summary_template = """
# Miles per gallon
- The {} is the car with the lowest miles per gallon value.
- The {} is the car with the highest miles per gallon value.
"""
Markdown(mpg_summary_template.format(lowest_mpg_car, highest_mpg_car))



# Miles per gallon
- The Maserati Bora is the car with the lowest miles per gallon value.
- The Toyota Corolla is the car with the highest miles per gallon value.


In [8]:
# Distinct gear values in ascending order, and the mean mpg for every
# (gear, am) combination present in the data.
gear_values = sorted(mtcars["gear"].unique())
mean_mpg_by_gear_and_am = mtcars[["gear", "am", "mpg"]].groupby(["gear", "am"]).mean()

gear_summary_template = """Existing gear shift ratios: `{}`

Mean miles per gallon for the different gear ratios and transmissions:
```
{}
```
"""
Markdown(gear_summary_template.format(gear_values, mean_mpg_by_gear_and_am))

Existing gear shift ratios: `[3, 4, 5]`

Mean miles per gallon for the different gear ratios and transmissions:
```
               mpg
gear am           
3    0   16.106667
4    0   21.050000
     1   26.275000
5    1   21.380000
```


In [9]:
# Inspect how the `am` column is stored (the recorded output shows a plain integer dtype).
mtcars["am"].dtype

dtype('int32')

In [None]:
# Interpret the integer codes in `am` as labelled categories
# (code 0 -> "automatic", code 1 -> "manual") and inspect the resulting dtype.
transmission_labels = ["automatic", "manual"]
transmission = pandas.Categorical.from_codes(mtcars["am"], categories=transmission_labels)
pandas.Series(transmission).dtype

# NumPy

In [12]:

# Compare NumPy's vectorised mean with a pure-Python mean on ten million integers.
# numpy.zeros is used instead of numpy.empty: numpy.empty returns uninitialised
# memory, so the printed means would be arbitrary and differ between runs.
array = numpy.zeros(10000000, dtype=numpy.int64)

# time.perf_counter is the clock intended for measuring short intervals;
# time.time can jump when the system clock is adjusted.
start = time.perf_counter()
print("NumPy value:", array.mean())
end = time.perf_counter()
print("NumPy time:", end - start)

# Note: this timing also includes converting the array into a list of Python ints.
start = time.perf_counter()
python_list = [int(item) for item in array]
print("Vanilla Python value:", sum(python_list)/len(array))
end = time.perf_counter()
print("Vanilla Python time:", end - start)


NumPy value: 0.0
NumPy time: 0.012920618057250977
Vanilla Python value: 0.0
Vanilla Python time: 1.0997698307037354


In [20]:
# Convert the DataFrame into a two-dimensional NumPy array (one row per car,
# one column per variable) and display it.
numpy_mtcars = mtcars.to_numpy()
numpy_mtcars

array([['Mazda RX4', 21.0, 6, 160.0, 110, 3.9, 2.62, 16.46, 0, 1, 4, 4],
       ['Mazda RX4 Wag', 21.0, 6, 160.0, 110, 3.9, 2.875, 17.02, 0, 1, 4,
        4],
       ['Datsun 710', 22.8, 4, 108.0, 93, 3.85, 2.32, 18.61, 1, 1, 4, 1],
       ['Hornet 4 Drive', 21.4, 6, 258.0, 110, 3.08, 3.215, 19.44, 1, 0,
        3, 1],
       ['Hornet Sportabout', 18.7, 8, 360.0, 175, 3.15, 3.44, 17.02, 0,
        0, 3, 2],
       ['Valiant', 18.1, 6, 225.0, 105, 2.76, 3.46, 20.22, 1, 0, 3, 1],
       ['Duster 360', 14.3, 8, 360.0, 245, 3.21, 3.57, 15.84, 0, 0, 3, 4],
       ['Merc 240D', 24.4, 4, 146.7, 62, 3.69, 3.19, 20.0, 1, 0, 4, 2],
       ['Merc 230', 22.8, 4, 140.8, 95, 3.92, 3.15, 22.9, 1, 0, 4, 2],
       ['Merc 280', 19.2, 6, 167.6, 123, 3.92, 3.44, 18.3, 1, 0, 4, 4],
       ['Merc 280C', 17.8, 6, 167.6, 123, 3.92, 3.44, 18.9, 1, 0, 4, 4],
       ['Merc 450SE', 16.4, 8, 275.8, 180, 3.07, 4.07, 17.4, 0, 0, 3, 3],
       ['Merc 450SL', 17.3, 8, 275.8, 180, 3.07, 3.73, 17.6, 0, 0, 3, 3],
      

In [16]:
# Number of array dimensions (axes).
numpy_mtcars.ndim

2

In [17]:
# Total number of elements in the array (rows times columns).
numpy_mtcars.size

384

In [18]:
# Length of each dimension as a tuple: (rows, columns).
numpy_mtcars.shape

(32, 12)

In [19]:
# Rotate the array by 90 degrees. numpy.rot90 rotates counter-clockwise for
# positive k, so k=-1 rotates clockwise: each original column becomes a row
# with the original row order reversed.
numpy.rot90(numpy_mtcars, k=-1)

array([['Volvo 142E', 'Maserati Bora', 'Ferrari Dino', 'Ford Pantera L',
        'Lotus Europa', 'Porsche 914-2', 'Fiat X1-9', 'Pontiac Firebird',
        'Camaro Z28', 'AMC Javelin', 'Dodge Challenger', 'Toyota Corona',
        'Toyota Corolla', 'Honda Civic', 'Fiat 128', 'Chrysler Imperial',
        'Lincoln Continental', 'Cadillac Fleetwood', 'Merc 450SLC',
        'Merc 450SL', 'Merc 450SE', 'Merc 280C', 'Merc 280', 'Merc 230',
        'Merc 240D', 'Duster 360', 'Valiant', 'Hornet Sportabout',
        'Hornet 4 Drive', 'Datsun 710', 'Mazda RX4 Wag', 'Mazda RX4'],
       [21.4, 15.0, 19.7, 15.8, 30.4, 26.0, 27.3, 19.2, 13.3, 15.2, 15.5,
        21.5, 33.9, 30.4, 32.4, 14.7, 10.4, 10.4, 15.2, 17.3, 16.4, 17.8,
        19.2, 22.8, 24.4, 14.3, 18.1, 18.7, 21.4, 22.8, 21.0, 21.0],
       [4, 8, 6, 8, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 6,
        6, 4, 4, 8, 6, 8, 6, 4, 6, 6],
       [121.0, 301.0, 145.0, 351.0, 95.1, 120.3, 79.0, 400.0, 350.0,
        304.0, 318.0, 120.1

In [22]:
def weighted_avg_and_std(
    values: Sized, weights: Sized, z_score: float = 1.96
) -> List[float]:
    """Return the weighted mean, weighted standard deviation, sample size and error margin.

    Args:
        values: Observations, either one-dimensional or a single column of
            shape ``(n, 1)`` (e.g. the result of ``frame[[column]].to_numpy()``).
        weights: One non-negative weight per observation.
        z_score: Multiplier applied to the standard error. The default of 1.96
            corresponds to a 95 % interval under a normal approximation.

    Returns:
        ``[average, sd, n, error]`` with ``error = z_score * sd / sqrt(n)``.

    Raises:
        ValueError: If ``values`` and ``weights`` differ in length.
        ZeroDivisionError: Raised by ``numpy.average`` when the weights sum to zero.
    """
    # Flatten both inputs so that 1-D sequences and (n, 1) columns are handled
    # alike and every intermediate result is a true scalar.
    flat_values = numpy.asarray(values, dtype=float).reshape(-1)
    flat_weights = numpy.asarray(weights, dtype=float).reshape(-1)
    if flat_values.size != flat_weights.size:
        raise ValueError(
            "values and weights must have the same length, got {} and {}".format(
                flat_values.size, flat_weights.size
            )
        )

    count = int(flat_values.size)
    average = float(numpy.average(flat_values, weights=flat_weights))
    # Fast and numerically precise:
    variance = float(numpy.average((flat_values - average) ** 2, weights=flat_weights))
    sd = math.sqrt(variance)
    error = z_score * sd / math.sqrt(count)
    return [average, sd, count, error]

In [23]:
# Synthetic panel: 2000 person ids observed once per survey year 1984-2020.
ids = ["{:04d}".format(person_id) for person_id in range(1, 2001)]
years = range(1984, 2021)

# Build the whole person-by-year table in one step. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it on every
# iteration. Rows are ordered year by year, with all ids inside each year.
income_table = pandas.DataFrame(
    {
        "pid": ids * len(years),
        "syear": numpy.repeat(list(years), len(ids)),
    }
)

# Fixed seed so the simulated incomes and weights are reproducible.
random_generator = numpy.random.default_rng(42)
# Integer incomes in [1000, 10000) and survey weights in [1, 2).
income_table["income"] = random_generator.integers(low=1000, high=10000, size=len(income_table))
income_table["weight"] = random_generator.uniform(low=1, high=2, size=len(income_table))

In [24]:

# Group the income records by survey year, keeping only the columns used below.
income_grouped_by = income_table[["syear", "income", "weight"]].groupby(["syear"])


def _summarise_year(data_frame):
    """Return the weighted income statistics of one year's rows as a labelled Series."""
    year_stats = weighted_avg_and_std(
        data_frame[["income"]].to_numpy(), weights=data_frame["weight"]
    )
    return pandas.Series(year_stats, ["mean_income", "sd", "N", "error"])


# One row per year with the weighted mean, standard deviation, size and error margin.
aggregated = income_grouped_by.apply(_summarise_year).reset_index()
# Confidence interval boundaries: mean minus / plus the error margin.
aggregated = aggregated.assign(
    lower=aggregated["mean_income"] - aggregated["error"],
    upper=aggregated["mean_income"] + aggregated["error"],
)


In [None]:
# Small Series containing two different missing-value markers
# (numpy.nan and None); display it to see how pandas represents them.
series = pandas.Series([2, 3, numpy.nan, None, 4])
series

In [None]:
# Show a copy with the missing entries replaced by the string "missing";
# the result is not assigned, so `series` itself stays unchanged.
series.fillna("missing")

In [None]:
# Element-wise test whether a value is greater than, less than or equal to zero.
# NaN compares as False in all three cases, so missing entries yield False.
(series > 0) | (series < 0) | (series == 0)

In [None]:
# Display `series` again; none of the preceding cells reassigned it.
series