In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.experimental import (
    enable_iterative_imputer,
)
from sklearn import (
    ensemble,
    impute,
    model_selection,    
    preprocessing,
    tree,
)

# Location of the Titanic passenger workbook (an .xls file served over HTTP).
url = "http://biostat.mc.vanderbilt.edu/" + "wiki/pub/Main/DataSets/titanic3.xls"

# Read the workbook straight from the URL; `df` is the frame used by later cells.
df = pd.read_excel(url)


In [2]:
# Toy frame for the scaling examples: "a" is evenly spaced, while "b" spans
# a much wider range (-100 .. 1000).
toy_columns = dict(
    a=range(5),
    b=[-100, -50, 0, 200, 1000],
)
X2 = pd.DataFrame(toy_columns)
X2

Unnamed: 0,a,b
0,0,-100
1,1,-50
2,2,0
3,3,200
4,4,1000


In [3]:
from sklearn import preprocessing
# Fit a standardizing scaler on X2 and keep it in `std` so that its learned
# statistics can be inspected afterwards; then show the scaled values.
std = preprocessing.StandardScaler().fit(X2)
std.transform(X2)

array([[-1.41421356, -0.75995002],
       [-0.70710678, -0.63737744],
       [ 0.        , -0.51480485],
       [ 0.70710678, -0.02451452],
       [ 1.41421356,  1.93664683]])

In [4]:
# Statistics learned by the fitted scaler. Only the value of the last
# expression in a cell is displayed, so the recorded output is `var_`.
std.scale_
std.mean_
std.var_

array([2.000e+00, 1.664e+05])

In [5]:
# Standardize by hand: subtract the column mean, divide by the column
# standard deviation. Note: pandas' .std() defaults to the sample estimate
# (ddof=1), whereas sklearn's StandardScaler divides by the population
# standard deviation (ddof=0), so the values differ slightly from the
# scaler's output.
X_std = X2.sub(X2.mean()).div(X2.std())
X_std
X_std.mean()
X_std.std()

a    1.0
b    1.0
dtype: float64

In [7]:
# Work on a copy so that X2 itself is not modified by this cell.
X3 = X2.copy()
# NOTE(review): this import fails in the recorded run (no `fastai` package
# installed). `fastai.structured` appears to belong to the old fastai 0.7
# API - confirm which version this cell expects before re-running.
from fastai.structured import scale_vars
# presumably scales X3 in place, since the return value is discarded - verify
scale_vars(X3, mapper=None)
# Only the last expression (the column means) would be displayed.
X3.std()
X3.mean()

ModuleNotFoundError: No module named 'fastai'

In [8]:
from sklearn import preprocessing
# Fit a min-max scaler on X2 (kept in `mms`) and show X2 rescaled so that
# every column runs from 0 to 1.
mms = preprocessing.MinMaxScaler().fit(X2)
mms.transform(X2)

array([[0.        , 0.        ],
       [0.25      , 0.04545455],
       [0.5       , 0.09090909],
       [0.75      , 0.27272727],
       [1.        , 1.        ]])

In [9]:
# Min-max scaling by hand: shift each column by its minimum, then divide
# by the column's range (max - min).
col_min = X2.min()
(X2 - col_min) / (X2.max() - col_min)

Unnamed: 0,a,b
0,0.0,0.0
1,0.25,0.045455
2,0.5,0.090909
3,0.75,0.272727
4,1.0,1.0


In [10]:
# Small categorical frame (one row per person) used by the encoding examples.
X_cat = pd.DataFrame(
    [("Grzegorz", "Bas"), ("Paweł", "Gitara")],
    columns=["imię", "instrument"],
)
X_cat

Unnamed: 0,imię,instrument
0,Grzegorz,Bas
1,Paweł,Gitara


In [11]:
# One-hot encode every column of X_cat. drop_first=True omits the first
# level of each column, so the two two-level columns yield one indicator each.
pd.get_dummies(X_cat, drop_first=True)

Unnamed: 0,imię_Paweł,instrument_Gitara
0,0,0
1,1,1


In [14]:
import janitor as jn
# Column "imiona" packs several comma-separated names into a single cell;
# column "A" contains one missing value.
packed_names = ["Franek,Grzegorz", "Grzegorz", "Jerzy,Paweł"]
X_cat2 = pd.DataFrame({"A": [1, None, 3], "imiona": packed_names})
# Split the comma-separated "imiona" cells; per the recorded output this
# appends one 0/1 indicator column for each distinct name.
jn.expand_column(X_cat2, "imiona", sep=",")

Unnamed: 0,A,imiona,Franek,Grzegorz,Jerzy,Paweł
0,1.0,"Franek,Grzegorz",1,1,0,0
1,,Grzegorz,0,1,0,0
2,3.0,"Jerzy,Paweł",0,0,1,1


In [18]:
from sklearn import preprocessing
# Learn an integer code for every distinct first name (the encoder is kept
# in `lab`), then show the encoded column.
lab = preprocessing.LabelEncoder().fit(X_cat["imię"])
lab.transform(X_cat["imię"])

array([0, 1])

In [19]:
# Map integer codes back to the original labels using the fitted encoder.
lab.inverse_transform([1, 1, 0])

array(['Paweł', 'Paweł', 'Grzegorz'], dtype=object)

In [21]:
# Same idea with pandas only: convert to an ordered categorical and read its
# integer codes. Codes start at 0, so add 1 to obtain 1-based labels.
name_as_category = X_cat["imię"].astype("category").cat.as_ordered()
name_as_category.cat.codes + 1

0    1
1    2
dtype: int8

In [24]:
# Frequency encoding: replace each name by the number of times it occurs.
mapping = X_cat["imię"].value_counts()
X_cat["imię"].map(mapping)

0    1
1    1
Name: imię, dtype: int64

In [25]:
from collections import Counter
# Module-level tally of three-character substrings, filled by `triples`.
c = Counter()
def triples(val):
    """Count every three-character substring of ``val`` in the counter ``c``.

    Only complete windows are counted: a string of length n contributes
    n - 2 substrings, and strings shorter than three characters contribute
    none. (Previously the loop ran to ``len(val)``, so the final two slices
    of every string were 2- and 1-character fragments that were tallied
    alongside the real triples.)

    The function works purely through its side effect on ``c`` and returns
    None, which is why it can be passed to ``Series.apply``.
    """
    # range() with a non-positive stop is empty, so short strings are a no-op.
    c.update(val[i : i + 3] for i in range(len(val) - 2))
# Feed every passenger name through `triples` (called only for its side
# effect on the counter `c`), then list the ten most frequent entries.
for passenger_name in df["name"]:
    triples(passenger_name)
c.most_common(10)

[(', M', 1282),
 (' Mr', 954),
 ('r. ', 830),
 ('Mr.', 757),
 ('s. ', 460),
 ('n, ', 320),
 (' Mi', 283),
 ('iss', 261),
 ('ss.', 261),
 ('Mis', 260)]

In [26]:
# Pull the honorific out of each name: the first run of letters that is
# immediately followed by a period. The pattern is a raw string because
# "\." in a normal string literal is an invalid escape sequence (a
# DeprecationWarning today, a SyntaxWarning on Python 3.12+).
df.name.str.extract(
    r"([A-Za-z]+)\.", expand=False
).head()

0      Miss
1    Master
2      Miss
3        Mr
4       Mrs
Name: name, dtype: object

In [27]:
# Frequency of each extracted honorific. The pattern is a raw string because
# "\." in a normal string literal is an invalid escape sequence (a
# DeprecationWarning today, a SyntaxWarning on Python 3.12+).
df.name.str.extract(
    r"([A-Za-z]+)\.", expand=False
).value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Ms            2
Major         2
Mlle          2
Countess      1
Jonkheer      1
Dona          1
Sir           1
Capt          1
Lady          1
Don           1
Mme           1
Name: name, dtype: int64

In [28]:
import category_encoders as ce
# Hash-encode the categorical columns of X_cat with category_encoders.
# NOTE(review): the recorded output shows the input columns unchanged
# rather than hashed feature columns - confirm against the installed
# category_encoders version.
he = ce.HashingEncoder(verbose=1)
he.fit_transform(X_cat)

Unnamed: 0,imię,instrument
0,Grzegorz,Bas
1,Paweł,Gitara


In [29]:
# Sample frame with an ordered "size" column for the ordinal-encoding example.
size_df = pd.DataFrame(
    [
        ("Franek", "mała"),
        ("Jerzy", "średnia"),
        ("Marek", "XXL"),
    ],
    columns=["imię", "wielkość"],
)
# Explicit rank for every known size label. "XXL" from size_df is not
# listed here; in the recorded output it is encoded as -1.0.
size_order = {
    "mała": 1,
    "średnia": 2,
    "duża": 3,
}
ore = ce.OrdinalEncoder(
    mapping=[{"col": "wielkość", "mapping": size_order}]
)
ore.fit_transform(size_df)

Unnamed: 0,imię,wielkość
0,Franek,1.0
1,Jerzy,2.0
2,Marek,-1.0


In [30]:
def get_title(df):
    """Return the honorific found in each entry of ``df.name``.

    The honorific is the first run of ASCII letters that is immediately
    followed by a period (for example "Mr" in "Braund, Mr. Owen").

    Parameters
    ----------
    df : DataFrame with a string column called ``name``.

    Returns
    -------
    Series aligned with ``df``; entries with no match are missing (NaN).
    """
    # Raw string: "\." in a normal literal is an invalid escape sequence
    # (DeprecationWarning today, SyntaxWarning on Python 3.12+).
    return df.name.str.extract(
        r"([A-Za-z]+)\.", expand=False
    )
# Target-encode the honorific: add a "Title" column derived by `get_title`,
# fit the encoder against the `survived` target, and preview the result.
te = ce.TargetEncoder(cols="Title")
titled = df.assign(Title=get_title)
encoded = te.fit_transform(titled, df.survived)
encoded["Title"].head()

0    0.676923
1    0.508197
2    0.676923
3    0.162483
4    0.786802
Name: Title, dtype: float64

In [31]:
# NOTE(review): this import fails in the recorded run (no `fastai` package
# installed) - confirm which fastai version provides this module path.
from fastai.tabular.transform import (
    add_datepart,
)
# Two sample timestamps, parsed from differently formatted date strings.
dates = pd.DataFrame(
    {
        "A": pd.to_datetime(
            ["9/17/2001", "Jan 1, 2002"]
        )
    }
)
# presumably expands column "A" in place into date-part feature columns,
# since the return value is discarded - verify
add_datepart(dates, "A")
# Transposed so that each generated column would show as a row.
dates.T

ModuleNotFoundError: No module named 'fastai'

In [32]:
from pandas.api.types import is_numeric_dtype
def fix_missing(df, col, name, na_dict):
    """Fill missing values of a numeric column and flag the filled rows.

    Nothing happens unless ``col`` is numeric and either contains missing
    values or already has an entry in ``na_dict``. Otherwise ``df`` is
    modified in place:

    * ``df[name + "_na"]`` becomes a boolean mask of the missing entries,
    * ``df[name]`` becomes ``col`` with the gaps filled,
    * ``na_dict[name]`` records the fill value that was used.

    Parameters
    ----------
    df : DataFrame that receives the new and updated columns.
    col : Series holding the values to inspect and fill.
    name : column label under which results are stored in ``df``.
    na_dict : dict of fill values by column label; an existing entry takes
        precedence over the median of ``col``.

    Returns
    -------
    The same ``na_dict`` object, updated when a fill was applied.
    """
    if not is_numeric_dtype(col):
        return na_dict

    missing_mask = pd.isnull(col)
    has_stored_filler = name in na_dict
    if not (missing_mask.sum() or has_stored_filler):
        return na_dict

    # A stored fill value wins, so the same value can be re-applied to
    # another frame by passing the dict returned from an earlier call.
    fill_value = na_dict[name] if has_stored_filler else col.median()
    df[name + "_na"] = missing_mask
    df[name] = col.fillna(fill_value)
    na_dict[name] = fill_value
    return na_dict
# Demonstration: column "A" has one gap, which is filled with the median and
# flagged in a new "A_na" column.
data = pd.DataFrame(dict(A=[0, None, 5, 100]))
fix_missing(data, data["A"], "A", na_dict={})
data

Unnamed: 0,A,A_na
0,0.0,False
1,5.0,True
2,5.0,False
3,100.0,False


In [33]:
# The same result with plain pandas: flag the missing entries first, then
# fill them with the column median.
data = pd.DataFrame({"A": [0, None, 5, 100]}).assign(
    A_na=lambda frame: frame["A"].isnull(),
    A=lambda frame: frame["A"].fillna(frame["A"].median()),
)

In [34]:
# Aggregate only the numeric columns. Text columns (e.g. "name") have no
# meaningful mean, and pandas >= 2.0 raises TypeError for them instead of
# silently dropping them from the result as older versions did.
numeric_cols = list(df.select_dtypes(include="number").columns)
agg = (
    df.groupby("cabin")[numeric_cols]
    .agg(["min", "max", "mean", "sum"])
    .reset_index()
)
# Flatten the (column, statistic) MultiIndex into names such as "age_mean".
# The grouping key's second level is empty, so strip its trailing "_".
agg.columns = [
    "_".join(levels).strip("_")
    for levels in agg.columns.values
]
# Default (inner) merge: rows whose cabin is missing have no group in `agg`
# (groupby drops NaN keys by default) and so do not appear in agg_df.
agg_df = df.merge(agg, on="cabin")