In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import yaml
from datetime import datetime
import plotly.express as px

In [None]:
# Load the rectangle dataset from disk and display it.
rectangle = pd.read_csv('data/rectangle_data.csv')
rectangle

In [None]:
# Center every column by subtracting its own mean, then preview ten rows.
column_means = rectangle.mean(axis=0)
X = rectangle - column_means
X.head(10)

In [None]:
# Divide each column by its standard deviation (np.std uses ddof=0 by default).
column_std = np.std(X, axis=0)
X = X / column_std

In [None]:
# Reduced SVD of X: U holds left singular vectors, S the singular values,
# Vt the right singular vectors as rows.
U, S, Vt = np.linalg.svd(X, full_matrices = False)

In [None]:
# Report the shape of each SVD factor.
for label, factor in (("Shape of U", U), ("Shape of S", S), ("Shape of Vt", Vt)):
  print(label, factor.shape)

In [None]:
# Expand the vector of singular values into a square diagonal matrix and show it.
Sm = np.diag(S)
Sm

In [None]:
# Check whether the singular value at index 3 is numerically zero.
np.isclose(S[3], 0)

In [None]:
# Singular values rounded to 5 decimal places.
S.round(5)

In [None]:
# Diagonal matrix of singular values, rounded to 3 decimals, rendered as a table.
pd.DataFrame(np.round(np.diag(S), 3))

In [None]:
# Squared singular values divided by the number of rows of X, rounded to 3 decimals.
pd.DataFrame(np.round(S**2 / X.shape[0], 3))

In [None]:
# Two-component scores: the first two columns of U scaled by the first two
# singular values.
leading_singular = np.diag(S[:2])
Z = U[:, :2] @ leading_singular
pd.DataFrame(Z).head()

In [None]:
# Two-component scores computed by projecting X onto the first two rows of Vt.
first_two_directions = Vt[:2, :].T
Z = X.to_numpy() @ first_two_directions
pd.DataFrame(Z).head()

In [None]:
# Scatter plot of the first score column against the second.
px.scatter(x=Z[:,0], y=Z[:,1], render_mode="svg")

In [None]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on X and preview the first five transformed rows.
pca = PCA(n_components=2)
pd.DataFrame(pca.fit_transform(X)).head(5)

In [None]:
# Preview the SVD-based scores Z.
pd.DataFrame(Z).head()

In [None]:
# Refit the PCA object on X and preview the first five transformed rows.
pd.DataFrame(pca.fit_transform(X)).head(5)

In [None]:
# Covariance matrix between the columns of Z (rowvar=False treats columns as variables).
pd.DataFrame(np.cov(Z, rowvar=False))

In [None]:
# Preview the original (unscaled) rectangle data.
rectangle.head()

In [None]:
# Rank-k approximation of the rectangle data, plotted against the original points.
k = 2
# The left singular vectors must be bound to U because the next statement reads U.
U, S, Vt = np.linalg.svd(X, full_matrices = False)

# Scores on the first k components.
Z = U[:, :k] @ np.diag(S[:k])

# Map the scores back onto the original columns using the first k rows of Vt.
rectangle_hat = pd.DataFrame(Z @ Vt[:k, :], columns = rectangle.columns)

# Rescale by the column standard deviations and add back the column means of
# the original data.
rectangle_hat = rectangle_hat * np.std(rectangle, axis=0) + np.mean(rectangle, axis = 0)

fig = px.scatter_3d(rectangle, x="width", y="height", z="area",
                    width=800, height=600)
fig.add_scatter3d(x=rectangle_hat["width"],
                  y = rectangle_hat["height"],
                  z=rectangle_hat["area"],
                  mode="markers", name = "approximation")

In [None]:
# Read the votes table, storing the roll call identifier as a string column.
votes = pd.read_csv('data/votes.csv').astype({"roll call": str})
votes

In [None]:
def was_yes(s):
  """Return 1 when the first entry of `s` equals "Yes", otherwise 0."""
  first_vote = s.iloc[0]
  if first_vote == "Yes":
    return 1
  return 0
# One row per member and one column per roll call; each cell is was_yes
# applied to that member's vote entries, with 0 filled in where none exist.
vote_pivot = pd.pivot_table(
    votes,
    index='member', columns='roll call', values='vote',
    aggfunc=was_yes, fill_value=0,
)
print(vote_pivot.shape)
vote_pivot.head()

In [None]:
# Subtract each roll call's column mean so every column is centered at zero.
roll_call_means = vote_pivot.mean(axis=0)
vote_pivot_centered = vote_pivot - roll_call_means
vote_pivot_centered

In [None]:
# (rows, columns) of the centered vote matrix.
vote_pivot_centered.shape

In [None]:
# Reduced SVD of the centered vote matrix.
u, s, vt = np.linalg.svd(vote_pivot_centered, full_matrices = False)

In [None]:
# Report the shape of each factor of the vote-matrix SVD.
for label, factor in (("u.shape", u), ("s.shape", s), ("vt.shape", vt)):
  print(label, factor.shape)

In [None]:
# Store the first three component scores (columns of u scaled by s) per member,
# then plot the first two against each other.
vote_2d = pd.DataFrame(index = vote_pivot_centered.index)
vote_2d[["z1", "z2", "z3"]] = (u * s)[:, :3]
px.scatter(vote_2d, x='z1', y='z2', title='Vote Data', width=800, height=600, render_mode="svg")

In [None]:
# Each component's share of the summed squared singular values, rounded to 2 decimals.
np.round(s**2 / sum(s**2), 2)

In [None]:
# Line plot of each component's share of the summed squared singular values.
variance_share = s**2 / sum(s**2)
fig = px.line(y=variance_share, title='Variance Explained',
              width=700, height=400, markers=True)
fig.update_xaxes(title_text='Principal Component')
fig.update_yaxes(title_text='Proportion of Variance Explained')

In [None]:
# 3-D scatter of the first three component scores, drawn with small markers.
fig = px.scatter_3d(vote_2d, x='z1', y='z2', z='z3', title='Vote Data', width=800, height=600)
fig.update_traces(marker=dict(size=5))

In [None]:
# Parse the legislators YAML; the context manager closes the file handle as
# soon as loading finishes.
with open('data/legislators-2019.yaml') as legislators_file:
  legislators_data = yaml.safe_load(legislators_file)

def to_date(s):
  """Parse a 'YYYY-MM-DD' string into a datetime object."""
  iso_day_format = '%Y-%m-%d'
  return datetime.strptime(s, iso_day_format)

# One row per legislator; state, chamber and party come from the last entry
# of that legislator's `terms` list.
legs = pd.DataFrame(
    columns=['leg_id', 'first', 'last', 'gender', 'state', 'chamber', 'party', 'birthday'],
    data=[[x['id']['bioguide'],
           x['name']['first'],
           x['name']['last'],
           x['bio']['gender'],
           x['terms'][-1]['state'],
           x['terms'][-1]['type'],
           x['terms'][-1]['party'],
           to_date(x['bio']['birthday'])] for x in legislators_data])
# Age is the difference in calendar years relative to 2024.
legs['age'] = 2024 - legs['birthday'].dt.year
# Display the table keyed and ordered by leg_id. `legs` itself keeps leg_id as
# an ordinary column because other cells call legs.set_index('leg_id') on it.
legs.set_index("leg_id").sort_index()

In [None]:
# Attach legislator attributes by matching vote_2d's index against leg_id,
# then drop rows containing any missing value.
# NOTE(review): this overwrites vote_2d, so re-running the cell on the already
# joined frame will presumably fail on overlapping column names — confirm.
vote_2d = vote_2d.join(legs.set_index('leg_id')).dropna()

In [None]:
# z1 vs z2 per legislator: colour encodes party, marker symbol encodes gender,
# marker size encodes age.
px.scatter(vote_2d, x='z1', y='z2', color='party', symbol="gender", size='age',
           title='Vote Data', width=800, height=600, size_max=10,
           opacity=0.7,
           # Keys must equal the values of the party column exactly for a colour to apply.
           color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"},
           hover_data=['first', 'last', 'state', 'party', 'gender', 'age'],
           render_mode="svg")

In [None]:
# Add Gaussian noise (mean 0, std 0.1) to each score column so overlapping
# points separate; the fixed seed makes the noise repeatable.
np.random.seed(42)
for source, target in (("z1", "z1_jittered"), ("z2", "z2_jittered"), ("z3", "z3_jittered")):
  vote_2d[target] = vote_2d[source] + np.random.normal(0, 0.1, len(vote_2d))

In [None]:
# Same party/gender/age encoding as before, drawn on the jittered z1/z2 columns.
px.scatter(vote_2d, x='z1_jittered', y='z2_jittered', color='party', symbol="gender", size='age',
           title='Vote Data', width =800, height=600, size_max=10,
           opacity=0.7,
           color_discrete_map={'Democrat': 'blue', 'Republican': 'red', "Independent": "green"},
           hover_data=['first', 'last', 'state', 'party', 'gender', 'age'])

In [None]:
# 3-D scatter of the jittered scores: colour encodes party, symbol gender, size age.
px.scatter_3d(
    vote_2d, x='z1_jittered', y='z2_jittered', z='z3_jittered',
    color='party', symbol="gender", size='age',
    title='Vote Data', width=800, height=600, size_max=10,
    opacity=0.7,
    # Keys must equal the values of the party column exactly for a colour to apply.
    color_discrete_map= {'Democrat':'blue', 'Republican':'red', "Independent":"green"},
    hover_data=['first', 'last', 'state', 'party', 'gender', 'age']
)

In [None]:
# Count, per member, the rows of `votes` whose vote is "Yes" or "No"; the
# assignment aligns those counts to vote_2d by index. Rows left with a
# missing value are then removed.
yes_no_votes = votes[votes["vote"].isin(["Yes", "No"])]
vote_2d["num votes"] = yes_no_votes.groupby("member").size()
vote_2d.dropna(inplace=True)
vote_2d.head()

In [None]:
# Histogram of the per-member vote counts on a logarithmic x axis.
px.histogram(vote_2d, x="num votes", log_x=True, width=800, height=600)

In [None]:
# Jittered z1 vs z2 with marker size encoding the number of Yes/No votes cast.
px.scatter(vote_2d, x='z1_jittered', y='z2_jittered', color='party', symbol="gender", size='num votes',
           title='Vote Data (Size is Number of Votes)', width=800, height=600, size_max=10,
           opacity=0.7,
           # Keys must equal the values of the party column exactly for a colour to apply.
           color_discrete_map={'Democrat': 'blue', 'Republican':'red', "Independent": "green"},
           hover_data=['first', 'last', 'state', 'party', 'gender', 'age'],
           render_mode="svg")

In [None]:
# Bar chart of the first row of vt, one bar per roll call column.
fig_eig = px.bar(x=vote_pivot_centered.columns, y=vt[0,:])
fig_eig

In [None]:
# Mean centered vote per party for every roll call, reshaped to long form
# (one row per roll call / party pair) for faceted plotting.
party_line_votes = (
    vote_pivot_centered.join(legs.set_index("leg_id")['party'])
    .groupby("party").mean()
    .T.reset_index()
    .rename(columns={"index":"call"})
    .melt("call")
)
fig = px.bar(
    party_line_votes,
    x="call", y = "value", facet_row="party", color="party",
    # Keys must equal the values of the party column exactly for a colour to apply.
    color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent":"green"})
# Shorten facet titles from "party=<name>" to "<name>".
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

In [None]:
# Re-display the bar chart of the first row of vt.
fig_eig

In [None]:
# Scale the first two rows of vt by the square root of their singular values
# and index the result by roll call.
pc1_loading = np.sqrt(s[0]) * vt[0, :]
pc2_loading = np.sqrt(s[1]) * vt[1, :]
loadings = pd.DataFrame(
    {"pc1": pc1_loading, "pc2": pc2_loading},
    index=vote_pivot_centered.columns,
)
loadings.head()

In [None]:
# Biplot: legislators as points in the jittered z1/z2 plane, overlaid with one
# arrow from the origin per roll call loading.
fig = px.scatter(
    vote_2d, x='z1_jittered', y='z2_jittered', color='party', symbol="gender", size='num votes',
    title='Biplot', width=800, height=600, size_max=10,
    opacity=0.7,
    # Keys must equal the values of the party column exactly for a colour to apply.
    color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent":"green"},
    hover_data=['first', 'last', 'state', 'party', 'gender', 'age'],
    render_mode="svg")

# Only the first 50 loadings are drawn.
for (call, pc1, pc2) in loadings.head(50).itertuples():
  fig.add_scatter(x=[0, pc1], y=[0, pc2], name=call,
                  mode='lines+markers', textposition='top right',
                  marker= dict(size=10, symbol="arrow-bar-up", angleref="previous"))
fig

In [None]:
# NOTE(review): `fashion_mnist` is imported here rather than in the import
# cell; presumably a local helper module — confirm it is on the path.
import fashion_mnist

# Unpack the (images, labels) pairs for the train and test splits.
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
print("Training images", train_images.shape)
print("Test images", test_images.shape)

In [None]:
# Class names listed in label order; class_dict maps each integer position to its name.
class_names = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]
class_dict = dict(enumerate(class_names))

In [None]:
# Draw a reproducible random sample of n training images without replacement.
rng = np.random.default_rng(42)
n = 5000
sample_idx = rng.choice(np.arange(len(train_images)), size=n, replace=False)

# Negate the pixel values of the sampled images.
img_mat = -1. * train_images[sample_idx]
# One row per sampled image: nested-list pixels, integer label, class name.
images = pd.DataFrame({"images": img_mat.tolist(),
                       "labels": train_labels[sample_idx],
                       "class": [class_dict[x] for x in train_labels[sample_idx]]})
images.head()

In [None]:
def show_images(images, ncols=5, max_images=30):
  """Render up to `max_images` images as a grid of facets titled by class.

  Args:
    images: DataFrame with an 'images' column (presumably equally sized 2-D
      pixel grids stored as nested lists — confirm) and a 'class' column.
    ncols: number of facets per row.
    max_images: upper bound on how many leading rows are drawn.

  Returns:
    The plotly figure.
  """
  shown = images.head(max_images)
  img_mat = np.array(shown['images'].to_list())
  # Size the figure from the rows actually drawn, not the unclipped input.
  n_rows = int(np.ceil(len(shown)/ncols))
  fig = px.imshow(img_mat, color_continuous_scale='gray',
                  facet_col=0, facet_col_wrap=ncols,
                  height = 220*n_rows)
  fig.update_layout(coloraxis_showscale=False)
  # Facet titles end in "=<position>"; replace each with that row's class name.
  fig.for_each_annotation(
      lambda a: a.update(text=shown.iloc[int(a.text.split("=")[-1])]['class']))
  return fig

# Show the first 20 sampled images.
show_images(images.head(20))

In [None]:
# Show two randomly chosen images per class.
# NOTE(review): .sample() is called without random_state, so the selection
# presumably changes between runs — confirm whether that is intended.
show_images(images.groupby('class', as_index=False).sample(2), ncols=6)

In [None]:
# Stack the per-row image lists into one array. This rebinds X, which earlier
# cells used for the rectangle data.
X = np.array(images['images'].to_list())
X.shape

In [None]:
# Flatten every image into a single row, keeping one row per image.
n_images = len(X)
X = X.reshape(n_images, -1)
X.shape

In [None]:
# Center each column by subtracting its mean across rows.
X = X - X.mean(axis=0)

In [None]:
from sklearn.decomposition import PCA
# Fit a PCA with n_comps components on X; the fitted estimator is the cell output.
n_comps = 50
pca = PCA(n_components=n_comps)
pca.fit(X)

In [None]:
# Explained variance ratio of each fitted component, expressed as a percentage.
px.line(y=pca.explained_variance_ratio_ *100, markers=True)

In [None]:
# Store the first three PCA coordinates of every image as columns z1, z2, z3.
images[['z1', 'z2', 'z3']] = pca.transform(X)[:,:3]

In [None]:
# z1 vs z2 for every image, without colouring by class.
px.scatter(images, x='z1', y='z2', hover_data=['labels'], opacity=0.7,
           width=800, height=600, render_mode="svg")

In [None]:
# z1 vs z2 for every image, coloured by class name.
px.scatter(images, x='z1', y='z2', color='class', hover_data=['labels'], opacity=0.7,
           width=800, height=600, render_mode="svg")

In [None]:
# 3-D scatter of z1, z2, z3 coloured by class, drawn with small markers.
fig = px.scatter_3d(images, x='z1', y='z2', z='z3', color='class', hover_data=['labels'],
                    width=1000, height=600)
fig.update_traces(marker=dict(size=3))