# Data viz ideas

## There's two main parts to this:
1) Dashboard building
2) Actual plots and such 
    - Most worn items per category 
    - Top colors worn
    - Parse data by season (winter/spring/summer/fall)

In [210]:
from dash import Dash, html, dcc, Input, Output
import altair as alt
import dash_bootstrap_components as dbc
import pandas as pd
import numpy as np
import datetime
import calmap
import sheworewhat as sww

alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [2]:
closet = sww.closet_df()

closet

Unnamed: 0,ID,Item,Category,Sub-Category,Color,Pattern,Brand,Bought,Cost,2023,Price,PrimaryC,Name
0,0,Turtleneck,Top,Sweater,Black,Plain,Zara,"Secondhand, Thrifted",cheap,No,,Black,0 Zara Turtleneck - Black
1,1,Tank,Top,Tanktop,"Black, Red, Gold",Feather,Plisse,"Secondhand, Thrifted",cheap,No,,Black,1 Plisse Tank - Black
2,2,Tank,Top,Tanktop,"Black, Tan",Leopard,Plisse,"Secondhand, Thrifted",cheap,No,15.0,Black,2 Plisse Tank - Black
3,3,Jeans,Bottom,Pants,Blue,Plain,Aerie,New,cheap,No,,Blue,3 Aerie Jeans - Blue
4,4,Shirt,Top,Shirt,"Black, White",Cheetah,Free People,"Secondhand, Depop",cheap,No,,Black,4 Free People Shirt - Black
...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,82,Christmas Tree Hoops,Accessory,Jewelry,Gold,Plain,No Brand,"Secondhand, Thrifted",cheap,No,,Gold,82 No Brand Christmas Tree Hoops - Gold
83,83,Square Hoops,Accessory,Jewelry,Gold,Plain,Tj Maxx,New,cheap,No,,Gold,83 Tj Maxx Square Hoops - Gold
84,84,Puffer,Outerwear,Coat,Green,Plain,Hollister,New,pricy,No,,Green,84 Hollister Puffer - Green
85,85,Tote Bag,Accessory,Bag,Green,Logo,Ubc,New,cheap,No,,Green,85 Ubc Tote Bag - Green


In [3]:
acc_df, bottom_df, fb_df, out_df, shoes_df, top_df = sww.closet_cat(closet)

acc_df.tail(5)

Unnamed: 0,ID,Item,Category,Sub-Category,Color,Pattern,Brand,Bought,Cost,2023,Price,PrimaryC,Name
81,81,Gold Hoops,Accessory,Jewelry,Gold,Plain,No Brand,"Secondhand, Gifted",pricy,No,,Gold,81 No Brand Gold Hoops - Gold
82,82,Christmas Tree Hoops,Accessory,Jewelry,Gold,Plain,No Brand,"Secondhand, Thrifted",cheap,No,,Gold,82 No Brand Christmas Tree Hoops - Gold
83,83,Square Hoops,Accessory,Jewelry,Gold,Plain,Tj Maxx,New,cheap,No,,Gold,83 Tj Maxx Square Hoops - Gold
85,85,Tote Bag,Accessory,Bag,Green,Logo,Ubc,New,cheap,No,,Green,85 Ubc Tote Bag - Green
86,86,Purse,Accessory,Bag,White,Vintage,No Brand,"Secondhand, Depop",cheap,No,,White,86 No Brand Purse - White


## Closet EDA
- What percentage of my closet is new vs. secondhand?

In [4]:
closet_count = closet.groupby(by="Bought").count()
closet_count

Unnamed: 0_level_0,ID,Item,Category,Sub-Category,Color,Pattern,Brand,Cost,2023,Price,PrimaryC,Name
Bought,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
New,43,43,43,43,43,43,43,43,43,0,43,43
"Secondhand, Depop",2,2,2,2,2,2,2,2,2,0,2,2
"Secondhand, Gifted",5,5,5,5,5,5,5,5,5,0,5,5
"Secondhand, Thrifted",37,37,37,37,37,37,37,37,37,3,37,37


In [5]:
closet_n = closet.groupby(["Bought"])[["Bought"]].agg('count').rename(columns={"Bought" : "N"}).reset_index()
closet_n["Percent"] = (closet_n['N'] / len(closet))
closet_n

Unnamed: 0,Bought,N,Percent
0,New,43,0.494253
1,"Secondhand, Depop",2,0.022989
2,"Secondhand, Gifted",5,0.057471
3,"Secondhand, Thrifted",37,0.425287


Should I do just secondhand? then breakdown the secondhand

In [6]:
closet_n = closet.groupby(["Bought"])[["Bought"]].agg('count').rename(columns={"Bought" : "N"}).reset_index()
closet_n["Percent"] = (closet_n['N'] / len(closet))

closet_n["Status"] = closet_n["Bought"].str.split(",").str[1]
closet_n = closet_n.replace(np.nan, "New")

closet_comp = alt.Chart(closet_n, title="Closet Composition"
                       ).mark_bar(color="Maroon"
                         ).encode(alt.X("Bought", axis=alt.Axis(labelAngle=-45), sort='-y'),
                                  alt.Y("Percent", axis=alt.Axis(format='%'),),
                                  alt.Tooltip("Percent", format=",.2f"),
                                  color=alt.condition(
                                    alt.datum.year == "New",
                                    alt.value('orange'),
                                    alt.value('maroon')   
                                    )
                                 )
                
closet_comp

In [7]:
closet_n = closet.groupby(["Bought"])[["Bought"]].agg('count').rename(columns={"Bought" : "N"}).reset_index()
closet_n["Percent"] = (closet_n['N'] / len(closet))

closet_n["Purchased"] = closet_n["Bought"].str.split(",").str[0]
closet_n["Status"] = closet_n["Bought"].str.split(",").str[1]
closet_n = closet_n.replace(np.nan, "New")

closet_n

Unnamed: 0,Bought,N,Percent,Purchased,Status
0,New,43,0.494253,New,New
1,"Secondhand, Depop",2,0.022989,Secondhand,Depop
2,"Secondhand, Gifted",5,0.057471,Secondhand,Gifted
3,"Secondhand, Thrifted",37,0.425287,Secondhand,Thrifted


In [8]:
closet_n.groupby(["Purchased"]).count()

Unnamed: 0_level_0,Bought,N,Percent,Status
Purchased,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
New,1,1,1,1
Secondhand,3,3,3,3


In [9]:
# new vs secondhand only plot
closet_comp = alt.Chart(closet_n, title="Closet Composition"
                       ).mark_bar(color="Maroon"
                         ).encode(alt.X("Bought", axis=alt.Axis(labelAngle=-45), sort='-y'),
                                  alt.Y("Percent", axis=alt.Axis(format='%'),),
                                  alt.Tooltip("Percent", format=",.2f"),
                                  color=alt.condition(
                                    alt.datum.year == "New",
                                    alt.value('orange'),
                                    alt.value('maroon')   
                                    )
                                 )
                
closet_comp

## clean and merge data
- need function to clean, merge, do counts and stuff for collected data

In [10]:
path = "../data/2023TestData.csv"

df = pd.read_csv(path)
df

Unnamed: 0,Timestamp,Date,Accessory,Shoes,Outerwear,Full Body,Bottom,Top,Top2,Accessory2,Accessory3,Accessory4,Note
0,12/21/2022 20:27:23,12/19/2022,10 Athleta Beanie - Tan,75 Adidas Tennis Shoe - Black,84 Hollister Puffer - Green,,24 Aerie Leggings - Green,16 Aerie Sports Bra - Green,50 Glossier Crewneck Sweater - Gray,85 Ubc Tote Bag - Green,,,
1,12/21/2022 20:28:00,12/20/2022,10 Athleta Beanie - Tan,75 Adidas Tennis Shoe - Black,84 Hollister Puffer - Green,,24 Aerie Leggings - Green,16 Aerie Sports Bra - Green,50 Glossier Crewneck Sweater - Gray,85 Ubc Tote Bag - Green,,,
2,12/21/2022 20:28:30,12/21/2022,10 Athleta Beanie - Tan,75 Adidas Tennis Shoe - Black,84 Hollister Puffer - Green,,26 Aerie Leggings - Black,17 Aerie Sports Bra - Black,50 Glossier Crewneck Sweater - Gray,85 Ubc Tote Bag - Green,,,
3,12/21/2022 20:30:40,12/22/2022,85 Ubc Tote Bag - Green,9 Blondo Boots - Black,84 Hollister Puffer - Green,,3 Aerie Jeans - Blue,2 Plisse Tank - Black,,81 No Brand Gold Hoops - Gold,,,
4,12/21/2022 20:33:50,12/23/2022,82 No Brand Christmas Tree Hoops - Gold,9 Blondo Boots - Black,84 Hollister Puffer - Green,57 American Eagle Maxi Dress - Green,,6 Express Body Suit - White,46 No Brand Vest - White,,,,


We don't need the timestamp, its redundant and easier to just use date

In [11]:
df.iloc[:, 1:]

Unnamed: 0,Date,Accessory,Shoes,Outerwear,Full Body,Bottom,Top,Top2,Accessory2,Accessory3,Accessory4,Note
0,12/19/2022,10 Athleta Beanie - Tan,75 Adidas Tennis Shoe - Black,84 Hollister Puffer - Green,,24 Aerie Leggings - Green,16 Aerie Sports Bra - Green,50 Glossier Crewneck Sweater - Gray,85 Ubc Tote Bag - Green,,,
1,12/20/2022,10 Athleta Beanie - Tan,75 Adidas Tennis Shoe - Black,84 Hollister Puffer - Green,,24 Aerie Leggings - Green,16 Aerie Sports Bra - Green,50 Glossier Crewneck Sweater - Gray,85 Ubc Tote Bag - Green,,,
2,12/21/2022,10 Athleta Beanie - Tan,75 Adidas Tennis Shoe - Black,84 Hollister Puffer - Green,,26 Aerie Leggings - Black,17 Aerie Sports Bra - Black,50 Glossier Crewneck Sweater - Gray,85 Ubc Tote Bag - Green,,,
3,12/22/2022,85 Ubc Tote Bag - Green,9 Blondo Boots - Black,84 Hollister Puffer - Green,,3 Aerie Jeans - Blue,2 Plisse Tank - Black,,81 No Brand Gold Hoops - Gold,,,
4,12/23/2022,82 No Brand Christmas Tree Hoops - Gold,9 Blondo Boots - Black,84 Hollister Puffer - Green,57 American Eagle Maxi Dress - Green,,6 Express Body Suit - White,46 No Brand Vest - White,,,,


In [12]:
df.drop("Timestamp", axis=1)

Unnamed: 0,Date,Accessory,Shoes,Outerwear,Full Body,Bottom,Top,Top2,Accessory2,Accessory3,Accessory4,Note
0,12/19/2022,10 Athleta Beanie - Tan,75 Adidas Tennis Shoe - Black,84 Hollister Puffer - Green,,24 Aerie Leggings - Green,16 Aerie Sports Bra - Green,50 Glossier Crewneck Sweater - Gray,85 Ubc Tote Bag - Green,,,
1,12/20/2022,10 Athleta Beanie - Tan,75 Adidas Tennis Shoe - Black,84 Hollister Puffer - Green,,24 Aerie Leggings - Green,16 Aerie Sports Bra - Green,50 Glossier Crewneck Sweater - Gray,85 Ubc Tote Bag - Green,,,
2,12/21/2022,10 Athleta Beanie - Tan,75 Adidas Tennis Shoe - Black,84 Hollister Puffer - Green,,26 Aerie Leggings - Black,17 Aerie Sports Bra - Black,50 Glossier Crewneck Sweater - Gray,85 Ubc Tote Bag - Green,,,
3,12/22/2022,85 Ubc Tote Bag - Green,9 Blondo Boots - Black,84 Hollister Puffer - Green,,3 Aerie Jeans - Blue,2 Plisse Tank - Black,,81 No Brand Gold Hoops - Gold,,,
4,12/23/2022,82 No Brand Christmas Tree Hoops - Gold,9 Blondo Boots - Black,84 Hollister Puffer - Green,57 American Eagle Maxi Dress - Green,,6 Express Body Suit - White,46 No Brand Vest - White,,,,


two things: would be useful to have counts and also the dates in a list per item

Need to be able to handle multiple items in a single category (layering, etc)
- doesnt make sense to edit google sheet because layering can happen with all categories

In [13]:
path = "../data/2023TestData.csv"

df = pd.read_csv(path).drop("Timestamp", axis=1).melt("Date").dropna()

df

Unnamed: 0,Date,variable,value
0,12/19/2022,Accessory,10 Athleta Beanie - Tan
1,12/20/2022,Accessory,10 Athleta Beanie - Tan
2,12/21/2022,Accessory,10 Athleta Beanie - Tan
3,12/22/2022,Accessory,85 Ubc Tote Bag - Green
4,12/23/2022,Accessory,82 No Brand Christmas Tree Hoops - Gold
5,12/19/2022,Shoes,75 Adidas Tennis Shoe - Black
6,12/20/2022,Shoes,75 Adidas Tennis Shoe - Black
7,12/21/2022,Shoes,75 Adidas Tennis Shoe - Black
8,12/22/2022,Shoes,9 Blondo Boots - Black
9,12/23/2022,Shoes,9 Blondo Boots - Black


In [14]:
df.groupby("value").count()

Unnamed: 0_level_0,Date,variable
value,Unnamed: 1_level_1,Unnamed: 2_level_1
10 Athleta Beanie - Tan,3,3
16 Aerie Sports Bra - Green,2,2
17 Aerie Sports Bra - Black,1,1
2 Plisse Tank - Black,1,1
24 Aerie Leggings - Green,2,2
26 Aerie Leggings - Black,1,1
3 Aerie Jeans - Blue,1,1
46 No Brand Vest - White,1,1
50 Glossier Crewneck Sweater - Gray,3,3
57 American Eagle Maxi Dress - Green,1,1


In [15]:
df["test0"] = df["value"].str.split(",").str[0]
df["test1"] = df["value"].str.split(",").str[1]
df.head(3)

Unnamed: 0,Date,variable,value,test0,test1
0,12/19/2022,Accessory,10 Athleta Beanie - Tan,10 Athleta Beanie - Tan,
1,12/20/2022,Accessory,10 Athleta Beanie - Tan,10 Athleta Beanie - Tan,
2,12/21/2022,Accessory,10 Athleta Beanie - Tan,10 Athleta Beanie - Tan,


ugh maybe this can be reformatted somehow for easier parsing; also i dont love how the format is now

OR 
maybe if column contains more than 1 number, split it i

IDEA:
- GOOGLE SHEET FORM: ID ITEM - COLOR
- then multiple selections can be parse via "," in the sheet

In [16]:
# def closet
path = "../data/2023TestData.csv"

df = pd.read_csv(path).drop("Timestamp", axis=1).melt("Date").dropna()

# extract ID number from value
df["ID"] = df.value.str.extract('(\d+)').astype(int)

closet_counts = df.groupby(["value", "ID"]).count().reset_index().rename(columns={"Date":"count"}).drop(["variable"],axis=1)

closet_counts

# left join closet + df
x = pd.merge(closet, closet_counts, how="left", on="ID")
x

Unnamed: 0,ID,Item,Category,Sub-Category,Color,Pattern,Brand,Bought,Cost,2023,Price,PrimaryC,Name,value,count
0,0,Turtleneck,Top,Sweater,Black,Plain,Zara,"Secondhand, Thrifted",cheap,No,,Black,0 Zara Turtleneck - Black,,
1,1,Tank,Top,Tanktop,"Black, Red, Gold",Feather,Plisse,"Secondhand, Thrifted",cheap,No,,Black,1 Plisse Tank - Black,,
2,2,Tank,Top,Tanktop,"Black, Tan",Leopard,Plisse,"Secondhand, Thrifted",cheap,No,15.0,Black,2 Plisse Tank - Black,2 Plisse Tank - Black,1.0
3,3,Jeans,Bottom,Pants,Blue,Plain,Aerie,New,cheap,No,,Blue,3 Aerie Jeans - Blue,3 Aerie Jeans - Blue,1.0
4,4,Shirt,Top,Shirt,"Black, White",Cheetah,Free People,"Secondhand, Depop",cheap,No,,Black,4 Free People Shirt - Black,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,82,Christmas Tree Hoops,Accessory,Jewelry,Gold,Plain,No Brand,"Secondhand, Thrifted",cheap,No,,Gold,82 No Brand Christmas Tree Hoops - Gold,82 No Brand Christmas Tree Hoops - Gold,1.0
83,83,Square Hoops,Accessory,Jewelry,Gold,Plain,Tj Maxx,New,cheap,No,,Gold,83 Tj Maxx Square Hoops - Gold,,
84,84,Puffer,Outerwear,Coat,Green,Plain,Hollister,New,pricy,No,,Green,84 Hollister Puffer - Green,84 Hollister Puffer - Green,5.0
85,85,Tote Bag,Accessory,Bag,Green,Logo,Ubc,New,cheap,No,,Green,85 Ubc Tote Bag - Green,85 Ubc Tote Bag - Green,4.0


In [17]:
x = pd.merge(closet, df, on="ID")


In [18]:
closet_counts = df.groupby(["value"]).count().reset_index().rename(columns={"Date":"count"}).drop(["ID", "variable"],axis=1)
closet_counts

Unnamed: 0,value,count
0,10 Athleta Beanie - Tan,3
1,16 Aerie Sports Bra - Green,2
2,17 Aerie Sports Bra - Black,1
3,2 Plisse Tank - Black,1
4,24 Aerie Leggings - Green,2
5,26 Aerie Leggings - Black,1
6,3 Aerie Jeans - Blue,1
7,46 No Brand Vest - White,1
8,50 Glossier Crewneck Sweater - Gray,3
9,57 American Eagle Maxi Dress - Green,1


In [19]:
closet_comp = alt.Chart(closet_counts, title="2023 Most Worn Pieces"
                       ).mark_bar(color="Maroon"
                         ).encode(alt.X("value", axis=alt.Axis(labelAngle=-45), sort='-y'),
                                  alt.Y("count", axis=alt.Axis(format='%'),),
                                  alt.Tooltip("value", format=",.2f"),
                                  color=alt.condition(
                                    alt.datum.year == "New",
                                    alt.value('orange'),
                                    alt.value('maroon')   
                                    )
                                 )
                
closet_comp

## its a mess up there but this is the function
1. takes in google form data
2. counts how many of each item logged
3. left joins with closet data

In [20]:
def complete_df(closet, path="../data/2023TestData.csv"):
    """
    Function to merge raw closet data and collected 2023 data.
    
    Parameters
    ----------
        closet : pandas.DataFrame
            Dataframe containing complete closet log.
        path : string
            String containing path of CSV of collected data.
    
    Returns
    -------
        complete_df : pandas.DataFrame
            Dataframe containing "ID", "Name", "count", "Item", 
            "Category", "Sub-Category", "Color", "Pattern", "Brand", "Cost", "2023"
    """


    form = pd.read_csv(path).drop("Timestamp", axis=1).melt("Date").dropna()

    # extract ID number from value
    form["ID"] = form.value.str.extract('(\d+)').astype(int)

    form_counts = form.groupby(["value", "ID"]).count().reset_index().rename(columns={"Date":"count"}).drop(["variable"],axis=1)
    
    # left join closet + df
    complete_df = pd.merge(closet, form_counts, how="left", on="ID")
    complete_df["Name"] = complete_df["Brand"] + " " + complete_df["Item"]
    complete_df = complete_df[["ID", "Name", "count", "Item", "Category", "Sub-Category", "Color", "Pattern", "Brand", "Cost", "2023"]]
    complete_df = complete_df.fillna(0).rename(columns={"count" : "Count"})
    complete_df["Count"] = complete_df["Count"].astype(int)
    
    return complete_df


In [54]:
closet = sww.closet_df()
worn_df = complete_df(closet)
worn_df

Unnamed: 0,ID,Name,Count,Item,Category,Sub-Category,Color,Pattern,Brand,Cost,2023
0,0,Zara Turtleneck,0,Turtleneck,Top,Sweater,Black,Plain,Zara,cheap,No
1,1,Plisse Tank,0,Tank,Top,Tanktop,"Black, Red, Gold",Feather,Plisse,cheap,No
2,2,Plisse Tank,1,Tank,Top,Tanktop,"Black, Tan",Leopard,Plisse,cheap,No
3,3,Aerie Jeans,1,Jeans,Bottom,Pants,Blue,Plain,Aerie,cheap,No
4,4,Free People Shirt,0,Shirt,Top,Shirt,"Black, White",Cheetah,Free People,cheap,No
...,...,...,...,...,...,...,...,...,...,...,...
83,83,Tj Maxx Square Hoops,0,Square Hoops,Accessory,Jewelry,Gold,Plain,Tj Maxx,cheap,No
84,84,Hollister Puffer,5,Puffer,Outerwear,Coat,Green,Plain,Hollister,pricy,No
85,85,Ubc Tote Bag,4,Tote Bag,Accessory,Bag,Green,Logo,Ubc,cheap,No
86,86,No Brand Purse,0,Purse,Accessory,Bag,White,Vintage,No Brand,cheap,No


In [22]:
worn_df["Count"].astype(int)

0     0
1     0
2     1
3     1
4     0
     ..
82    1
83    0
84    5
85    4
86    0
Name: Count, Length: 87, dtype: int64

## Some data viz

### 2023 Most Worn Pieces

In [23]:
most_worn = worn_df.nlargest(15, columns="Count")
closet_comp = alt.Chart(most_worn, title="2023 Most Worn Pieces"
                       ).mark_bar(color="Maroon"
                         ).encode(alt.X("Name", axis=alt.Axis(labelAngle=-45), sort="-y"),
                                  alt.Y("Count", 
                                        title="# of Times Worn",
                                        axis=alt.Axis(tickMinStep=1)),
                                  alt.Tooltip("Count")
                                 )
                
closet_comp

### Make similar plots but facet per category!

The problem with the below code is the x-axis is the same and we do not want that. We want the x axis to contain unique items for that category!
- Solution: generate 6 separate plots and concat

In [24]:
category_worn = worn_df.nlargest(15, columns="Count")
closet_comp = alt.Chart(category_worn, title="2023 Most Worn Pieces"
                       ).mark_bar(color="Maroon"
                       ).encode(alt.X("Name", axis=alt.Axis(labelAngle=-45), sort="-y"),
                                alt.Y("Count", 
                                title="# of Times Worn",
                                axis=alt.Axis(tickMinStep=1)),
                                alt.Tooltip("Count")
                        ).facet("Category", columns=3)
                
closet_comp

In [25]:
worn_df["Category"]

0           Top
1           Top
2           Top
3        Bottom
4           Top
        ...    
82    Accessory
83    Accessory
84    Outerwear
85    Accessory
86    Accessory
Name: Category, Length: 87, dtype: object

**Solution**: Concat six graphs because the x and y axes will inherently very different

*whyyyyy won't sort="y" work pls*

In [47]:
categories = ["Top", "Bottom", "Full Body", "Outerwear", "Accessory", "Shoes"]

cat_plots = []

for i in categories:
    category_worn = worn_df.loc[worn_df["Category"] == i].nlargest(15, columns="Count")
    
    category_plot = alt.Chart(category_worn, title=f"2023 Most Worn {i}"
                       ).mark_bar(color="#B79492"
                       ).encode(alt.X("Name", title="", axis=alt.Axis(labelAngle=-45), sort="-y"),
                                alt.Y("Count", 
                                title="# of Times Worn",
                                axis=alt.Axis(tickMinStep=1)),
                                alt.Tooltip(["Name", "Count"])
                        ).resolve_scale(x='independent')
    cat_plots.append(category_plot)

# configure altair charts
row1 = alt.hconcat(cat_plots[0], cat_plots[1], cat_plots[2])
row2 = alt.hconcat(cat_plots[3], cat_plots[4], cat_plots[5])

category_plot = alt.vconcat(row1, row2)
category_plot

Troubleshooting this gd error

In [41]:
category_worn = worn_df.loc[worn_df["Category"] == "Top"].nlargest(10, columns="Count")
category_worn["Count"]

50    3
16    2
2     1
6     1
17    1
46    1
0     0
1     0
4     0
5     0
Name: Count, dtype: int64

In [42]:
category_worn.head(5)

Unnamed: 0,ID,Name,Count,Item,Category,Sub-Category,Color,Pattern,Brand,Cost,2023
50,50,Glossier Crewneck Sweater,3,Crewneck Sweater,Top,Sweater,Gray,Logo,Glossier,cheap,No
16,16,Aerie Sports Bra,2,Sports Bra,Top,Workout,Green,Plain,Aerie,Cheap,No
2,2,Plisse Tank,1,Tank,Top,Tanktop,"Black, Tan",Leopard,Plisse,cheap,No
6,6,Express Body Suit,1,Body Suit,Top,Shirt,White,Plain,Express,cheap,No
17,17,Aerie Sports Bra,1,Sports Bra,Top,Workout,Black,Plain,Aerie,Cheap,No


Weirdly the solution seems to be reduce the dataframe I'm pulling from?
`.nlargest(15, columns="Count")` to `nlargest.(5, columns="Count")` fixed it in the concat'd plot

In [43]:
alt.Chart(category_worn, title=f"2023 Most Worn Top"
                   ).mark_bar(color="Pink"
                   ).encode(alt.X("Name", title="", axis=alt.Axis(labelAngle=-45), sort="-y"),
                            alt.Y("Count", 
                            title="# of Times Worn",
                            axis=alt.Axis(tickMinStep=1)),
                            alt.Tooltip(["Name", "Count"])
                    ).resolve_scale(x='independent')

In [50]:
categories = ["Top", "Bottom", "Full Body", "Outerwear", "Accessory", "Shoes"]

cat_plots = []

for i in categories:
    category_worn = worn_df.loc[worn_df["Category"] == i].nlargest(5, columns="Count")
    
    category_plot = alt.Chart(category_worn, title=f"2023 Most Worn {i}"
                       ).mark_bar(color="#B40490"
                       ).encode(alt.X("Name", title="", axis=alt.Axis(labelAngle=-45), sort="-y"),
                                alt.Y("Count", 
                                title="# of Times Worn",
                                axis=alt.Axis(tickMinStep=1)),
                                alt.Tooltip(["Name", "Count"])
                        ).resolve_scale(x='independent')
    cat_plots.append(category_plot)

# configure altair charts
row1 = alt.hconcat(cat_plots[0], cat_plots[1], cat_plots[2])
row2 = alt.hconcat(cat_plots[3], cat_plots[4], cat_plots[5])

category_plot = alt.vconcat(row1, row2)
category_plot

## Heatmap of plots

In [85]:
path = "../data/2023TestData.csv"

df = pd.read_csv(path).drop("Timestamp", axis=1).melt("Date").dropna()
df["Date"] = pd.to_datetime(df["Date"])

# extract ID number from value
df["ID"] = df.value.str.extract('(\d+)').astype(int)
df = pd.merge(closet, df, how="right", on="ID")
df = df[["ID", "Item", "Color", "Pattern", "Category", "Date"]]
df.sample(4)

Unnamed: 0,ID,Item,Color,Pattern,Category,Date
4,82,Christmas Tree Hoops,Gold,Plain,Accessory,2022-12-23
17,24,Leggings,Green,Plain,Bottom,2022-12-20
11,84,Puffer,Green,Plain,Outerwear,2022-12-20
3,85,Tote Bag,Green,Logo,Accessory,2022-12-22


Maybe once we have the top 10 items we can heatmap them?

In [86]:
top_10 = [10, 85, 81]
heatmap_data = df.loc[df["ID"] == top_10[1]]
heatmap_data

Unnamed: 0,ID,Item,Color,Pattern,Category,Date
3,85,Tote Bag,Green,Logo,Accessory,2022-12-22
29,85,Tote Bag,Green,Logo,Accessory,2022-12-19
30,85,Tote Bag,Green,Logo,Accessory,2022-12-20
31,85,Tote Bag,Green,Logo,Accessory,2022-12-21


Ok idea is:
- make a calender df
- full merge the above df
    - replace NA item with ..?
    - or make a boolean column that will be colored if yes? (0 1) 
- plot month on x and day on y axis 

In [205]:
time_df = pd.DataFrame()
time_df["Date"] = pd.date_range('2022-12-01', periods=30)
time_df["Day"] = time_df["Date"].dt.day_name()


In [206]:
year = pd.merge(time_df, heatmap_data, how="outer", on="Date")
year["Item"] = year["Item"].replace(np.nan, 0) 
year["Bool"] = np.where(year["Item"] == 0, 0, 1)

In [207]:
year[15:22]

Unnamed: 0,Date,Day,ID,Item,Color,Pattern,Category,Bool
15,2022-12-16,Friday,,0,,,,0
16,2022-12-17,Saturday,,0,,,,0
17,2022-12-18,Sunday,,0,,,,0
18,2022-12-19,Monday,85.0,Tote Bag,Green,Logo,Accessory,1
19,2022-12-20,Tuesday,85.0,Tote Bag,Green,Logo,Accessory,1
20,2022-12-21,Wednesday,85.0,Tote Bag,Green,Logo,Accessory,1
21,2022-12-22,Thursday,85.0,Tote Bag,Green,Logo,Accessory,1


This is the ugliest plot I have seen in my life

In [212]:
alt.Chart(year).mark_rect().encode(
    x="Date:O", 
    y="Day",
    color="Bool"
)

I wonder if it would be a better viz to see what months it was worn the most intead?
- bc yeah otherwise big x axis 

package: calmap

In [219]:
year.set_index("Date")

Unnamed: 0_level_0,Day,ID,Item,Color,Pattern,Category,Bool
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-12-01,Thursday,,0,,,,0
2022-12-02,Friday,,0,,,,0
2022-12-03,Saturday,,0,,,,0
2022-12-04,Sunday,,0,,,,0
2022-12-05,Monday,,0,,,,0
2022-12-06,Tuesday,,0,,,,0
2022-12-07,Wednesday,,0,,,,0
2022-12-08,Thursday,,0,,,,0
2022-12-09,Friday,,0,,,,0
2022-12-10,Saturday,,0,,,,0


In [221]:
calmap.yearplot(year["Bool"], cmap='YlGn', fillcolor='lightgrey',daylabels='MTWTFSS',dayticks=[0, 2, 4, 6],
                linewidth=2)

AttributeError: 'numpy.int64' object has no attribute 'year'

TO DO NEXT:
- color scheme of plots 
- figure out why tf the x-axis seems to be sorting alphabetically first

IDEA: 
- heatmap of year and most worn item (this is where image could pop up)
    - omg so fun im excited
- is there anyway to read google sheet data in automatically? its a pain to download everytime for testing

# Note to self 
-- maybe add ID into google sheet for easier merging later?