# Data Analysis

In [1]:
import requests
from requests.adapters import HTTPAdapter
import json

import pandas as pd
from datetime import date
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import time
import numpy as np
import re

The first part of the analysis is to manually categorize all the recipes. After that is done, I get the following dataframe.

In [2]:
# Load the manually categorized pattern list.
df = pd.read_csv("data/patterns_total_categorized.csv")

# Indicator column: 1 where the Category is 'knockers', 0 for every other row.
is_knocker = df['Category'] == 'knockers'
df['knocker'] = np.where(is_knocker, 1, 0)

In [3]:
# Total project count for knocker (1) versus non-knocker (0) rows, largest total first.
projects_by_flag = df.groupby('knocker').agg({'project_numbers': 'sum'})
knockers_true_false = (
    projects_by_flag
    .reset_index()
    .sort_values(by='project_numbers', ascending=False)
)
knockers_true_false

Unnamed: 0,knocker,project_numbers
1,1,4744
0,0,991


What is the most popular category?

In [4]:
# Sum the number of completed projects per category, most popular category first.
category_totals = (
    df.groupby('Category')
    .agg({'project_numbers': 'sum'})
    .reset_index()
    .sort_values(by='project_numbers', ascending=False)
)

# Leave out the rows labelled 'na', then keep the five largest categories.
is_real_category = category_totals['Category'] != 'na'
categories = category_totals[is_real_category].head(5)

In [5]:
# Write the category totals to disk; the dataframe index is not written.
categories.to_csv(path_or_buf="data/categories.csv", index=False)

Are knockers most often knitted or crocheted?

In [6]:
# Restrict to the knocker rows and total their project counts per craft type,
# with the most used craft type first.
knockers = df[df['Category'] == 'knockers']
projects_per_craft = knockers.groupby('craft_type').agg({'project_numbers': 'sum'})
knockers_grouped = projects_per_craft.reset_index().sort_values(
    by='project_numbers', ascending=False
)

In [7]:
# Share of all knocker projects made with each craft type, in percent.
total_projects = knockers_grouped['project_numbers'].sum()
knockers_grouped['percent'] = knockers_grouped['project_numbers'] / total_projects * 100

# Round the numeric columns to whole numbers.
knockers_grouped = knockers_grouped.round(0)

In [8]:
# Write the per-craft-type totals and percentages to disk; the index is not written.
knockers_grouped.to_csv(path_or_buf="data/knockers_grouped.csv", index=False)

### Get the estimated price for a knocker

The price is estimated as the cost of the yarn needed to complete the project. Because Ravelry's pages are not consistently structured, it wasn't possible to scrape this information, so the data has been collected by hand. A few of the recipes did not have a recommended yarn; they are coded as missing.

In [9]:
yarn_df = pd.read_csv("data/knockers_yarn_details.csv")

# Missing values are written as the literal string 'na'; turn them into real NaNs.
# np.nan is used because the capitalised alias np.NaN was removed in NumPy 2.0.
yarn_df = yarn_df.replace('na', np.nan)

# Drop the recipes where some of the values are missing - they cannot be used in this calculation.
yarn_df = yarn_df.dropna()

# Change the numeric columns into the right format before doing arithmetic on them.
yarn_df = yarn_df.astype({
    'recipe_yardage_min': int,
    'recipe_yardage_max': int,
    'price_usd': float,
    'yarn_yards': int,
    'yard_grams': int,
})

# Yardage needed for the average version of the recipe: the midpoint of min and max.
avg_yardage = (yarn_df['recipe_yardage_min'] + yarn_df['recipe_yardage_max']) / 2

# Number of whole units of yarn ("wrenches" - presumably skeins/balls; confirm the term)
# needed to cover that yardage. Yarn is bought in whole units, so round up, and a
# project always needs at least one unit.
yarn_df['wrench'] = np.ceil(avg_yardage / yarn_df['yarn_yards']).clip(lower=1)

# And finally calculate the price of a knitted knocker
yarn_df['knocker_price'] = yarn_df['price_usd'] * yarn_df['wrench']
yarn_df.head(5)

Unnamed: 0,name,url,id,craft_type,project_numbers,Category,knocker,recipe_yardage_min,recipe_yardage_max,yarn,yarn_producer,price_usd,material,yarn_yards,yard_grams,wrench,knocker_price
2,Breast Cancer Awareness Boobies by Melanie Gro...,https://www.ravelry.com/patterns/library/breas...,865422,Crochet,4,knockers,1,180,191,100% cotton double knit,MoYa,8.88,cotton,148,50,2.0,17.76
4,Breast Forms by Lisa Rode,https://www.ravelry.com/patterns/library/breas...,865397,Crochet,0,knockers,1,40,140,Simply Soft Solids,Caron,5.49,acrylic,315,170,1.0,5.49
5,Breast Prosthetic Round by Scarlett Royal,https://www.ravelry.com/patterns/library/breas...,1053583,Loom Knitting,0,knockers,1,90,110,Coboo,Lion Brand,5.99,cotton_bamboo,232,100,1.0,5.99
6,Bust Buddies by Eileen Adler,https://www.ravelry.com/patterns/library/bust-...,825548,Knitting,0,knockers,1,50,50,Ultra Pima,Cascade Yarns,13.5,pima,220,100,1.0,13.5
8,Crochet Breast by Denise Hayes,https://www.ravelry.com/patterns/library/croch...,663244,Crochet,0,knockers,1,137,164,Magic Light,Ice Yarns,7.99,acrylic,394,100,1.0,7.99


### Comparing prices
Create a new dataframe containing the name and price of both the knitted knockers and the conventional breast prostheses.

First I reduce the dataframe above to the relevant columns, then I read in another dataset (the conventional breast prostheses), and finally I concatenate the two dataframes.

In [10]:
# Keep only the name, category and price columns, renaming the latter two
# so the column names match the other dataframe.
column_map = {'Category': 'type', 'knocker_price': 'price'}
knocker = yarn_df[['name', *column_map]].rename(columns=column_map)

In [11]:
# Read the conventional prosthesis data, leaving out the 'retailer' column
prosthesis = pd.read_csv("data/prosthesis_info.csv").drop(columns='retailer')

In [12]:
# Stack the two dataframes row-wise under a fresh index and discard rows with missing values
comparison = pd.concat([knocker, prosthesis], axis=0, ignore_index=True).dropna()

# Store the combined table; the index is not written
comparison.to_csv("data/comparison.csv", index=False)