In [103]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

columns_names = ["title", "price", "currency", "total sold", "condition", "color", "theme", "features", "url"]

# `names=` supplies the column labels for every scraped CSV.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each file's own row labels.
df = pd.concat(
    [
        pd.read_csv(csv_path, names=columns_names)
        for csv_path in ('output.csv', 'output2.csv', 'biggerDataset_-_output2.csv')
    ]
)
# The same listing can show up in more than one scrape; keep its first row.
df = df.drop_duplicates(subset=['url'])

def price_discretization(frame):
    """Return a copy of *frame* whose ``price`` column holds integers.

    Rows with a missing ``price`` are dropped. In the remaining price strings
    every comma, period, dollar sign and ``/ea`` suffix is removed before the
    cast to ``int``. Because the decimal point is stripped rather than parsed,
    ``'$12.99'`` becomes ``1299``.

    NOTE(review): a price written without decimals (``'$5'``) becomes ``5``,
    not ``500`` -- confirm the scraper always emits two decimal places.

    The input frame is not modified.
    """
    df = frame.dropna(subset=['price']).copy()
    # Raw string: '\.', '\$' and '\/' are invalid escape sequences in a plain
    # literal and raise a SyntaxWarning on recent Python versions.
    df['price'] = df['price'].str.replace(r',|\.|\$|\/ea', '', regex=True).astype(int)
    return df

# all current listings are sold => at least one shirt sold
def clean_total_sold(frame):
    """Return a copy of *frame* with ``total sold`` as integers of at least 1.

    Missing counts are filled with ``'1'``, thousands separators are removed,
    the strings are cast to ``int`` and any remaining ``0`` is raised to ``1``.
    The input frame is not modified.
    """
    df = frame.copy()
    # Work on the column and assign the result back: calling fillna/replace
    # with inplace=True on a column selection is chained assignment, which
    # does not update the frame under pandas Copy-on-Write.
    sold = df['total sold'].fillna('1')
    sold = sold.str.replace(',', '', regex=False).astype(int)
    df['total sold'] = sold.replace(0, 1)
    return df

# TODO: fix scraper-level issues on subsequent passes
def clean_scraper_issues(frame):
    """Return the rows of *frame* that survive the known scraper problems.

    A row is kept only when its ``price`` is greater than zero and its
    ``condition`` is present. The input frame is not modified.
    """
    # Keep listings whose associated URL had a nonzero closing price.
    has_closing_price = frame.price > 0
    priced_rows = frame.loc[has_closing_price]
    # The scraper only looks at 'Item Specifics' for condition and nowhere
    # else, so rows where it found none are discarded.
    return priced_rows.dropna(subset=['condition'])

def normalize_color(frame):
    """Return a copy of *frame* with ``color`` encoded as an integer code.

    Codes:
        3 -- several colors were on offer (missing value, or one of the
             known "multiple" spellings below)
        2 -- a single neutral color (black, white, beige(s), gray/grey)
        1 -- anything else, including multicolored prints

    Matching is case-insensitive and on the whole value, so every entry of
    the neutral list (including ``'beiges'``) is recognised. The previous
    substring replacement turned ``'beiges'`` into ``'2s'`` and coded it 1.
    The input frame is not modified.
    """
    multiple_spellings = [
        'Black, Ice Grey, Red, or White',
        'White,Black',
        'assorted colors',
        'Retro Gray, Black',
        'Ash Grey,Pink, White,Sand, Lt.Blue',
        'all',
    ]
    multicolor_spellings = [
        'Black, Gold',
        'Black, White',
        'As in Picture',
        'As picture',
        'Tie Dye',
    ]
    neutral_colors = ['black', 'white', 'beige', 'beiges', 'gray', 'grey']

    df = frame.copy()
    # listing URLs with NaN color had multiple colors available
    color = df['color'].fillna('Multiple')
    color = color.replace(multiple_spellings, 'Multiple')
    # 'Multicolor' is neither neutral nor 'multiple', so these end up as 1.
    color = color.replace(multicolor_spellings, 'Multicolor')
    color = color.str.lower()

    # np.select works on plain arrays, so it is indifferent to the row labels.
    is_multiple = (color == 'multiple').to_numpy()
    is_neutral = color.isin(neutral_colors).to_numpy()
    df['color'] = np.select([is_multiple, is_neutral], [3, 2], default=1).astype(int)
    return df

def normalize_condition(frame):
    """Return a copy of *frame* with ``condition`` encoded as an integer.

    Codes: 3 for values starting with ``'New with tags'``, 2 for
    ``'New without tags'`` and 1 for ``'Pre-owned'``. Whatever follows the
    label is discarded. A value that matches none of the labels and is not
    already numeric text makes the final ``astype(int)`` raise ``ValueError``.
    The input frame is not modified.
    """
    # '(?s)' lets '.' cross line breaks and '.*' also accepts the bare label;
    # the former '.+' left 'New with tags' (no trailing text) and multi-line
    # descriptions unconverted, which then crashed the integer cast.
    label_patterns = [
        (r'(?s)^New with tags.*', '3'),
        (r'(?s)^New without tags.*', '2'),
        (r'(?s)^Pre-owned.*', '1'),
    ]
    df = frame.copy()
    condition = df['condition']
    for pattern, code in label_patterns:
        condition = condition.str.replace(pattern, code, regex=True)
    df['condition'] = condition.astype(int)
    return df

def check_tags(frame):
    """Print, and draw as a pie chart, the tags that occur at least 20 times.

    A row's tags are the ``', '``-separated entries of its ``theme`` and
    ``features`` columns; missing values contribute no tags. Nothing is
    returned, and the chart is skipped when no tag reaches the threshold.
    """
    # Select by name: the columns are accessed by name right below anyway.
    tag_frame = frame[['theme', 'features']].fillna(value='')
    tag_lists = tag_frame['theme'].str.split(', ') + tag_frame['features'].str.split(', ')
    # ''.split(', ') yields [''], so the empty placeholders are filtered out.
    all_tags = [tag for tags in tag_lists for tag in tags if tag != '']
    # Series.value_counts replaces the deprecated top-level pd.value_counts
    # and, unlike np.concatenate, copes with a frame that has no tags at all.
    tag_frequency = pd.Series(all_tags, dtype=object).value_counts()

    most_frequent_tags = tag_frequency[tag_frequency >= 20]
    if not most_frequent_tags.empty:
        most_frequent_tags.plot.pie(label='Count')
    print(most_frequent_tags)

def binarize_tags(frame):
    """Return a copy of *frame* with ``theme``/``features`` one-hot encoded.

    A row's tags are the ``', '``-separated entries of its ``theme`` and
    ``features`` columns; missing values contribute no tags. The two source
    columns are dropped and one 0/1 column per distinct tag is joined on.
    The input frame is not modified.

    NOTE(review): the tag columns are attached with ``DataFrame.join``, which
    matches rows by index label, so duplicate labels in *frame* would
    multiply rows -- confirm callers pass a unique index.
    """
    df = frame.copy()

    # Select by name: the columns are accessed by name right below anyway.
    tag_source = df[['theme', 'features']].fillna(value='')
    combined = tag_source['theme'].str.split(', ') + tag_source['features'].str.split(', ')
    # ''.split(', ') yields [''], so the empty placeholders are filtered out.
    tag_lists = [[tag for tag in tags if tag != ''] for tags in combined]

    mlb = MultiLabelBinarizer()
    # Reuse df's index so the join lines up row-for-row even when the caller
    # has not reset the index; a default RangeIndex would misalign the tags.
    tag_matrix = pd.DataFrame(
        mlb.fit_transform(tag_lists),
        columns=mlb.classes_,
        index=df.index,
    )
    return df.drop(['theme', 'features'], axis=1).join(tag_matrix)

# Run the cleaning steps in order. The index is reset right before
# binarize_tags and the free-text columns are dropped at the end.
df = (
    df.pipe(price_discretization)
    .pipe(clean_total_sold)
    .pipe(clean_scraper_issues)
    .pipe(normalize_color)
    .pipe(normalize_condition)
    .reset_index(drop=True)
    .pipe(binarize_tags)
    .drop(columns=['title', 'currency', 'url'])
)

df.to_csv('cleaned_all_tags_12-10-2021.csv')