In [223]:
import pandas as pd
from currency_converter import CurrencyConverter, SINGLE_DAY_ECB_URL

# Cleaning Products Data

In [224]:
c = CurrencyConverter(SINGLE_DAY_ECB_URL)

In [225]:
products_nexus = pd.read_csv('../nexus-product.csv').drop_duplicates(subset='name')
products_wtn = pd.read_csv('../we-the-north-product.csv').drop_duplicates(subset='name')

In [226]:
products_final = pd.concat([products_nexus, products_wtn], ignore_index=True)
products_final = products_final[['name', 'category', 'price', 'currency', 'description', 'vendor', 'marketplace']]

def convert(item):
    global c
    price, currency = item
    if currency == 'USD':
        return price
    return round(c.convert(price, currency.strip(), 'USD'), 2)

products_final['price'] = products_final[['price', 'currency']].apply(convert, axis=1)
products_final.rename(columns={'price': 'price_usd'}, inplace=True)
products_final.drop('currency', axis=1, inplace=True)

products_final.to_csv('products_final.csv', index=False)


# Cleaning Vendors Data

In [227]:
vendors_nexus = pd.read_csv('../nexus-vendor.csv').drop_duplicates(subset='name')
vendors_wtn = pd.read_csv('../we-the-north-vendor.csv').drop_duplicates(subset='name')
vendors_combined = pd.concat([vendors_nexus, vendors_wtn], ignore_index=True)
vendors_combined = vendors_combined[['name', 'about_text', 'review_count', 'marketplace']]

In [228]:
vendors_combined['review_count'].fillna(0, inplace=True)

def take_last_numeric_value(value):
    if not isinstance(value, str):
        return value
    if value.isnumeric():
        return value
    else:
        return int(value.split('|')[-1])
    
vendors_combined['review_count'] = vendors_combined['review_count'].apply(take_last_numeric_value).astype('int64')

vendors_combined['review_count_normalized'] = (vendors_combined['review_count'] - vendors_combined['review_count'].min()) / (vendors_combined['review_count'].max() - vendors_combined['review_count'].min())

vendors_combined['about_text'] = vendors_combined['about_text'].apply(lambda x: '' if x == "User hasn't filled in this info" else x)

vendors_combined.to_csv('vendors_final.csv', index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  vendors_combined['review_count'].fillna(0, inplace=True)


# Cleaning Reviews Data

In [229]:
reviews_wtn = pd.read_csv('../we-the-north-review.csv')
reviews_nexus = pd.read_csv('../nexus-review.csv')

def convert_rating(x):
    val = int(x.strip()[0])
    if val < 3:
        return 'negative'
    return 'positive'

reviews_nexus['rating'] = reviews_nexus['rating'].apply(convert_rating)

reviews_combined = pd.concat([reviews_wtn, reviews_nexus], ignore_index=True)


reviews_combined['price_paid'] = reviews_combined['price_paid'].str.extract(r'(\d+)').astype('float64')
reviews_combined['price_paid_usd'] = reviews_combined[['price_paid', 'currency']].apply(convert, axis=1)
reviews_combined.drop(['currency', 'price_paid', 'product_link', 'datetime', 'author'], axis=1, inplace=True)

reviews_combined.to_csv('reviews_final.csv', index=False)