In [None]:
!pip install seaborn

import sys
!{sys.executable} -m pip install seaborn


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import os
from scipy.sparse import coo_matrix

In [None]:
# Read the product CSV (path is relative to the current working directory).
train_data = pd.read_csv(filepath_or_buffer='amazon_eco-friendly_products.csv')

# Display the column labels of the freshly loaded frame.
train_data.columns

In [None]:
# Keep only the columns used further down.
# The explicit .copy() makes the result an independent frame, so the
# column-by-column assignments in later cells do not raise
# SettingWithCopyWarning or depend on view/copy semantics.
selected_columns = [
    'id', 'title', 'name', 'category', 'material', 'brand', 'price',
    'rating', 'reviewsCount', 'description', 'url', 'img_url', 'inStock',
    'inStockText',
]
train_data = train_data[selected_columns].copy()

# Preview the first three rows.
train_data.head(3)

In [None]:
# (number of rows, number of columns) of the current frame.
train_data.shape

In [None]:
# Per-column count of missing values.
train_data.isna().sum()

In [None]:
# Replace missing values with type-appropriate defaults.
# Text columns get an empty string so they remain pure-string columns: the
# tag-extraction step calls .lower() on every value, which fails on a numeric
# placeholder or on NaN. 'title' is included because it is tag-extracted too.
# Numeric / flag columns keep 0 as their default.
# NOTE(review): column kinds are inferred from the column names - confirm
# against the actual dtypes of the CSV.
text_columns = ['title', 'name', 'category', 'material', 'brand',
                'description', 'inStockText']
numeric_columns = ['price', 'rating', 'reviewsCount', 'inStock']

fill_values = {column: '' for column in text_columns}
fill_values.update({column: 0 for column in numeric_columns})

# Columns that are not keys of the dict (id, url, img_url) are left untouched.
train_data = train_data.fillna(value=fill_values)


In [None]:
# Count the remaining missing values in every column.
train_data.isna().sum()

In [None]:
# Count the rows that exactly repeat an earlier row (all columns compared).
train_data.duplicated().sum()

In [None]:
# Remove rows that exactly repeat an earlier row; the first occurrence stays.
train_data = train_data.drop_duplicates(keep='first')


In [None]:
# Show any rows still flagged as duplicates.
train_data.loc[train_data.duplicated()]


In [None]:
# Give the columns short, capitalised labels.
# Keys are the current labels, values the replacements; columns that are not
# listed here keep their existing name.
column_name_mapping = dict(
    id='ID',
    title='Title',
    name='Name',
    category='Category',
    material='Material',
    brand='Brand',
    price='Price',
    rating='Rating',
    reviewsCount='Reviews',
    description='Description',
    url='URL',
)

# rename() returns a new frame, which replaces the previous binding.
train_data = train_data.rename(columns=column_name_mapping)


In [None]:
# Basic statistics: number of distinct values in three columns.
# The variable names are unchanged (later cells may read them); the printed
# labels now name the column that is actually counted instead of
# "users" / "items", which this frame has no columns for.
# NOTE(review): if ID was meant to identify users, restore the old wording.
num_users = train_data['ID'].nunique()        # distinct values of the ID column
num_items = train_data['Brand'].nunique()     # distinct values of the Brand column
num_ratings = train_data['Rating'].nunique()  # distinct values of the Rating column
print(f"Number of unique IDs: {num_users}")
print(f"Number of unique brands: {num_items}")
print(f"Number of unique rating values: {num_ratings}")

In [None]:
# Histogram of how many rows share each ID value.
# A single explicit Axes is used: the previous subplot(1, 2, 1) call drew only
# the left panel of a two-panel grid and left half of the figure empty.
# Labels describe what is plotted (value counts of the ID column).
# NOTE(review): confirm that ID does not identify users before relying on
# this wording.
fig, ax = plt.subplots(figsize=(8, 5))
train_data['ID'].value_counts().hist(bins=10, edgecolor='k', ax=ax)
ax.set_xlabel('Rows per ID')
ax.set_ylabel('Number of IDs')
ax.set_title('Distribution of Rows per ID')

fig.tight_layout()
plt.show()

In [None]:
# Bar chart of how often each rating value occurs, most frequent first.
rating_counts = train_data['Rating'].value_counts()
rating_counts.plot.bar(color='red')

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load spaCy's "en_core_web_sm" pipeline once; clean_and_extract_tags reuses it for every value.
nlp = spacy.load("en_core_web_sm")

def clean_and_extract_tags(text):
    """Turn a free-text value into a ', '-separated string of tags.

    The text is lower-cased and tokenised with the module-level ``nlp``
    pipeline; tokens that are alphanumeric and not in ``STOP_WORDS`` are kept
    in their original order.

    Parameters
    ----------
    text : str or any
        Raw cell value. Anything that is not a string (e.g. the float NaN of a
        missing cell, ``None``, or a numeric fill value) is treated as having
        no text.

    Returns
    -------
    str
        The kept tokens joined by ', ', or '' when there is no usable text.
    """
    # Non-string values have no .lower(); treat them as empty text rather
    # than letting the whole column transformation fail.
    if not isinstance(text, str):
        return ''
    doc = nlp(text.lower())
    tags = [token.text for token in doc if token.text.isalnum() and token.text not in STOP_WORDS]
    return ', '.join(tags)

# Text columns whose contents are converted into tag strings.
# The names must match the renamed column labels: the material column is
# 'Material' (singular) - 'Materials' does not exist and raised a KeyError.
columns_to_extract_tags_from = ['Title', 'Category', 'Material', 'Brand', 'Description']


# Overwrite each listed column with its cleaned, comma-separated tags.
for column in columns_to_extract_tags_from:
    train_data[column] = train_data[column].apply(clean_and_extract_tags)

In [None]:
# Concatenate the cleaned tags from all relevant columns into one string per row.
# Columns whose cleaned text is empty are skipped so the result never contains
# dangling separators such as "bamboo, , acme".
def _join_nonempty_tags(row):
    """Join the non-empty tag strings of one row with ', '."""
    return ', '.join(value for value in row if value)


train_data['Tags'] = train_data[columns_to_extract_tags_from].apply(_join_nonempty_tags, axis=1)