# In [None]:
# Import necessary libraries
import pandas as pd

# Load the dataset. The path is relative, so the CSV must be in the current
# working directory when this runs.
df = pd.read_csv('ecommerce_furniture_dataset.csv')

# View the first few rows of the dataset
print(df.head())

# Check for missing values (number of nulls per column)
print(df.isnull().sum())

# Drop every row that has a missing value in any column.
# NOTE: the surviving rows keep their original index labels, so the index may
# contain gaps from this point on.
df = df.dropna()

# Replace each distinct tagText value with its integer category code
df['tagText'] = df['tagText'].astype('category').cat.codes

# Report column dtypes and non-null counts. DataFrame.info() writes the report
# itself and returns None, so it is called directly; wrapping it in print()
# would emit an extra "None" line after the report.
df.info()

import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of 'sold' values (histogram with a KDE curve overlaid)
sns.histplot(df['sold'], kde=True)
plt.title('Distribution of Furniture Items Sold')
plt.show()

# Plot the relationship between originalPrice, price and sold
sns.pairplot(
    df,
    vars=['originalPrice', 'price', 'sold'],
    kind='scatter',
)
# pairplot draws a grid of subplots. plt.title() would label only whichever
# axes is current, so the heading is set on the figure instead; y > 1 lifts
# it clear of the top row of panels.
plt.suptitle(
    'Relationship Between Price, Original Price, and Items Sold',
    y=1.02,
)
plt.show()

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a new feature: percentage discount relative to originalPrice.
# NOTE(review): rows where originalPrice is 0 would produce inf/NaN here;
# confirm whether such rows can occur in the data.
df['discount_percentage'] = (
    (df['originalPrice'] - df['price']) / df['originalPrice']
) * 100

# Convert productTitle into numeric features using TF-IDF Vectorizer, with
# the vocabulary capped at 100 terms.
tfidf = TfidfVectorizer(max_features=100)
productTitle_tfidf = tfidf.fit_transform(df['productTitle'])

# Convert to DataFrame and concatenate to original df.
# concat(axis=1) aligns rows by index label. Without index=df.index the new
# frame would be labelled 0..n-1, which does not line up with df whenever
# df's index has gaps (e.g. after rows were dropped), yielding misaligned
# values and extra NaN-filled rows.
# NOTE(review): the new columns are named after the vocabulary terms; a term
# equal to an existing column name would create duplicate column labels.
productTitle_tfidf_df = pd.DataFrame(
    productTitle_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=df.index,
)
df = pd.concat([df, productTitle_tfidf_df], axis=1)

# Drop original productTitle as it's now encoded
df = df.drop('productTitle', axis=1)

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Separate the predictors from the target.
# NOTE(review): 'sold' is taken as the target and every other column as a
# predictor; confirm this matches the intended setup. Both models require all
# predictor columns to be numeric.
X = df.drop('sold', axis=1)
y = df['sold']

# Train-test split (80% train, 20% test); random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize models
lr_model = LinearRegression()
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train models
lr_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Predict with Linear Regression and score on the held-out test set
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Predict with Random Forest and score on the held-out test set
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print model evaluation results
print(f'Linear Regression MSE: {mse_lr}, R2: {r2_lr}')
print(f'Random Forest MSE: {mse_rf}, R2: {r2_rf}')