In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="./datasetb2d9982/dataset/train.csv")

In [3]:
# Report, for every column, how many entries are missing
print(df.isna().sum())

PRODUCT_ID               0
TITLE                   13
BULLET_POINTS       837366
DESCRIPTION        1157382
PRODUCT_TYPE_ID          0
PRODUCT_LENGTH           0
dtype: int64


In [4]:
# Drop any rows with missing values (if mode values cause instability, drop all missing values only for test)
# df.dropna(inplace=True)

In [5]:
# Impute missing values in the TITLE column with the mode (the most frequent title).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that form is chained assignment,
# which triggers a FutureWarning on pandas 2.x and no longer updates `df` once
# Copy-on-Write is enabled.
df['TITLE'] = df['TITLE'].fillna(df['TITLE'].mode()[0])

In [6]:
# Keep only the rows in which both BULLET_POINTS and DESCRIPTION are present
df.dropna(axis=0, how='any', subset=['BULLET_POINTS', 'DESCRIPTION'], inplace=True)

In [7]:
# Confirm that no column has missing entries left after the cleaning steps
print(df.isna().sum())

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
PRODUCT_LENGTH     0
dtype: int64


In [8]:
# Visualizing the df
# Refer to the Colaboratory notebook for visualizing the df --> https://colab.research.google.com/drive/1szNvND0e7GlUPA98WiQYwcUo2VZ9De9C?usp=sharing

In [9]:
# Preprocess the text data
# NLTK supplies the tokenizer, the stop-word corpus and the Porter stemmer used by
# the preprocess_text function defined in the next cell.
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Fetch the NLTK data packages. Presumably 'punkt' backs nltk.word_tokenize —
# confirm against the installed NLTK version.
nltk.download('punkt')
# 'stopwords' is the corpus read through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/lakshaynasa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lakshaynasa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Function to preprocess the text data
# Cache for the resources preprocess_text needs; filled on first use so the
# stop-word corpus is read and the stemmer is built only once, not once per row.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the ``(stop_words, stemmer)`` pair, creating it on the first call."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES["stemmer"] = PorterStemmer()
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["stemmer"]


def preprocess_text(text):
    """Lower-case, tokenize, remove English stop words from, and stem ``text``.

    Parameters
    ----------
    text : str or None or float
        The raw text. ``None`` and NaN (pandas' missing-value marker) are
        treated as empty text; any other non-string value is converted with
        ``str()`` before processing.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces. An empty
        string when the input is missing or contains no remaining tokens.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    stop_words, stemmer = _get_text_resources()

    # Tokenize the lower-cased text
    tokens = nltk.word_tokenize(str(text).lower())

    # Drop stop words, stem what is left, and join back into a single string
    return " ".join(
        stemmer.stem(token) for token in tokens if token not in stop_words
    )

In [None]:
# Run each free-text column through preprocess_text, one column after another
# (TITLE first, then DESCRIPTION, then BULLET_POINTS), overwriting it in df
for column_name in ("TITLE", "DESCRIPTION", "BULLET_POINTS"):
    df[column_name] = df[column_name].apply(preprocess_text)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
# Feature columns, grouped by the encoder that consumes them further down
text_columns = [
    "TITLE",
    "DESCRIPTION",
    "BULLET_POINTS",
]  # fed to the TF-IDF vectorizer
categorical_columns = [
    "PRODUCT_TYPE_ID",
]  # fed to the one-hot encoder

In [None]:
# Vectorize the text data
# Build one document per row by joining the text columns with single spaces.
# Concatenating whole columns produces the same strings as a row-wise
# " ".join but avoids calling a Python lambda once for every row.
combined_text = df[text_columns[0]]
for column in text_columns[1:]:
    combined_text = combined_text + " " + df[column]

vectorizer = TfidfVectorizer()
text_features = vectorizer.fit_transform(combined_text)

In [None]:
# Learn the categories first, then turn the categorical columns into indicator columns
encoder = OneHotEncoder()
encoder.fit(df[categorical_columns])
categorical_features = encoder.transform(df[categorical_columns])

In [None]:
# Combine the features
from scipy import sparse


def _feature_names_of(transformer, *input_features):
    """Return a fitted transformer's output feature names as a plain list.

    Prefers ``get_feature_names_out`` and falls back to the legacy
    ``get_feature_names`` on scikit-learn releases that predate it. Converting
    to ``list`` makes ``+`` a concatenation rather than a NumPy operation.
    """
    getter = getattr(transformer, "get_feature_names_out", None)
    if getter is None:
        getter = transformer.get_feature_names
    return list(getter(*input_features))


feature_names = _feature_names_of(vectorizer) + _feature_names_of(encoder, categorical_columns)
# Stack the two matrices without densifying them: .toarray() would allocate
# rows x columns floats. X is therefore a SciPy CSR matrix; the estimators
# used below accept sparse input.
X = sparse.hstack([text_features, categorical_features], format="csr")
y = df["PRODUCT_LENGTH"].values

In [None]:
# Define a column transformer to preprocess new data
# A text vectorizer consumes a 1-D sequence of documents, so each one is
# addressed with a single column name (a string). Passing a list of columns
# would hand it a 2-D frame, and it would then iterate over column names
# instead of rows.
# NOTE: every text column gets its own TF-IDF vocabulary here, which differs
# from the joined-text features built for X above.
preprocessor = ColumnTransformer(
    [(f"text_{column.lower()}", TfidfVectorizer(), column) for column in text_columns]
    # handle_unknown="ignore": a category unseen during fit is encoded as all
    # zeros instead of raising when new data is transformed.
    + [("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns)]
)

In [None]:
# Evaluate the performance of different models using cross-validation
# The tree-based estimators are randomised; a fixed random_state keeps their
# scores comparable between runs.
models = [
    LinearRegression(),
    Lasso(),
    Ridge(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42)
]

In [None]:
# Score every candidate with 5-fold cross-validation and print its mean MAPE
for model in models:
    fold_scores = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_absolute_percentage_error"
    )
    # The scorer returns negated MAPE as a fraction; flip the sign and scale to percent
    mean_mape_percent = -100 * np.mean(fold_scores)
    print(type(model).__name__)
    print("Mean absolute percentage error: {:.2f}%".format(mean_mape_percent))
    print()

In [None]:
# Fit gradient boosting on all rows, then measure its error on those same rows
best_model = GradientBoostingRegressor()
y_pred = best_model.fit(X, y).predict(X)
mape = mean_absolute_percentage_error(y, y_pred)
print("Mean absolute percentage error on training set: {:.2f}%".format(100 * mape))