In [1]:
from functools import lru_cache

import numpy as np
import pandas as pd

In [2]:
# Load the data
# df = pd.read_csv("./train.csv")
# `on_bad_lines='warn'` replaces the deprecated `error_bad_lines=False`
# (deprecated in pandas 1.3, removed in 2.0). Malformed rows are still skipped
# and each skipped row is still reported, matching the earlier behaviour.
df = pd.read_csv(filepath_or_buffer='train.csv', engine='python', on_bad_lines='warn') # while using colab - only for testing, not for prod.



  df = pd.read_csv(filepath_or_buffer='train.csv', engine='python', error_bad_lines=False)
Skipping line 14277: unexpected end of data


In [3]:
# Report how many entries are missing in each column of the raw frame
missing_per_column = df.isna().sum()
print(missing_per_column)

PRODUCT_ID            0
TITLE                 0
BULLET_POINTS      5312
DESCRIPTION        7400
PRODUCT_TYPE_ID       0
PRODUCT_LENGTH        0
dtype: int64


In [4]:
# Drop any rows with missing values (if mode values cause instability, drop all missing values; only for testing)
# df.dropna(inplace=True)

In [5]:
# Impute missing values in the TITLE column with the mode.
# The filled column is assigned back to `df` instead of calling
# `fillna(inplace=True)` on `df['TITLE']`: that chained in-place call acts on
# an intermediate object and does not update `df` under pandas Copy-on-Write.
title_mode = df['TITLE'].mode()
# mode() returns an empty Series when every TITLE is missing; in that case
# there is nothing to impute with, so the column is left unchanged.
if not title_mode.empty:
    df['TITLE'] = df['TITLE'].fillna(title_mode.iloc[0])

In [6]:
# Discard every row in which BULLET_POINTS or DESCRIPTION is missing
required_text_columns = ['BULLET_POINTS', 'DESCRIPTION']
df.dropna(subset=required_text_columns, inplace=True)

In [7]:
# Re-check the per-column missing-value counts after imputation and row removal
remaining_missing = df.isna().sum()
print(remaining_missing)

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
PRODUCT_LENGTH     0
dtype: int64


In [8]:
# Visualizing the df
# Refer to Colaboratory for visualizing the df --> https://colab.research.google.com/drive/1szNvND0e7GlUPA98WiQYwcUo2VZ9De9C?usp=sharing

In [9]:
# Preprocess the text data
# NLTK provides the tokenizer, the stop-word corpus and the Porter stemmer
# that the text-cleaning function in this notebook relies on.
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Fetch the required NLTK data packages by name. Presumably 'punkt' backs
# nltk.word_tokenize and 'stopwords' backs stopwords.words — confirm against
# the NLTK documentation for the installed version.
# The second call is left as the cell's last expression, so its return value
# is what the notebook displays.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
# Function to preprocess the text data
@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _porter_stemmer():
    """Return a single PorterStemmer instance shared by every call."""
    return PorterStemmer()


def preprocess_text(text):
    """Lower-case, tokenize, remove English stop words and stem ``text``.

    Parameters
    ----------
    text : str or None or float
        Raw text. ``None`` and float NaN (how pandas represents a missing
        string cell) are treated as "no text".

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces; the empty
        string when ``text`` is missing.
    """
    # Missing cells would otherwise fail on `.lower()`; `text != text` is the
    # dependency-free NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    tokens = nltk.word_tokenize(text.lower())

    # The stop-word set and the stemmer are loop-invariant, so they are built
    # once and reused instead of being recreated for every row.
    stop_words = _english_stop_words()
    stem = _porter_stemmer().stem

    return " ".join(stem(token) for token in tokens if token not in stop_words)

In [11]:
# Clean each free-text column with preprocess_text, in the order TITLE, DESCRIPTION, BULLET_POINTS
for text_column in ("TITLE", "DESCRIPTION", "BULLET_POINTS"):
    df[text_column] = df[text_column].apply(preprocess_text)

In [37]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the feature columns
text_columns = ["TITLE", "DESCRIPTION", "BULLET_POINTS"]
categorical_columns = ["PRODUCT_TYPE_ID"]


def join_text_columns(frame):
    """Collapse the columns of ``frame`` into one space-joined string per row.

    Used both for the hand-built features and inside ``preprocessor`` so that
    the two produce the same documents for the TF-IDF step.
    """
    return frame.apply(lambda row: " ".join(row), axis=1)


# Vectorize the text data
vectorizer = TfidfVectorizer()
text_features = vectorizer.fit_transform(join_text_columns(df[text_columns]))

# One-hot encode the categorical features.
# handle_unknown="ignore" leaves the training encoding unchanged but encodes a
# PRODUCT_TYPE_ID unseen during fit as all zeros instead of raising.
encoder = OneHotEncoder(handle_unknown="ignore")
categorical_features = encoder.fit_transform(df[categorical_columns])

# Combine the features
text_feature_names = vectorizer.get_feature_names_out()
categorical_feature_names = encoder.get_feature_names_out(categorical_columns)
feature_names = list(text_feature_names) + list(categorical_feature_names)
# NOTE(review): both matrices are converted to dense arrays here, which costs
# rows x (vocabulary + categories) floats of memory. Kept dense so X has the
# same type as before; consider scipy.sparse.hstack if memory becomes a problem.
X = np.concatenate((text_features.toarray(), categorical_features.toarray()), axis=1)
y = df["PRODUCT_LENGTH"].values

# Define a column transformer to apply the same preprocessing to new data.
# A text vectorizer expects a 1-D sequence of documents. Handing it the list
# `text_columns` directly would pass a 2-D frame, and it would then iterate
# column labels rather than rows. The text branch therefore joins the columns
# first, exactly as done for `text_features` above.
# The transformer is returned unfitted: call `preprocessor.fit(...)` before
# `transform`.
text_pipeline = Pipeline([
    ("join", FunctionTransformer(join_text_columns)),
    ("tfidf", TfidfVectorizer())
])
preprocessor = ColumnTransformer([
    ("text", text_pipeline, text_columns),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns)
])


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

# Evaluate the performance of different models using cross-validation.
# The stochastic estimators get a fixed random_state so that a rerun of this
# cell reproduces the same scores.
RANDOM_STATE = 42
models = [
    LinearRegression(),
    Lasso(),
    Ridge(),
    DecisionTreeRegressor(random_state=RANDOM_STATE),
    RandomForestRegressor(random_state=RANDOM_STATE),
    GradientBoostingRegressor(random_state=RANDOM_STATE)
]
cv_errors = []  # mean cross-validated MAPE (as a fraction), aligned with `models`
for model in models:
    scores = cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_percentage_error")
    cv_errors.append(-np.mean(scores))
    print(type(model).__name__)
    print("Mean absolute percentage error: {:.2f}%".format(-100 * np.mean(scores)))
    print()

# Train and evaluate the best model on the entire dataset.
# The best model is the one with the lowest cross-validated error rather than a
# hard-coded choice. nanargmin skips models whose score is NaN.
# cross_val_score works on clones, so the selected estimator is still unfitted.
best_model = models[int(np.nanargmin(cv_errors))]
print("Best model by cross-validation: {}".format(type(best_model).__name__))
best_model.fit(X, y)
y_pred = best_model.predict(X)
# NOTE: this error is measured on the same rows the model was fitted on, so it
# is an in-sample figure and will look better than the cross-validated scores.
mape = mean_absolute_percentage_error(y, y_pred)
print("Mean absolute percentage error on entire dataset: {:.2f}%".format(100 * mape))


LinearRegression
Mean absolute percentage error: 160277789815.89%

Lasso
Mean absolute percentage error: 735.12%

Ridge
Mean absolute percentage error: 2635.18%

DecisionTreeRegressor
Mean absolute percentage error: 619.01%

