# Imports

In [None]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# additional imports for the Textual features
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# NLP: Bag of Words & Text Classification Tasks

## The Data  

We will use the **Women’s Clothing E-Commerce dataset**, which revolves around reviews written by customers.


* **Review Text:** String variable for the review body.

* **Recommended:** Binary variable stating whether the customer recommends the product: 1 is recommended, 0 is not recommended.


## Task - "EDA"
1. Load the dataset `Womens_Clothing_E-Commerce_Reviews.csv` into a pandas DataFrame.
* You can use any other public dataset!
2. Drop any unnecessary columns
3. Print the number of rows and columns in the dataset.
4. For each column, calculate:
   - The number of **unique values**
   - The number of **missing values**
5. Display the result in a summary table for quick inspection.

 This task helps you understand the dataset structure, spot missing values, and plan preprocessing accordingly.


In [None]:
# Load the reviews CSV (expected in the working directory) and preview the first rows
csv_path = "Womens_Clothing_E-Commerce_Reviews.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Review Text,Recommended
0,0,Absolutely wonderful - silky and sexy and comf...,1
1,1,Love this dress! it's sooo pretty. i happene...,1
2,2,I had such high hopes for this dress and reall...,0
3,3,"I love, love, love this jumpsuit. it's fun, fl...",1
4,4,This shirt is very flattering to all due to th...,1


In [None]:
# Discard the 'Unnamed: 0' column; only the review text and the label are used below.
# NOTE(review): its values in the preview mirror the row index, so it looks like
# a saved index column - confirm against the CSV.
df = df.drop('Unnamed: 0', axis='columns')

In [None]:
# Per-column summary: [column name, number of distinct values, number of missing entries]
unique_count = [
    [col, len(df[col].unique()), df[col].isnull().sum()]
    for col in df.columns
]

n_rows, n_cols = df.shape
print("Dataframe Dimension: {} Rows, {} Columns".format(n_rows, n_cols))

# Index by column name and transpose so every dataset column becomes a table column
summary = pd.DataFrame(unique_count, columns=["Column", "Unique", "Missing"])
summary.set_index("Column").T

Dataframe Dimension: 22641 Rows, 2 Columns


Column,Review Text,Recommended
Unique,22634,2
Missing,0,0


## Task - Split Train-Test

1. Split your dataset into **training** and **test** sets  (80% train / 20% test)
2. Extract the textual data from the column `'Review Text'` into two variables:
   - `x_train_textual`
   - `x_test_textual`

3. Create two DataFrames:
   - `train_text_df` — will hold both the raw and preprocessed review texts for the train set
   - `test_text_df` — same for the test set

Each DataFrame should have two columns:
- `'raw text'`: the original review
- `'preprocessed text'`: the cleaned review (to be filled in the next task)


In [None]:
# Name of the label column that is separated from the features below
target_col = "Recommended"

In [None]:
# y is the label Series; x is a DataFrame with every remaining column
y = df[target_col]
x = df.drop(columns=target_col)
print(f'x shape: {x.shape}, y shape: {y.shape}')

x shape: (22641, 1), y shape: (22641,)


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(x, y, test_size=0.20, random_state=0)
x_train, x_test, y_train, y_test = split_parts
print(f'x train: {len(x_train)}, x test: {len(x_test)}, \ny train: {len(y_train)}, y test: {len(y_test)}')

x train: 18112, x test: 4529, 
y train: 18112, y test: 4529


In [None]:
# Pull the review text out of each split; both results are pandas Series
# that keep the original row labels of x_train / x_test
text_col = 'Review Text'
x_train_textual, x_test_textual = x_train[text_col], x_test[text_col]

print(x_train_textual)

10426    Super cute in raspberry and very flattering! i...
7546     This shell can be worn casual or dressy depend...
4914     Too tight in strange ways. beautiful dress i w...
12006    This top has great detailing and color. does r...
187      Finally a "swing top" that doesn't look like a...
                               ...                        
13123    I wasn't expecting to love to this dress as mu...
19648    I received the pants and they are really nice....
9845     When i first put this dress on, i immediately ...
10799    This jacket is stylish and unique. love the li...
2732     I was very excited about receiving this dress....
Name: Review Text, Length: 18112, dtype: object


In [None]:
# Empty frames that will hold every review before and after preprocessing,
# one frame for the train split and one for the test split
text_columns = ['raw text', 'preprocessed text']
train_text_df = pd.DataFrame(columns=text_columns)
test_text_df = pd.DataFrame(columns=text_columns)

## Task - Text Preprocessing

Now preprocess the reviews in both `x_train_textual` and `x_test_textual`.

Your preprocessing pipeline should include:

- Lowercasing
- Tokenization (by splitting on spaces)
- Stopword removal
- **Stemming** using NLTK’s `PorterStemmer`
- Join the tokens back into a single string

Additional instructions:

- Use NLTK’s stopword list.
- Exclude the words `"no"` and `"not"` from the stopwords list (to preserve negation).
- Apply the pipeline separately for the train and test sets.
- Store the results in the appropriate `'preprocessed text'` column in `train_text_df` and `test_text_df`.

Feel free to use the code from the slides.

In [None]:
# Preprocess the train reviews: lowercase, split on whitespace, drop stopwords,
# stem the remaining tokens and join them back into one string per review.
# The test set is processed in its own cell and reuses `ps` and `our_stop_words`.
# Think about whether to use the given stop words or define/exclude words yourself.

# initialize stemmer
ps = PorterStemmer()

# load stopwords from nltk
nltk_stopwords = set(stopwords.words('english'))

# keep the negation words out of the stopword list so they survive preprocessing
not_stopwords = {'no', 'not'}
our_stop_words = nltk_stopwords - not_stopwords


def preprocess_review(raw_review):
    """Return the cleaned version of a single review.

    Args:
        raw_review: the original review text (a string).

    Returns:
        A string of space-separated stemmed tokens, with every token found in
        `our_stop_words` removed.

    NOTE: tokens come from a plain whitespace split, so punctuation stays
    attached to the word (e.g. "flattering!"); such tokens are compared to the
    stopword list as-is.
    """
    tokens = raw_review.lower().split()
    return ' '.join(ps.stem(token) for token in tokens if token not in our_stop_words)


#-----Train set
# Build the whole frame in one step. Growing a DataFrame row by row with
# `.loc[i] = ...` re-allocates on every insertion and gets slow for the
# ~18k train reviews. The row labels of x_train_textual are preserved.
train_text_df = pd.DataFrame({
    'raw text': x_train_textual,
    'preprocessed text': x_train_textual.map(preprocess_review),
})

train_text_df.head()

Unnamed: 0,raw text,preprocessed text
10426,Super cute in raspberry and very flattering! i...,super cute raspberri flattering! pair revers t...
7546,This shell can be worn casual or dressy depend...,shell worn casual dressi depend pair with. enjoy!
4914,Too tight in strange ways. beautiful dress i w...,tight strang ways. beauti dress excit receiv i...
12006,This top has great detailing and color. does r...,"top great detail color. run littl big, add sty..."
187,"Finally a ""swing top"" that doesn't look like a...","final ""swing top"" look like sack me! want part..."


In [None]:
#-----Test set
# Same pipeline as for the train set (lowercase -> whitespace split -> stopword
# removal -> stemming -> join), reusing the stemmer `ps` and the stopword set
# `our_stop_words` created in the train-set cell.
# The frame is built in one step instead of being grown row by row with
# `.loc[i] = ...`, which re-allocates on every insertion.
# The row labels of x_test_textual are preserved.
test_text_df = pd.DataFrame({
    'raw text': x_test_textual,
    'preprocessed text': [
        ' '.join(
            ps.stem(word)
            for word in raw_review.lower().split()
            if word not in our_stop_words
        )
        for raw_review in x_test_textual
    ],
})

test_text_df.head()

Unnamed: 0,raw text,preprocessed text
8321,Beautiful and very versatile dress. extremely ...,beauti versatil dress. extrem forgiv post babi...
6633,"I don't have anything like it in my closet, re...","anyth like closet, realli excit wear"
11405,"This is a cute dress, but the buttons are tiny...","cute dress, button tini top one crack half cou..."
9778,I bought this shirt at the store and after goi...,"bought shirt store go home tri on, promptli we..."
8703,"I love the paisley print, very soft and comfy,...","love paisley print, soft comfy, flutter sleev ..."


## Task - Features Extraction

In this task, you’ll convert the preprocessed text into numeric features using `BoW` (or `TF-IDF`).

Your steps:

1. Extract the `'preprocessed text'` column from both `train_text_df` and `test_text_df`, and store them in:
   - `processed_train`
   - `processed_test`

2. Initialize a `CountVectorizer`.

3. Fit the vectorizer only on the training set (`processed_train`).

4. Transform both the training and test sets using the fitted vectorizer:
   - `x_train_textual = cv.transform(processed_train)`
   - `x_test_textual = cv.transform(processed_test)`

5. Convert the results to dense numpy arrays using `.toarray()` for later compatibility with classical models.

6. Print the shape and a sample of the resulting feature vectors to validate your work.



In [None]:
# Take the cleaned-text column of each frame; these Series are the vectorizer input
processed_train, processed_test = (
    frame['preprocessed text'] for frame in (train_text_df, test_text_df)
)

In [None]:
# Bag-of-words with binary=True, so each feature is a 0/1 presence flag
# rather than a term count. The vocabulary is learned from the train text only.
cv = CountVectorizer(binary=True).fit(processed_train)

# Encode both splits with that vocabulary and densify the sparse result.
# Note: this rebinds x_train_textual / x_test_textual from text Series to numpy arrays.
x_train_textual, x_test_textual = (
    cv.transform(docs).toarray() for docs in (processed_train, processed_test)
)

# sanity check - both matrices must have the same number of columns
print('train shape: ', x_train_textual.shape, 'test shape: ', x_test_textual.shape)
print(x_train_textual)

train shape:  (18112, 12028) test shape:  (4529, 12028)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Task - The Model

Now that you’ve preprocessed the text and converted it into numerical features, it's time to **train a classification model**.

Your goal is to build a model that can **predict the target label** using the feature matrix `x_train_textual`.

Instructions:

1. Select and train a classification model of your choice (e.g.`LogisticRegression`)

2. Fit the model on the **training set**.

3. Use it to predict on the **test set**.

4. Evaluate your model using accuracy and a classification report (precision, recall, F1-score).

In [None]:
# Fit a logistic regression with default settings on the train features,
# then score it on the held-out test features.
# NOTE(review): warnings are silenced globally in the imports cell, so a solver
# convergence warning would not be visible here - confirm the fit converges.
model = LogisticRegression().fit(x_train_textual, y_train)
score = model.score(x_test_textual, y_test)
print(f'Model accuracy: {score}')

Model accuracy: 0.8831971737690439


In [None]:
# Predicted labels for the test reviews, used by the classification report
y_pred = model.predict(X=x_test_textual)

In [None]:
# Per-class precision / recall / F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.75      0.60      0.67       877
           1       0.91      0.95      0.93      3652

    accuracy                           0.88      4529
   macro avg       0.83      0.78      0.80      4529
weighted avg       0.88      0.88      0.88      4529

