In [3]:
import pandas as pd
import numpy as np 

import spacy

### load data

In [4]:

# Load the training split. header=None tells pandas the file has no header row,
# so the three column names are supplied explicitly.
df_train = pd.read_csv(
    'Amazon review dataset/train.csv',
    names=['polarity', 'title', 'review'],
    header=None,
)

In [5]:
df_train.sample(5)

Unnamed: 0,polarity,title,review
1162981,2,b- lynch maaaaaaaaaaaaaaaaannn,Lynch is back in that ????????? U know what im...
1347597,1,LOW QUALITY VERY EARLY RECORDINGS!!!,THIS SHONGS ARE NOT THE GREATEST HITS OF THIS ...
512958,1,YALL MUST BE KIDDING...,"ok first off, if you come by my review, let it..."
2081008,1,surf's up...NOT,The ONLY reason that I gave this game two (2) ...
1858731,2,Decent adventure with some excellent highlights,"A decent, well-plotted adventure. I could have..."


### check class imbalance

In [6]:
df_train.polarity.value_counts()

polarity
2    1800000
1    1800000
Name: count, dtype: int64

No class imbalance

In [7]:
# Number of rows whose review body is missing (NaN)
int(df_train['review'].isna().sum())

0

In [None]:
df_train.review.isnull().any()

In [None]:
# Number of rows whose title is missing (NaN)
int(df_train['title'].isna().sum())

There are 207 NaN values in the title column; this will cause a problem if we plan to combine the title and review into a single field.

In [None]:
# Encode the target column: polarity 2 (positive) -> 0, polarity 1 (negative) -> 1
df_train['label'] = df_train['polarity'].map({
    2: 0,
    1: 1,
})
df_train.sample(3)

In [None]:
# Combine the review title and the review body into a single text column.
# fillna('') replaces a missing (NaN) title with an empty string, so the
# concatenation below does not turn the whole row into NaN.
df_train['full_review'] = (
    df_train['title'].fillna('')
    + " "
    + df_train['review']
)

In [None]:
df_train.head()

In [None]:
# to check that the new 'full_review' is not affected by the Nan values in title
df_train[df_train.title.isnull()]

In [None]:
df_train.title[0]

In [None]:
df_train.review[0]

In [None]:
df_train.full_review[0]

In [None]:
# Check for duplicates: mark every row that repeats an earlier row
# (all columns compared), then report whether any row was marked.
duplicate_rows = df_train.duplicated(keep='first')
duplicate_rows.any()

No duplicates in the data.

In [None]:
len(df_train.full_review[0])

In [None]:
len(df_train.full_review[100])

In [None]:
len(df_train.full_review[1])

In [None]:
# Average length, in characters, of a full review.
# The previous loop measured full_review[0] on every iteration, so it only ever
# reported the length of the first review. The vectorised .str.len() measures
# every row before averaging.
full_review_avg_length = df_train['full_review'].str.len().mean()
print(full_review_avg_length)

In [None]:
# Check for very short reviews: fewer than 10 whitespace-separated tokens

short_reviews = df_train[df_train['full_review'].apply(lambda x: len(x.split()) < 10)]

# Label distribution among the short reviews. A bare expression that is not the
# last line of a cell is evaluated and then discarded, so print it explicitly.
print(short_reviews.label.value_counts())

print(f"Number of short reviews: {len(short_reviews)}")

In [None]:
short_reviews

In [None]:
short_reviews.full_review.iloc[0]

There is a lot of garbage and noise that needs our attention.

### a random sample of train_df for ease of experiments

In [None]:
# A lighter DataFrame to experiment with.
# random_state pins the draw so that Restart & Run All selects the same
# 100,000 rows every time and the cells that use the sample stay reproducible.
df_train_sample = df_train.sample(100000, random_state=42)

In [None]:
df_train_sample.label.value_counts() 

In [None]:
len(df_train_sample)

### Remove URL and HTML tags if any

In [None]:
from bs4 import BeautifulSoup


def _contains_html_tag(text):
    """Return True when BeautifulSoup's html.parser finds at least one tag in ``text``."""
    first_tag = BeautifulSoup(text, "html.parser").find(True)
    return bool(first_tag)


# Boolean mask over the sample: True for reviews that contain any HTML tag
has_html_tags = df_train_sample['full_review'].apply(_contains_html_tag)


In [None]:
df_train_sample[has_html_tags].full_review.iloc[1]

There are indeed HTML tags that will require our attention.

In [None]:
# Check for URLs
import re

# Boolean mask: True where the review contains "http" followed by non-space
# characters. str.contains runs the regex search over the whole column at once
# instead of a Python loop over .iloc positions; na=False treats a missing
# review as "no URL" rather than producing a NaN in the mask.
has_url = df_train_sample['full_review'].str.contains(r'http\S+', regex=True, na=False)
df_train_sample[has_url]

Neither URLs nor HTML tags affect the customer's sentiment, so they are just noise in our case.

### Preprocessing Steps:

1. **Remove URLs**: Strips any URLs from the review, as they don't contribute meaningfully to sentiment analysis.

2. **Remove Gibberish & Excessive Repeated Characters**: Identifies and removes sequences with too many repeated characters (e.g., "ooooo" becomes "o") and repeated words (e.g., "great great" becomes "great").

3. **Remove Punctuation, Numbers, and Long Words**: Filters out punctuation, numbers, and excessively long words (over 20 characters).

4. **Lowercase Text**: Converts all text to lowercase.

5. **Remove Stopwords (except "not", "no", and "nor")**: Removes common stopwords except negations to preserve their importance.

6. **Lemmatization**: Converts words to their base form (e.g., "running" to "run").


In [None]:
import re
import spacy
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# Load the large English spaCy model
nlp = spacy.load('en_core_web_lg')

# Negation words carry sentiment, so they are taken out of spaCy's default
# stopword set and therefore survive stopword removal.
stopwords_to_keep = {"not", "no", "nor"}
custom_stopwords = nlp.Defaults.stop_words.difference(stopwords_to_keep)

# Function to detect gibberish 
def remove_gibberish(text):
    """Collapse stretched characters and immediately repeated words.

    Two regex substitutions are applied in order:

    1. A character repeated four or more times in a row is reduced to a
       single occurrence (e.g. 'ooooo' -> 'o').
    2. A word followed by one or more copies of itself, each separated by a
       single space, is reduced to one copy (e.g. "great great" -> "great").

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'(.)\1{3,}', r'\1'),           # stretched characters
        (r'\b(\w+)( \1\b)+', r'\1'),     # repeated words
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Function to preprocess text
def preprocess_text(text):
    """Clean one review and return its kept tokens as a space-separated string.

    Steps, in order:

    1. Strip URLs (anything starting with ``http``, ``https`` or ``www``).
    2. Collapse stretched characters and repeated words via ``remove_gibberish``.
    3. Tokenise with the spaCy pipeline ``nlp``.
    4. Drop punctuation, digit tokens and tokens longer than 20 characters.
    5. Drop stopwords listed in ``custom_stopwords`` (negations are not in
       that set, so they are kept).
    6. Lowercase each remaining token's lemma and keep it only if it is
       purely alphabetic.

    Args:
        text: Raw review text.

    Returns:
        The cleaned review: lowercase lemmas joined by single spaces. May be
        an empty string when nothing survives the filters.
    """
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Handle gibberish or repeated patterns
    text = remove_gibberish(text)

    # Process with spaCy
    doc = nlp(text)

    tokens = []
    for token in doc:
        # Skip punctuation, digits, or tokens that are excessively long
        if token.is_punct or token.is_digit or len(token.text) > 20:
            continue
        # The stopword set holds lowercase words, so compare the lowercased
        # token. Comparing the raw text let capitalised or all-caps stopwords
        # ("The", "THIS") slip through.
        if token.lower_ in custom_stopwords:
            continue
        # Lemmatize and lowercase valid words
        lemma = token.lemma_.lower()
        # Keep only purely alphabetic lemmas
        if lemma.isalpha():
            tokens.append(lemma)

    return " ".join(tokens)

# Combine everything into a pipeline
def combined_preprocessing(text_series):
    """Apply ``preprocess_text`` to every element of ``text_series``.

    Args:
        text_series: An object exposing ``.apply`` (a pandas Series in this
            notebook) whose elements are raw review strings.

    Returns:
        The result of ``text_series.apply(preprocess_text)``.
    """
    cleaned_series = text_series.apply(preprocess_text)
    return cleaned_series

# Create the scikit-learn pipeline
# Single-step pipeline: wrap the text-cleaning function so it can be used
# through the scikit-learn transformer interface.
preprocess_step = ('preprocess', FunctionTransformer(combined_preprocessing))
pipeline = Pipeline(steps=[preprocess_step])



In [None]:
# Smoke test: run the pipeline on the short reviews and print the raw and
# cleaned text side by side for the rows at positions 0 and 5.
cleaned_reviews = pipeline.fit_transform(short_reviews['full_review'])
for row_slice in (slice(0, 1), slice(5, 6)):
    raw_values = short_reviews['full_review'].iloc[row_slice].values
    cleaned_values = cleaned_reviews.iloc[row_slice].values
    print(f' before: {raw_values} || after: {cleaned_values} ')


In [None]:
cleaned_reviews

In [None]:
import numpy as np 

In [None]:
np.__version__