### Starter code to extract data from .tgz file to begin with EDA, buliding NLP models

In [44]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tarfile # this is to extract the data from that .tgz file

In [2]:
# get all of the data out of that .tgz
amazon_reviews = tarfile.open('/kaggle/input/amazon-reviews/amazon_review_polarity_csv.tgz')
amazon_reviews.extractall('data')
amazon_reviews.close()

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/amazon-reviews/amazon_review_polarity_csv.tgz
/kaggle/input/amazon-reviews/train.csv
/kaggle/input/amazon-reviews/test.csv


In [4]:
import os
for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/amazon_review_polarity_csv/train.csv
./data/amazon_review_polarity_csv/test.csv
./data/amazon_review_polarity_csv/readme.txt


# load data

In [7]:
# check out what the data looks like before you get started
# look at the training data set
train_df = pd.read_csv('./data/amazon_review_polarity_csv/train.csv',header=None, names=['polarity', 'title', 'review'])
train_df.head() 

Unnamed: 0,polarity,title,review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [45]:
train_df.shape

(3600000, 5)

In [46]:
# map polarity : positive ->0 | negative -> 1
train_df['label'] = train_df.polarity.map({2:0, 1:1})
train_df.sample(3)

Unnamed: 0,polarity,title,review,label,full_review
3355153,2,Great Little Gardening Secrets!,Whether you are a seasoned gardener or just be...,0,Great Little Gardening Secrets! Whether you ar...
970674,1,"great historical setting, horrible characters","Great sex scenes and the build up of tension, ...",1,"great historical setting, horrible characters ..."
3116039,1,Yawn. Blah. Gimme a break.,"She's easy on the eyes, even easy on the ears,...",1,Yawn. Blah. Gimme a break. She's easy on the e...


In [13]:
train_df['full_review'] = train_df.title.fillna('') + " " + train_df.review 

In [14]:
train_df.head() 

Unnamed: 0,polarity,title,review,label,full_review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...,0,Stuning even for the non-gamer This sound trac...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,0,The best soundtrack ever to anything. I'm read...
2,2,Amazing!,This soundtrack is my favorite music of all ti...,0,Amazing! This soundtrack is my favorite music ...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...,0,Excellent Soundtrack I truly like this soundtr...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine...",0,"Remember, Pull Your Jaw Off The Floor After He..."


## Train/test split

In [47]:
from sklearn.model_selection import train_test_split

train_df_sample  = train_df.sample(200000, random_state = 42)

X_train,X_test, y_train, y_test = train_test_split(
    train_df_sample.full_review,
    train_df_sample.label,
    random_state=42
)

X_train.shape, X_test.shape

((150000,), (50000,))

## Data preprocessing

In [23]:
import re
import spacy
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler


nlp = spacy.load('en_core_web_lg')

# Custom stopwords list to keep "not", "no", "nor"
stopwords_to_keep = {"not", "no", "nor"}
custom_stopwords = nlp.Defaults.stop_words - stopwords_to_keep

# Function to detect gibberish 
def remove_gibberish(text):
    # Remove excessive repeated characters (e.g., 'ooooo' -> 'o')
    text = re.sub(r'(.)\1{3,}', r'\1', text)
    # Remove repeated words (e.g., "great great" -> "great")
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text)
    return text


def preprocess_text(text):
    # Remove URLs and apply gibberish removal
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = remove_gibberish(text)
    
    doc = nlp(text)
    
    tokens = [token.lemma_.lower() for token in doc if not (
        token.is_punct or token.is_digit or len(token.text) > 20 or token.text in custom_stopwords)]
    
    clean_text = " ".join(tokens)
    
    return nlp(clean_text).vector


def combined_preprocessing(text_series):
    return text_series.apply(preprocess_text)

preprocess_pipe = Pipeline([
    ('preprocess', FunctionTransformer(combined_preprocessing)),
    #('scaling'), MinMaxScaler() 
])



In [48]:
X_train_pre = preprocess_pipe.fit_transform(X_train)
X_test_pre = preprocess_pipe.fit_transform(X_test)

import numpy as np
X_train_st = np.stack(X_train_pre)
X_test_st = np.stack(X_test_pre)

## base line model

In [49]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.pipeline import make_pipeline


model = make_pipeline(
    MinMaxScaler(),
    MultinomialNB()
)

model.fit(X_train_st, y_train)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('multinomialnb', MultinomialNB())])

In [50]:
from sklearn.metrics import classification_report

print(classification_report(y_test, model.predict(X_test_st))) 

              precision    recall  f1-score   support

           0       0.76      0.74      0.75     24954
           1       0.75      0.77      0.76     25046

    accuracy                           0.75     50000
   macro avg       0.75      0.75      0.75     50000
weighted avg       0.75      0.75      0.75     50000



In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


model = make_pipeline(
    MinMaxScaler(),
    LogisticRegression() 
)

model.fit(X_train_st, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('logisticregression', LogisticRegression())])

In [52]:
from sklearn.metrics import classification_report

print(classification_report(y_test, model.predict(X_test_st))) 

              precision    recall  f1-score   support

           0       0.85      0.85      0.85     24954
           1       0.85      0.86      0.85     25046

    accuracy                           0.85     50000
   macro avg       0.85      0.85      0.85     50000
weighted avg       0.85      0.85      0.85     50000



In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb



models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "AdaBoost": AdaBoostClassifier(),
    #"Naive Bayes": GaussianNB(),
    "MLP Neural Network": MLPClassifier(max_iter=1000)
}

results = {}
for model_name, model in models.items():
    model = make_pipeline(
                            MinMaxScaler(),
                            model )

    scores = cross_val_score(model, X_train_st, y_train, cv=5, scoring='f1_macro')  
    results[model_name] = scores.mean()

results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Mean Accuracy'])

print(results_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
