In [1]:
# Download the Hygiene dataset archive; wget saves it into the current working directory.
!wget https://d396qusza40orc.cloudfront.net/dataminingcapstone/Task6/Hygiene.tar.gz
    
# Unpack the archive. The absolute path only matches the download location
# when the notebook runs from /kaggle/working.
!tar -xvf /kaggle/working/Hygiene.tar.gz

--2024-04-19 05:55:25--  https://d396qusza40orc.cloudfront.net/dataminingcapstone/Task6/Hygiene.tar.gz
Resolving d396qusza40orc.cloudfront.net (d396qusza40orc.cloudfront.net)... 52.84.160.76, 52.84.160.159, 52.84.160.182, ...
Connecting to d396qusza40orc.cloudfront.net (d396qusza40orc.cloudfront.net)|52.84.160.76|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39134299 (37M) [application/x-gzip]
Saving to: 'Hygiene.tar.gz'


2024-04-19 05:55:26 (66.0 MB/s) - 'Hygiene.tar.gz' saved [39134299/39134299]

Hygiene/
Hygiene/hygiene.dat.additional
Hygiene/hygiene.dat
Hygiene/hygiene.dat.labels


In [2]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import spacy
import nltk

import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


# Summarization:

## Text Preprocessing
- normalization: lowercase, lemmatization
- removal: irrelevant characters(x): stop_words include 'not'
- tokenization: N-gram(2-4)

## Feature engineering
+ TF-IDF(uni-gram, bi-gram)
+ Word embedding (ConceptNet Numberbatch, loaded via gensim)
+ feature selection(to do)

## Model selections
+ Logistic regression
+ random forest
+ SVM

## Criteria:
+ CV
+ roc_auc_score
+ F1_score

## Text Preprocessing

In [3]:
# Directory that holds the unpacked dataset files.
base_path = '/kaggle/working/Hygiene'

# Review texts and their labels sit side by side in that directory.
reviews_path, labels_path = (
    os.path.join(base_path, file_name)
    for file_name in ('hygiene.dat', 'hygiene.dat.labels')
)

In [4]:
# Number of leading lines to read from the review file and from the label file.
# NOTE(review): presumably the count of labelled reviews in the dataset — confirm.
N = 546

# zip(range(N), f) stops after N lines *or* at end of file, so a file shorter
# than N lines no longer aborts with StopIteration the way bare next(f) did.
# The encoding is explicit so the result does not depend on the platform default.
with open(reviews_path, 'r', encoding='utf-8') as f:
    data = [line for _, line in zip(range(N), f)]

# Each label line holds a single integer; int() fails loudly on anything else.
with open(labels_path, 'r', encoding='utf-8') as f:
    y = [int(line.strip()) for _, line in zip(range(N), f)]

# Reviews and labels are paired by position, so their counts must agree.
assert len(data) == len(y), f"{len(data)} reviews but {len(y)} labels"


In [5]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS  # Import stop words

# Load the spaCy pipeline "en_core_web_sm"; preprocess() below runs every text
# through it and reads each token's lemma.
nlp = spacy.load("en_core_web_sm")

def preprocess(lst, remove_stop_words=False):
    """Lowercase and lemmatize a list of strings with the spaCy pipeline `nlp`.

    Args:
        lst (list): Strings to preprocess. Newline characters are removed
            before the text is lowercased.
        remove_stop_words (bool): When True, tokens that spaCy flags as stop
            words (``token.is_stop``) are dropped. Defaults to False, which
            keeps every token.

    Returns:
        list: One list of lemma strings per input string, in input order.
    """
    cleaned = [text.replace('\n', '').lower() for text in lst]

    filtered_lst = []
    # nlp.pipe processes the texts as a batched stream, which is faster than
    # calling nlp(text) once per document.
    for doc in nlp.pipe(cleaned):
        filtered_lst.append([
            token.lemma_
            for token in doc
            if not (remove_stop_words and token.is_stop)
        ])
    return filtered_lst


In [6]:
# One list of lemmas per loaded review, in the same order as `data`.
data_tokens = preprocess(data)


### Features engineering

1. Statistical features: TF-IDF

In [7]:
# Two TF-IDF views of the raw review texts, both with scikit-learn's built-in
# English stop-word list: single words, then adjacent word pairs.
unigram_vectorizer = TfidfVectorizer(stop_words='english')
tf_idf_uni = unigram_vectorizer.fit_transform(data)

bigram_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2, 2))
tf_idf_bi = bigram_vectorizer.fit_transform(data)

2. Word embedding: ConceptNet Numberbatch (300-dimensional, loaded via gensim)

In [None]:
import gensim.downloader as api

# Pretrained word vectors obtained through gensim's downloader. The identifier
# names the ConceptNet Numberbatch 17.06 release with 300-dimensional vectors.
model = api.load('conceptnet-numberbatch-17-06-300') 


In [None]:
# Dimensionality of the word vectors, and therefore of every averaged document
# vector. Read from the embedding model so this cell runs on a fresh kernel
# without depending on variables that are only defined further down.
model.vector_size

In [None]:
def _lookup_vector(keyed_vectors, token):
    """Return the embedding of `token`, or None when it is out of vocabulary."""
    # NOTE(review): the Numberbatch release distributed by gensim appears to key
    # its entries as ConceptNet URIs ('/c/en/<term>'); the plain token is tried
    # first, so this fallback is harmless if that is not the case. Confirm by
    # inspecting the model's vocabulary.
    for key in (token, f"/c/en/{token}"):
        try:
            return keyed_vectors[key]
        except KeyError:  # only "unknown word" is expected; other errors should surface
            continue
    return None


text_vec_lst = []

for text in data_tokens:
    token_vec_lst = [
        vec
        for vec in (_lookup_vector(model, token) for token in text)
        if vec is not None
    ]

    if token_vec_lst:
        # Document vector = mean of the vectors of its known tokens.
        text_vec_lst.append(np.mean(token_vec_lst, axis=0))
    else:
        # No known token: use a zero vector instead of dividing by zero, so
        # the rows of data_ft stay aligned with the labels.
        text_vec_lst.append(np.zeros(model.vector_size, dtype=np.float32))

data_ft = np.array(text_vec_lst)

In [None]:
# Sanity check: one row per review, one column per embedding dimension.
data_ft.shape

### Models Building:


1. Logistic regression
2. Random Forest(ensemble bagging learning)
3. SVM

In [None]:
# Logistic regression fitted with the liblinear solver.
LR_classifier = LogisticRegression(solver='liblinear', random_state=0)

# Random forest of 100 trees.
RF_classifier = RandomForestClassifier(random_state=1875, n_estimators=100)

# Support vector machine with a linear kernel.
SVM_classifier = SVC(C=1.0, kernel='linear', random_state=1875)

### Criteria setting

1. CV for F1_score and ROC_AUC

In [None]:
# Feature matrices to compare: TF-IDF uni-grams, TF-IDF bi-grams, averaged embeddings.
vectorizations = [tf_idf_uni, tf_idf_bi, data_ft]

# Classifiers to compare, evaluated on each feature matrix.
models = [LR_classifier, RF_classifier, SVM_classifier]

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression

# Labels for the feature matrices, in the order they were added to `vectorizations`.
feature_names = ['tf_idf_uni', 'tf_idf_bi', 'data_ft']

# One shared 5-fold splitter: with a fixed seed every (features, classifier)
# pair is scored on the same folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Score every (features, classifier) combination with F1 and ROC AUC.
for feature_name, X in zip(feature_names, vectorizations):
    # The loop variable is `clf`, not `model`: `model` holds the word-embedding
    # vectors and must not be overwritten by a classifier.
    for clf in models:
        f1_scores = cross_val_score(clf, X, y, scoring='f1', cv=cv)
        auc_scores = cross_val_score(clf, X, y, scoring='roc_auc', cv=cv)
        # Label each result so the scores can be attributed to a combination.
        print(
            f"[{feature_name} + {type(clf).__name__}]\n"
            f"Average F1_SCORE: {f1_scores.mean()}\n"
            f"Average ROC_AUC_SCORE: {auc_scores.mean()}"
        )

## RESULT:
+ tf_idf_uni as vectorization of data
+ random_forest as model