In [159]:
# Importing essential libraries for data manipulation and numerical operations
import pandas as pd
import numpy as np
import multiprocessing  # For parallel processing to speed up tasks
import gensim  # For word embeddings and other NLP tasks
import nltk  # For natural language processing tasks like tokenization and stemming
import spacy  # For advanced NLP tasks like named entity recognition and dependency parsing

# Importing a custom utility for creating mean word embeddings
from UtilWordEmbedding import MeanEmbeddingVectorizer

# Importing tools from scikit-learn for building machine learning pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer  # For transforming columns of data
from sklearn.base import BaseEstimator, TransformerMixin  # Base classes for custom transformers

# Importing text preprocessing functions from gensim
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_numeric, strip_short
from gensim.parsing.preprocessing import strip_multiple_whitespaces, strip_non_alphanum, remove_stopwords, stem_text

# Importing the Word2Vec model from gensim for creating word embeddings
from gensim.models.word2vec import Word2Vec
# Importing lemmatizers and stemmers from nltk for word normalization
from nltk.stem import WordNetLemmatizer, SnowballStemmer

# Importing tools from scikit-learn for encoding categorical variables and converting text data into numerical features
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer

# Importing various machine learning algorithms from scikit-learn and xgboost for classification tasks
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Importing tools for evaluating and selecting machine learning models
from sklearn import metrics  # For performance evaluation
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.model_selection import cross_val_score  # For cross-validation
from sklearn.model_selection import KFold, GridSearchCV  # For K-Fold cross-validation and hyperparameter tuning

# Importing tqdm for displaying progress bars during iterative processes
from tqdm import tqdm

# Importing stopwords from nltk and setting them up
from nltk.corpus import stopwords 
STOP_WORDS = set(stopwords.words('english'))

# Setting options for pandas to adjust how data frames are displayed
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Enabling inline plotting in Jupyter notebooks and setting a better quality for inline plots
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Setting a random seed to ensure reproducibility of results
SEED=26



In [160]:
#nltk.download('wordnet')

In [161]:

# Paths to the hygiene dataset files
hygiene_data_path = "C:\\Users\\kasam\\OneDrive\\Desktop\\data\\Hygiene\\hygiene.dat"  # Path to the hygiene data file
hygiene_labels_path = "C:\\Users\\kasam\\OneDrive\\Desktop\\data\\Hygiene\\hygiene.dat.labels"  # Path to the hygiene labels file
hygiene_extra_path = "C:\\Users\\kasam\\OneDrive\\Desktop\\data\\Hygiene\\hygiene.dat.additional"  # Path to the hygiene additional data file


In [162]:
#Data Prep (Stop words removal, cleaning and Tokenization, steaming and Lemanization

In [163]:
# List of preprocessing filters to apply to text
TEXT_FILTERS = [
    lambda text: text.lower(),  # Convert text to lowercase
    strip_tags,  # Remove HTML tags
    strip_punctuation,  # Replace punctuation characters with spaces
    strip_multiple_whitespaces,  # Remove repeating whitespaces
    gensim.parsing.preprocessing.remove_stopwords,  # Remove stopwords
    strip_short,  # Remove words less than minsize=3 characters long
    stem_text  # Stem the text
]

def preprocess_text(input_text):
    """
    Preprocess the input text by applying a series of filters and lemmatizing the tokens.
    
    Filters applied:
    - Convert text to lowercase
    - Remove HTML tags
    - Replace punctuation characters with spaces
    - Remove repeating whitespaces
    - Remove stopwords
    - Remove words less than 3 characters long
    - Stem the text
    
    Parameters:
    input_text (str): The input text to preprocess.
    
    Returns:
    list: A list of lemmatized tokens.
    """
    lemmatized_tokens = []
    for token in gensim.parsing.preprocessing.preprocess_string(input_text, TEXT_FILTERS):
        lemmatized_tokens.append(WordNetLemmatizer().lemmatize(token))
    return lemmatized_tokens



In [164]:
import time
from tqdm import tqdm

# Start timing the execution
start_time = time.time()

# Variables for storing raw and processed texts
raw_texts = []
processed_texts = []

# Read lines from the hygiene data file
with open(hygiene_data_path) as file:
    raw_texts = file.readlines()

# Preprocess each text in raw_texts and append the results to processed_texts
for text in tqdm(raw_texts):
    stemmed_result = preprocess_text(text)
    processed_texts.append(stemmed_result)

# Combine processed tokens into single strings for each text
final_processed_texts = [" ".join(text) for text in processed_texts]

# End timing the execution
end_time = time.time()

# Print the execution time
print(f"Execution time: {end_time - start_time} seconds")



100%|████████████████████████████████████████████████████████████████████████████| 13299/13299 [02:33<00:00, 86.61it/s]


Execution time: 158.80067038536072 seconds


In [165]:
import pandas as pd

N = 546

# Read labels from the hygiene labels file
with open(hygiene_labels_path, 'r') as file:
    label_list = [line.rstrip() for line in file]

# Creating a DataFrame from the processed texts and labels
data_frame = pd.DataFrame({
    "label": label_list,
    "text": raw_texts, 
    "preprocessed_texts": final_processed_texts,
    "tokenized_texts": processed_texts
})

# Reading additional hygiene data
additional_data = pd.read_csv(
    hygiene_extra_path,  
    names=["cuisines_offered", "zipcode", "num_reviews", "avg_rating"],
    dtype={
        "cuisines_offered": str, 
        "zipcode": str,
        "num_reviews": str
    }
)

# Joining the additional data with the main DataFrame
data_frame = data_frame.join(additional_data)

# Rounding and converting the average rating to an integer string
data_frame['avg_rating'] = data_frame['avg_rating'].apply(lambda x: str(int(round(float(x), 0))))

# Displaying information about the DataFrame and the first few rows
print(data_frame.info())
display(data_frame.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13299 entries, 0 to 13298
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   label               13299 non-null  object
 1   text                13299 non-null  object
 2   preprocessed_texts  13299 non-null  object
 3   tokenized_texts     13299 non-null  object
 4   cuisines_offered    13299 non-null  object
 5   zipcode             13299 non-null  object
 6   num_reviews         13299 non-null  object
 7   avg_rating          13299 non-null  object
dtypes: object(8)
memory usage: 831.3+ KB
None


Unnamed: 0,label,text,preprocessed_texts,tokenized_texts,cuisines_offered,zipcode,num_reviews,avg_rating
0,1,"The baguettes and rolls are excellent, and alt...",baguett roll excel haven tri excit dozen plu t...,"[baguett, roll, excel, haven, tri, excit, doze...","['Vietnamese', 'Sandwiches', 'Restaurants']",98118,4,4
1,1,I live up the street from Betty. &#160;When my...,live street betti 160 sister town spring break...,"[live, street, betti, 160, sister, town, sprin...","['American (New)', 'Restaurants']",98109,21,4
2,1,I'm worried about how I will review this place...,worri review place strongli think bad night pl...,"[worri, review, place, strongli, think, bad, n...","['Mexican', 'Restaurants']",98103,14,3
3,0,Why can't you access them on Google street vie...,access googl street view like medina yarrow po...,"[access, googl, street, view, like, medina, ya...","['Mexican', 'Tex-Mex', 'Restaurants']",98112,42,4
4,0,Things to like about this place: homemade guac...,thing like place homemad guacamol varieti tast...,"[thing, like, place, homemad, guacamol, variet...","['Mexican', 'Restaurants']",98102,12,3


In [168]:

import time

start_time = time.time()

train_df = df[df["label"] != "[None]"]
test_df = df[df["label"] == "[None]"]

additional_feats = ["cuisines_offered", "zipcode", "num_reviews", "avg_rating"]

train = train_df[["text"] + additional_feats]
train_preprocessed = train_df[["preprocessed_texts"] + additional_feats]
train_tokenized = train_df[["tokenized_texts"] + additional_feats]
train_labels = train_df["label"].astype(int)  # needed by sklearn

test = test_df[["text"] + additional_feats]
test_preprocessed = test_df[["preprocessed_texts"] + additional_feats]
test_tokenized = test_df[["tokenized_texts"] + additional_feats]
test_labels = test_df["label"]

print(train.shape, train_preprocessed.shape, train_tokenized.shape, train_labels.shape)
print(test.shape, test_preprocessed.shape, test_tokenized.shape, test_labels.shape)
print(train.dtypes, train_preprocessed.dtypes, train_tokenized.dtypes)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")




(546, 5) (546, 5) (546, 5) (546,)
(12753, 5) (12753, 5) (12753, 5) (12753,)
text                object
cuisines_offered    object
zipcode             object
num_reviews         object
avg_rating          object
dtype: object preprocessed_texts    object
cuisines_offered      object
zipcode               object
num_reviews           object
avg_rating            object
dtype: object tokenized_texts     object
cuisines_offered    object
zipcode             object
num_reviews         object
avg_rating          object
dtype: object
Execution time: 3.625620126724243 seconds


In [169]:
display(train.head())
display(train_preprocessed.head())
display(train_tokenized.head())

Unnamed: 0,text,cuisines_offered,zipcode,num_reviews,avg_rating
0,"The baguettes and rolls are excellent, and alt...","['Vietnamese', 'Sandwiches', 'Restaurants']",98118,4,4
1,I live up the street from Betty. &#160;When my...,"['American (New)', 'Restaurants']",98109,21,4
2,I'm worried about how I will review this place...,"['Mexican', 'Restaurants']",98103,14,3
3,Why can't you access them on Google street vie...,"['Mexican', 'Tex-Mex', 'Restaurants']",98112,42,4
4,Things to like about this place: homemade guac...,"['Mexican', 'Restaurants']",98102,12,3


Unnamed: 0,preprocessed_texts,cuisines_offered,zipcode,num_reviews,avg_rating
0,baguett roll excel haven tri excit dozen plu t...,"['Vietnamese', 'Sandwiches', 'Restaurants']",98118,4,4
1,live street betti 160 sister town spring break...,"['American (New)', 'Restaurants']",98109,21,4
2,worri review place strongli think bad night pl...,"['Mexican', 'Restaurants']",98103,14,3
3,access googl street view like medina yarrow po...,"['Mexican', 'Tex-Mex', 'Restaurants']",98112,42,4
4,thing like place homemad guacamol varieti tast...,"['Mexican', 'Restaurants']",98102,12,3


Unnamed: 0,tokenized_texts,cuisines_offered,zipcode,num_reviews,avg_rating
0,"[baguett, roll, excel, haven, tri, excit, doze...","['Vietnamese', 'Sandwiches', 'Restaurants']",98118,4,4
1,"[live, street, betti, 160, sister, town, sprin...","['American (New)', 'Restaurants']",98109,21,4
2,"[worri, review, place, strongli, think, bad, n...","['Mexican', 'Restaurants']",98103,14,3
3,"[access, googl, street, view, like, medina, ya...","['Mexican', 'Tex-Mex', 'Restaurants']",98112,42,4
4,"[thing, like, place, homemad, guacamol, variet...","['Mexican', 'Restaurants']",98102,12,3


In [170]:
#Model Experiment (Naive Bayers, SVM, Logistic Regression , Random Forest, XGBoost)

%%time
from sklearn import preprocessing

pipeline = Pipeline([
    ('preprocess', ColumnTransformer(
        [('cuisines_offered', CountVectorizer(min_df=10), 'cuisines_offered'),
         ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
         ('num_reviews', CountVectorizer(max_df=7, token_pattern='\d+'), 'num_reviews'),
         ('avg_rating', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['avg_rating']),
         ('text', TfidfVectorizer(
                    stop_words='english',
                    strip_accents='unicode',
                    min_df=3,
                    max_df=0.5,
                    ngram_range=(1, 3),
                    max_features=500), 'preprocessed_texts')],
        remainder='passthrough',
    )),
    ('clf', MultinomialNB())
], verbose=False)

# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
# scores = metrics.f1_score(y_test, y_pred)
scores = cross_val_score(pipeline, train_preprocessed, train_labels, cv=5, scoring= 'f1_macro')
print(scores)
print("Average F1-Score: %0.5f" % np.average(scores))

[0.62650104 0.72474747 0.67792317 0.68783693 0.60507246]
Average F1-Score: 0.66442
CPU times: total: 9.95 s
Wall time: 23.1 s


In [171]:

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import numpy as np

# Assuming train_preprocessed and train_labels are defined and contain your data

pipeline = Pipeline([
    ('preprocess', ColumnTransformer(
        transformers=[
            ('cuisines_offered', CountVectorizer(min_df=10), 'cuisines_offered'),
            ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
            ('num_reviews', CountVectorizer(max_df=7, token_pattern=r'\d+'), 'num_reviews'),
            ('avg_rating', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['avg_rating']),
            ('text', TfidfVectorizer(
                        stop_words='english',
                        strip_accents='unicode',
                        min_df=3,
                        max_df=0.5,
                        ngram_range=(1, 3),
                        max_features=500), 'preprocessed_texts')],
        remainder='passthrough'
    )),
    ('clf', MultinomialNB())
], verbose=False)

# Assuming train_preprocessed and train_labels are defined and contain your data
scores = cross_val_score(pipeline, train_preprocessed, train_labels, cv=5, scoring='f1_macro')
print("Cross-validation F1 scores:", scores)
print("Average F1-Score: %0.5f" % np.average(scores))





Cross-validation F1 scores: [0.62650104 0.72474747 0.67792317 0.68783693 0.60507246]
Average F1-Score: 0.66442


In [172]:
import time
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import numpy as np

# Define your data: Assuming 'train' is your feature matrix and 'train_labels' are your target labels

start_time = time.time()

# Define the pipeline
pipeline = Pipeline([
    ('preprocess', ColumnTransformer(
        transformers=[
            # CountVectorizer for 'cuisines_offered' column
            ('cuisines_offered', CountVectorizer(min_df=10), 'cuisines_offered'),
            
            # OneHotEncoder for 'zipcode' column
            ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
            
            # CountVectorizer for 'num_reviews' column, using digits as tokens
            ('num_reviews', CountVectorizer(max_df=7, token_pattern=r'\d+'), 'num_reviews'),
            
            # OneHotEncoder for 'avg_rating' column
            ('avg_rating', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['avg_rating']),
            
            # TfidfVectorizer for 'text' column
            ('text', TfidfVectorizer(
                        stop_words='english',
                        strip_accents='unicode',
                        min_df=3,
                        max_df=0.5,
                        ngram_range=(1, 3),
                        max_features=500), 'text')],
        remainder='passthrough'  # Pass through any remaining columns
    )),
    ('clf', MultinomialNB())  # Multinomial Naive Bayes classifier
], verbose=False)

# Cross-validation to evaluate the pipeline
scores = cross_val_score(pipeline, train, train_labels, cv=5, scoring='f1')
print("Cross-validation F1 scores:", scores)
print("Average F1-Score: %0.5f" % np.average(scores))

# Calculate and print the execution time
end_time = time.time()
execution_time = end_time - start_time
print("Execution time: %0.2f seconds" % execution_time)


Cross-validation F1 scores: [0.62857143 0.72222222 0.68965517 0.66037736 0.54347826]
Average F1-Score: 0.64886
Execution time: 23.31 seconds


In [173]:
# Print the count of unique values in 'num_reviews' column
print("Number of unique values in 'num_reviews':", len(df['num_reviews'].value_counts()))

# Display the value counts for 'num_reviews'
print("\nValue counts for 'num_reviews':")
print(df['num_reviews'].value_counts())

Number of unique values in 'num_reviews': 158

Value counts for 'num_reviews':
num_reviews
1      2193
2      1480
3      1126
4       934
5       767
6       675
7       600
8       484
9       458
10      403
11      352
12      315
13      268
14      246
16      200
15      199
17      185
18      160
19      131
20      125
21      124
22      124
23      107
24      101
25       97
28       86
26       81
27       77
29       68
30       60
32       52
33       51
37       44
36       44
34       43
31       43
35       42
39       39
44       37
38       33
43       30
47       28
42       27
40       26
46       25
45       25
41       23
54       22
59       20
55       15
63       15
49       15
52       15
51       14
62       14
48       14
73       13
57       12
61       11
50       11
56       11
58       10
53       10
66        9
78        9
67        8
89        8
83        8
60        8
65        7
64        6
93        6
69        6
70        6
77        6
76       

In [174]:
# Print the count of unique values in 'cuisines_offered' column
print("Number of unique values in 'cuisines_offered':", len(df['cuisines_offered'].value_counts()))

# Display the value counts for 'cuisines_offered'
print("\nValue counts for 'cuisines_offered':")
print(df['cuisines_offered'].value_counts())

Number of unique values in 'cuisines_offered': 388

Value counts for 'cuisines_offered':
cuisines_offered
['Thai', 'Restaurants']                                                          640
['American (New)', 'Restaurants']                                                596
['American (Traditional)', 'Restaurants']                                        589
['Mexican', 'Restaurants']                                                       572
['Pizza', 'Restaurants']                                                         524
['Vietnamese', 'Restaurants']                                                    465
['Japanese', 'Sushi Bars', 'Restaurants']                                        459
['Sandwiches', 'Restaurants']                                                    430
['Chinese', 'Restaurants']                                                       394
['Italian', 'Pizza', 'Restaurants']                                              327
['Japanese', 'Restaurants']                 

In [175]:
# Print the count of unique values in 'avg_rating' column
print("Number of unique values in 'avg_rating':", len(df['avg_rating'].value_counts()))

# Print the count of unique values in 'zipcode' column
print("Number of unique values in 'zipcode':", len(df['zipcode'].value_counts()))

Number of unique values in 'avg_rating': 5
Number of unique values in 'zipcode': 30


In [176]:
# Start measuring execution time
start_time = time.time()

# Define the pipeline
pipeline = Pipeline([
    ('preprocess', ColumnTransformer(
        transformers=[
            ('cuisines_offered', CountVectorizer(), 'cuisines_offered'),
            ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
            ('num_reviews', CountVectorizer(token_pattern=r'\d+'), 'num_reviews'),
            ('avg_rating', CountVectorizer(token_pattern=r'\d+'), 'avg_rating'),
            ('text', TfidfVectorizer(
                        stop_words='english',
                        strip_accents='unicode',
                        min_df=3,
                        max_df=0.5,
                        ngram_range=(1, 3),
                        max_features=500), 'preprocessed_texts')],
        remainder='passthrough'
    )),
    ('clf', MultinomialNB())
], verbose=False)

# Assuming train_preprocessed and train_labels are defined and contain your data
scores = cross_val_score(pipeline, train_preprocessed, train_labels, cv=5, scoring='f1')
print("Cross-validation F1 scores:", scores)
print("Average F1-Score: %0.5f" % np.average(scores))

# Calculate and print the execution time
end_time = time.time()
execution_time = end_time - start_time
print("Execution time: %0.2f seconds" % execution_time)

Cross-validation F1 scores: [0.63551402 0.7027027  0.65486726 0.63636364 0.52272727]
Average F1-Score: 0.63043
Execution time: 17.63 seconds


In [177]:
# Start measuring execution time
start_time = time.time()

# Define the pipeline
pipeline = Pipeline([
    ('union', ColumnTransformer(
        transformers=[
            ('cuisines_offered', CountVectorizer(), 'cuisines_offered'),
            ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
            ('num_reviews', CountVectorizer(token_pattern=r'\d+'), 'num_reviews'),
            ('avg_rating', CountVectorizer(token_pattern=r'\d+'), 'avg_rating'),
            ('text', TfidfVectorizer(
                        stop_words='english',
                        strip_accents='unicode',
                        min_df=15,
                        max_df=0.5,
                        ngram_range=(1, 3),
                        max_features=500), 'text')],
        remainder='passthrough'
    )),
    ('clf', MultinomialNB())
], verbose=False)

# Assuming train and train_labels are defined and contain your data
scores = cross_val_score(pipeline, train, train_labels, cv=5, scoring='f1')
print("Cross-validation F1 scores:", scores)
print("Average F1-Score: %0.5f" % np.average(scores))

# Calculate and print the execution time
end_time = time.time()
execution_time = end_time - start_time
print("Execution time: %0.2f seconds" % execution_time)

Cross-validation F1 scores: [0.63551402 0.69090909 0.64285714 0.64220183 0.50574713]
Average F1-Score: 0.62345
Execution time: 21.36 seconds


In [178]:
#Create function for testing

%%time
def test_classifier(clf, X, y, vectorizer, text_col='text'):
    pipeline = Pipeline([
        ('union', ColumnTransformer(
        [('cuisines_offered', CountVectorizer(min_df=10), 'cuisines_offered'),
         ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
         ('num_reviews', CountVectorizer(max_df=7, token_pattern='\d+'), 'num_reviews'),
         ('avg_rating', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['avg_rating']),
         ('text', vectorizer, text_col)],
        remainder='passthrough',
    )),
        ('clf', clf)
    ], verbose=False)
    scores = cross_val_score(pipeline, X, y, cv=5, scoring= 'f1_macro')
    print(clf)
    print(scores)
    cv_score = np.average(scores)
    return cv_score


CPU times: total: 0 ns
Wall time: 0 ns


In [179]:
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': svm.SVC(),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=SEED, n_estimators=500, n_jobs=-1),
    #'Gradient Boosting': GradientBoostingClassifier()
    'XGBoost': XGBClassifier(n_estimators=500, 
                            max_depth=5, 
                            learning_rate=0.2, 
                            objective='binary:logistic',
                            scale_pos_weight=2,
                            n_jobs=-1,
                            random_state=SEED)
}

tfidf = TfidfVectorizer(
                    stop_words='english',
                    strip_accents='unicode',
                    min_df=3,
                    max_df=0.5,
                    ngram_range=(1, 3),
                    max_features=500)

bow = CountVectorizer(
    stop_words='english',  # Use 'english' for built-in English stop words
    strip_accents='unicode',
    min_df=15,
    max_df=0.5,
    ngram_range=(1, 3)
)


In [None]:
#BOW - No Preprocessing

In [180]:


import time

# Initialize and fit the CountVectorizer (bow) outside the timing block
bow.fit(train['text'])

for clf_name, clf in classifiers.items():
    start_time = time.time()
    cv_score = test_classifier(clf, train, train_labels, vectorizer=bow, text_col='text')
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('{}: {}, Time: {:.2f} seconds'.format(clf_name, cv_score, elapsed_time))



MultinomialNB()
[0.60750145 0.74309764 0.67879094 0.62385321 0.60066457]
Naive Bayes: 0.6507815623584683, Time: 12.43 seconds
SVC()
[0.58402955 0.58626639 0.58209082 0.61330795 0.54821774]
Support Vector Machine: 0.5827824909909105, Time: 17.92 seconds
LogisticRegression()
[0.57184265 0.56749683 0.5962963  0.61438679 0.65485665]
Logistic Regression: 0.6009758453579122, Time: 13.54 seconds
RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=26)
[0.67860422 0.57189208 0.52813853 0.64026403 0.61464646]
Random Forest: 0.6067090639498269, Time: 29.16 seconds
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
      

In [181]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import time

# Sample classifiers
classifier1 = LogisticRegression()
classifier2 = RandomForestClassifier()
classifier3 = SVC()

# Fit the TF-IDF vectorizer on the training data
tfidf = TfidfVectorizer()
tfidf.fit(train_preprocessed['preprocessed_texts'])

# Define your classifiers
classifiers = {
    'Logistic Regression': classifier1,
    'Random Forest': classifier2,
    'Support Vector Machine': classifier3,
    # Add your other classifiers here
}

# Function to test classifiers
def test_classifier(clf, X, y, vectorizer, text_col):
    X_vec = vectorizer.transform(X[text_col])
    scores = cross_val_score(clf, X_vec, y, cv=5)
    return scores.mean()

# Iterate over classifiers and measure execution time
for clf_name, clf in classifiers.items():
    start_time = time.time()
    cv_score = test_classifier(clf, train_preprocessed, train_labels, vectorizer=tfidf, text_col='preprocessed_texts')
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('{}: {}, Time: {:.2f} seconds'.format(clf_name, cv_score, elapsed_time))





Logistic Regression: 0.6466555462885738, Time: 0.81 seconds
Random Forest: 0.6355129274395329, Time: 7.18 seconds
Support Vector Machine: 0.6502085070892412, Time: 6.18 seconds


In [182]:
%%time
for clf_name, clf in classifiers.items():
    cv_score = test_classifier(clf, train, train_labels, 
                               vectorizer=tfidf, text_col='text')
    print('{}: {}'.format(clf_name, cv_score))


Logistic Regression: 0.5642035029190993
Random Forest: 0.6099249374478732
Support Vector Machine: 0.5421017514595496
CPU times: total: 4.48 s
Wall time: 14.2 s


In [None]:
#TFIDF - No Processing

In [194]:
%%time
for clf_name, clf in classifiers.items():
    cv_score = test_classifier(clf, train_preprocessed, train_labels, 
                               vectorizer=tfidf, text_col='preprocessed_texts')
    print('{}: {}'.format(clf_name, cv_score))

Logistic Regression: 0.6466555462885738
Random Forest: 0.6207673060884071
Support Vector Machine: 0.6502085070892412
CPU times: total: 7.45 s
Wall time: 15.1 s
