In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


We will begin our model testing by splitting data into train and test parts.

In [2]:
df=pd.read_csv('train_data_after_EDA.csv')

In [20]:
X=df['cleaned_tokens']
y=df['target']
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

# MODEL TESTING

We will test these models and vectorizers:
- Models:
    - Random Forest
    - SVM
    - Logistic Regression
    - Multinomial NB
- Vectorizers:
    - Count Vectorizer
    - CBow
    - Tfidf Vectorizer
    - Skipgram Vectorizer


In [21]:
class GensimVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model_type='cbow', size=100, window=5, min_count=1, workers=4):
        self.model_type = model_type
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers

    def fit(self, X, y=None):
        model_type = 0 if self.model_type == 'cbow' else 1
        self.model = Word2Vec(X, vector_size=self.size, window=self.window, min_count=self.min_count, workers=self.workers, sg=model_type)
        return self

    def transform(self, X):
        return np.array([np.mean([self.model.wv[word] for word in sentence if word in self.model.wv] or [np.zeros(self.size)], axis=0) for sentence in X])

In [22]:
# Data prepare
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Models
models = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    #'Multinomial NB': MultinomialNB()
}

In [27]:
# Vectorizers
vectorizers = {
    'TFidf': TfidfVectorizer(),
    'Count': CountVectorizer(),
    'Skipgram': GensimVectorizer(model_type='skipgram'),
    'CBow': GensimVectorizer(model_type='cbow')
}

In [28]:
results = []

for vec_name, vectorizer in vectorizers.items():
    for model_name, model in models.items():
        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', model)
        ])
        
        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict(X_test)
        
        report = classification_report(y_test, predictions, output_dict=True)
        results.append({
            'Vectorizer': vec_name,
            'Model': model_name,
            'Report': report
        })

In [32]:
results_df = pd.DataFrame(results)
#import ace_tools as tools; tools.display_dataframe_to_user(name="Model Testing Results", dataframe=results_df)
print(results_df)

   Vectorizer                Model  \
0       TFidf        Random Forest   
1       TFidf                  SVM   
2       TFidf  Logistic Regression   
3       Count        Random Forest   
4       Count                  SVM   
5       Count  Logistic Regression   
6    Skipgram        Random Forest   
7    Skipgram                  SVM   
8    Skipgram  Logistic Regression   
9        CBow        Random Forest   
10       CBow                  SVM   
11       CBow  Logistic Regression   

                                               Report  
0   {'0': {'precision': 0.782608695652174, 'recall...  
1   {'0': {'precision': 0.7783300198807157, 'recal...  
2   {'0': {'precision': 0.7866666666666666, 'recal...  
3   {'0': {'precision': 0.7790224032586558, 'recal...  
4   {'0': {'precision': 0.7810650887573964, 'recal...  
5   {'0': {'precision': 0.7927736450584485, 'recal...  
6   {'0': {'precision': 0.6929057337220602, 'recal...  
7   {'0': {'precision': 0.5923076923076923, 'recal...  
8