In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin
import joblib
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join("..", "src")))
from scaling import *
from feature_selectors import *
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


We will begin our model testing by splitting data into train and test parts.

In [29]:
df=pd.read_csv('../data/train.csv')
processing_pipeline = joblib.load('../pipelines/text_pipeline.joblib')

In [30]:
y = df['target']
X = df.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [31]:
#text preprocessing and new column adding pipeline
processing_pipeline.fit_transform(X_train)
processing_pipeline.transform(X_test)

Unnamed: 0,id,keyword,location,text,count_caps_lock,count_exclamation_mark,count_hashtags,count_words,count_punctuation,count_links,...,joy,positive,negative,disgust,anger,surprise,fear,processed_text,processed_text_str,mention_god_related
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,0,0,0,11,2,0,...,0,0,1,0,1,0,0,"[new, weapon, cause, unimaginable, destruction]",new weapon cause unimaginable destruction,0
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,2,0,1,21,10,0,...,0,0,1,0,0,1,1,"[famp, thing, gishwhes, get, soak, deluge, go,...",famp thing gishwhes get soak deluge go pad tam...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,2,0,0,15,13,1,...,0,1,0,0,0,1,1,"[dt, rt, ûïthe, col, police, catch, pickpocket...",dt rt ûïthe col police catch pickpocket liverp...,0
132,191,aftershock,,Aftershock back to school kick off was great. ...,1,0,0,21,3,0,...,0,0,1,0,1,0,0,"[aftershock, back, school, kick, great, want, ...",aftershock back school kick great want thank e...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0,0,0,17,3,0,...,0,1,1,0,0,0,1,"[response, trauma, child, addict, develop, def...",response trauma child addict develop defensive...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1835,2640,crashed,Somewhere,@SmusX16475 Skype just crashed u host,0,0,0,6,1,0,...,1,1,1,1,1,1,1,"[skype, crash, u, host]",skype crash u host,0
506,731,attacked,Arundel,Christian Attacked by Muslims at the Temple Mo...,0,0,0,18,9,1,...,1,1,1,1,1,1,1,"[christian, attack, muslim, temple, mount, wav...",christian attack muslim temple mount wave isra...,0
3592,5131,fatal,"New South Wales, Australia",Man charged over fatal crash near Dubbo refuse...,0,0,0,12,6,1,...,0,0,1,0,0,0,0,"[man, charge, fatal, crash, near, dubbo, refus...",man charge fatal crash near dubbo refuse bail via,0
6740,9657,thunderstorm,,#usNWSgov Severe Weather Statement issued Augu...,7,0,1,18,14,1,...,1,1,1,1,1,1,1,"[usnwsgov, severe, weather, statement, issue, ...",usnwsgov severe weather statement issue august...,0


# MODEL TESTING

We will test these models and vectorizers:
- Models:
    - Random Forest
    - SVM
    - Logistic Regression
    - Multinomial NB
- Vectorizers:
    - Count Vectorizer
    - CBow
    - Tfidf Vectorizer
    - Skipgram Vectorizer


In [32]:
class GensimVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model_type='cbow', size=100, window=5, min_count=1, workers=4):
        self.model_type = model_type
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers

    def fit(self, X, y=None):
        model_type = 0 if self.model_type == 'cbow' else 1
        self.model = Word2Vec(X, vector_size=self.size, window=self.window, min_count=self.min_count, workers=self.workers, sg=model_type)
        return self

    def transform(self, X):
        return np.array([np.mean([self.model.wv[word] for word in sentence if word in self.model.wv] or [np.zeros(self.size)], axis=0) for sentence in X])

In [33]:
columns_to_remove = ['keyword', 'location', 'processed_text', 'text', 'emotions']
X_train = X_train.drop(columns=columns_to_remove)
X_test = X_test.drop(columns=columns_to_remove)

In [34]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1523 entries, 2644 to 1634
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      1523 non-null   int64  
 1   count_caps_lock         1523 non-null   int64  
 2   count_exclamation_mark  1523 non-null   int64  
 3   count_hashtags          1523 non-null   int64  
 4   count_words             1523 non-null   int64  
 5   count_punctuation       1523 non-null   int64  
 6   count_links             1523 non-null   int64  
 7   count_stopwords         1523 non-null   int64  
 8   count_mentions          1523 non-null   int64  
 9   count_verbs             1523 non-null   int64  
 10  count_nouns             1523 non-null   int64  
 11  count_adjectives        1523 non-null   int64  
 12  count_adverbs           1523 non-null   int64  
 13  polarity                1523 non-null   float64
 14  subjectivity            1523 non-null   fl

In [35]:
#defining vectorizers and models we will test
vectorizers = {
    'Tfidf': TfidfVectorizer(),
    'Count': CountVectorizer(),
    'Skipgram': GensimVectorizer(model_type='skipgram'),
    'CBow': GensimVectorizer(model_type='cbow')
}

# Define models
models = {
    'MultinomialNB': MultinomialNB(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression()
    
}

# Define parameter grids for models
param_grid = {
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    'Logistic Regression': {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    },
    'MultinomialNB': {
        'alpha': [0.01, 0.1, 1, 10, 100]
    }
}

# Vectorizer-specific parameters
vectorizer_params = {
    'max_features': [1000, 2000, 3000],
    'ngram_range': [(1, 1), (1, 2)]
}

In [36]:
results = []

# Assume X_train, X_test, y_train, y_test are already defined and preprocessed
for vec_name, vectorizer in vectorizers.items():
    if vec_name in ['Tfidf', 'Count']:
        for max_feat in vectorizer_params['max_features']:
            for ngram in vectorizer_params['ngram_range']:
                print(f"Running with {vec_name} vectorizer (max_features={max_feat}, ngram_range={ngram})")
                
                vectorizer.set_params(max_features=max_feat, ngram_range=ngram)
                X_train_transformed = vectorizer.fit_transform(X_train['processed_text_str'])
                X_test_transformed = vectorizer.transform(X_test['processed_text_str'])
                
                for model_name, model in models.items():
                    #print(f"Running Randomized Search for {model_name}...")
                    
                    params = param_grid[model_name]
                    
                    # Define the Randomized Search parameter distributions
                    param_dist = {key: [value] if isinstance(value, int) else value for key, value in params.items()}
                    
                    # Perform Randomized Search with reduced number of iterations and folds
                    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5, cv=3, scoring='f1', n_jobs=-1)
                    random_search.fit(X_train_transformed, y_train)
                    
                    # Get the best estimator
                    best_estimator = random_search.best_estimator_
                    
                    # Predict using the best estimator
                    #print("Predicting using the best estimator...")
                    predictions = best_estimator.predict(X_test_transformed)
                    
                    # Generate classification report
                    #print("Generating classification report...")
                    report = classification_report(y_test, predictions, output_dict=True)
                    
                    # Customizing the report format or content
                    custom_report = {
                        'Vectorizer': vec_name,
                        'Model': model_name,
                        'Max Features': max_feat,
                        'N-gram Range': ngram,
                        'Accuracy': report['accuracy'],
                        'Precision': report['weighted avg']['precision'],
                        'Recall': report['weighted avg']['recall'],
                        'F1-Score': report['weighted avg']['f1-score']
                    }
                    
                    results.append(custom_report)
                    #print("Completed.")
    else:
        print(f"Running with {vec_name} vectorizer...")
        X_train_transformed = vectorizer.fit_transform(X_train['processed_text_str'])
        X_test_transformed = vectorizer.transform(X_test['processed_text_str'])
        
        for model_name, model in models.items():
            if model_name == 'MultinomialNB':
                print(f"Skipping {model_name} with {vec_name} due to negative value constraints.")
                continue
            
            #print(f"Running Randomized Search for {model_name}...")
            
            params = param_grid[model_name]
            
            # Define the Randomized Search parameter distributions
            param_dist = {key: [value] if isinstance(value, int) else value for key, value in params.items()}
            
            # Perform Randomized Search with reduced number of iterations and folds
            random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5, cv=3, scoring='f1', n_jobs=-1)
            random_search.fit(X_train_transformed, y_train)
            
            # Get the best estimator
            best_estimator = random_search.best_estimator_
            print("Best parameters found: ", random_search.best_params_)
            
            # Predict using the best estimator
            #print("Predicting using the best estimator...")
            predictions = best_estimator.predict(X_test_transformed)
            
            # Generate classification report
            #print("Generating classification report...")
            report = classification_report(y_test, predictions, output_dict=True)
            
            # Customizing the report format or content
            custom_report = {
                'Vectorizer': vec_name,
                'Model': model_name,
                'Accuracy': report['accuracy'],
                'Precision': report['weighted avg']['precision'],
                'Recall': report['weighted avg']['recall'],
                'F1-Score': report['weighted avg']['f1-score']
            }
            
            results.append(custom_report)
            #print("Completed.")

Running with Tfidf vectorizer (max_features=1000, ngram_range=(1, 1))
Running with Tfidf vectorizer (max_features=1000, ngram_range=(1, 2))
Running with Tfidf vectorizer (max_features=2000, ngram_range=(1, 1))
Running with Tfidf vectorizer (max_features=2000, ngram_range=(1, 2))
Running with Tfidf vectorizer (max_features=3000, ngram_range=(1, 1))
Running with Tfidf vectorizer (max_features=3000, ngram_range=(1, 2))
Running with Count vectorizer (max_features=1000, ngram_range=(1, 1))
Running with Count vectorizer (max_features=1000, ngram_range=(1, 2))
Running with Count vectorizer (max_features=2000, ngram_range=(1, 1))
Running with Count vectorizer (max_features=2000, ngram_range=(1, 2))
Running with Count vectorizer (max_features=3000, ngram_range=(1, 1))
Running with Count vectorizer (max_features=3000, ngram_range=(1, 2))
Running with Skipgram vectorizer...
Skipping MultinomialNB with Skipgram due to negative value constraints.
Best parameters found:  {'n_estimators': 100, 'min_s

In [37]:
results_df1 = pd.DataFrame(results)
# import ace_tools as tools;tools.display_dataframe_to_user(name="Model Testing Results", dataframe=results_df)

sorted_df1 = results_df1.sort_values(by='F1-Score', ascending=False)

In [38]:
sorted_df1.head(10)

Unnamed: 0,Vectorizer,Model,Max Features,N-gram Range,Accuracy,Precision,Recall,F1-Score
43,Count,Logistic Regression,3000.0,"(1, 1)",0.801051,0.805342,0.801051,0.796819
42,Count,SVM,3000.0,"(1, 1)",0.800394,0.803895,0.800394,0.796463
40,Count,MultinomialNB,3000.0,"(1, 1)",0.797111,0.796543,0.797111,0.795593
14,Tfidf,SVM,2000.0,"(1, 2)",0.799081,0.801739,0.799081,0.795474
32,Count,MultinomialNB,2000.0,"(1, 1)",0.796454,0.795815,0.796454,0.795025
11,Tfidf,Logistic Regression,2000.0,"(1, 1)",0.795798,0.796464,0.795798,0.793152
35,Count,Logistic Regression,2000.0,"(1, 1)",0.795141,0.795224,0.795141,0.792896
19,Tfidf,Logistic Regression,3000.0,"(1, 1)",0.794485,0.795106,0.794485,0.791822
16,Tfidf,MultinomialNB,3000.0,"(1, 1)",0.795141,0.796936,0.795141,0.791805
20,Tfidf,MultinomialNB,3000.0,"(1, 2)",0.794485,0.796173,0.794485,0.79118


Let's see if adding additional features will improve the performance of the best scoring model

In [39]:
#pipeline to add new selected features
pipeline1 = Pipeline([('first_feature_selection',FirstFeatureSelector())])
X_train_1 = pipeline1.fit_transform(X_train)
X_test_1 = pipeline1.fit_transform(X_test)

In [46]:
from scipy.sparse import hstack, csr_matrix
vectorizer = CountVectorizer(max_features=3000, ngram_range=(1,1))
X_text = vectorizer.fit_transform(X_train_1['processed_text_str'])
X_text_test = vectorizer.transform(X_test_1['processed_text_str']) # the same transformations for text df
print(X_train_1.columns)
columns = ['count_caps_lock', 'count_exclamation_mark','positive', 'polarity', 'count_links', 'count_mentions', 'count_nouns']

additional_features = csr_matrix(X_train_1[columns].values)
additional_features_test = csr_matrix(X_test_1[columns].values)

X_combined = hstack([X_text, additional_features])
X_test_combined =hstack([X_text_test, additional_features_test])

params = param_grid["SVM"]
param_dist = {key: [value] if isinstance(value, int) else value for key, value in params.items()}
random_search = RandomizedSearchCV(SVC(), param_distributions=param_dist, n_iter=5, cv=3, scoring='f1', n_jobs=-1)
random_search.fit(X_combined, y_train)
    
best_estimator = random_search.best_estimator_
print(random_search.best_params_)


Index(['processed_text_str', 'count_caps_lock', 'count_exclamation_mark',
       'positive', 'polarity', 'count_links', 'count_mentions', 'count_nouns'],
      dtype='object')
{'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}


In [47]:
from sklearn.metrics import f1_score
predictions = best_estimator.predict(X_test_combined)
print("f1 score:", f1_score(predictions,y_test))

f1 score: 0.7517006802721088


F1 score after adding new features slightly dropped. It can be caused by other Randomized Search parameters but also new columns can act like a 'noise' for our model. 

As the performance didn't improve it is better to **keep the model simple**, so we will not add those columns.