In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
import os

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))
from lib.preprocessing_data import Preprocessing
from lib.data_preparation import DataPreparation
from sklearn.model_selection import train_test_split
 
import pandas as pd

In [32]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_heatmap(df, title, row_frac=0.005, col_frac=0.02, random_state=None):
    """Draw a heatmap of a random sub-sample of a document/feature matrix.

    The frame is sampled along both axes before plotting so that a large
    matrix stays readable.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix to visualise; rows are labelled "Documents" and columns
        "Features (Words)" on the plot.
    title : str
        Title placed above the heatmap.
    row_frac : float, default 0.005
        Fraction of rows to sample.
    col_frac : float, default 0.02
        Fraction of columns to sample.
    random_state : int, numpy Generator/RandomState or None, default None
        Forwarded to ``DataFrame.sample`` for both axes. Pass a fixed value to
        get the same figure on every run; ``None`` keeps the sampling random.

    Raises
    ------
    ValueError
        If ``df`` has no rows or no columns.
    """
    n_rows, n_cols = df.shape
    if n_rows == 0 or n_cols == 0:
        raise ValueError(
            "plot_heatmap needs a DataFrame with at least one row and one column"
        )

    # DataFrame.sample(frac=...) draws round(frac * length) items, which is 0
    # for small frames and leaves nothing to plot. Use the same rounding but
    # keep at least one row/column, and never ask for more than exist.
    rows_to_draw = min(n_rows, max(1, round(row_frac * n_rows)))
    cols_to_draw = min(n_cols, max(1, round(col_frac * n_cols)))

    sampled_df = df.sample(
        n=rows_to_draw, axis=0, random_state=random_state
    ).sample(
        n=cols_to_draw, axis=1, random_state=random_state
    )

    plt.figure(figsize=(15, 8), facecolor='lightgray')
    sns.heatmap(sampled_df, cmap="YlGnBu", annot=False, cbar=True)
    plt.title(title)
    plt.xlabel("Features (Words)")
    plt.ylabel("Documents")
    plt.show()

In [33]:
data_set = DataPreparation().load_data()

# Display name -> preprocessing callable. The name becomes part of the output
# file name; each callable is applied to the full data set and its result is
# expected to expose a 'processed_tweets' column holding an iterable of string
# tokens per row (it is joined with spaces below).
preprocessing_methods = {
    "lemmatization": Preprocessing().lemmatization,
    "tokenization": Preprocessing().tokenization,
    "stemming": Preprocessing().stemming,
    "stemming + misspelling": Preprocessing().stemming_with_misspelling,
    "lemmatization + misspelling": Preprocessing().lemmatization_with_misspelling,
    # "stop-words removal": Preprocessing().remove_stop_words,
}

# Display name -> vectorizer. Keys are used verbatim in output file names, so
# they are deliberately left unchanged.
vectorizers = {
    "TF-IDF": TfidfVectorizer(),
    "Bag of words": CountVectorizer(binary=False),
    "o or 1 if word exists": CountVectorizer(binary=True),
}

# Stratified 80/20 hold-out split of the raw data.
# NOTE(review): the loop below fits every vectorizer on the full data set and
# does not use this split; it is kept (and no longer overwritten) in case
# other cells rely on train_df / test_df - confirm whether the vectorizers
# should be fitted on the training part only.
train_df, test_df = train_test_split(
    data_set, test_size=0.2, random_state=1, stratify=data_set['label']
)

OUTPUT_DIR = '../preprocessed_data'
# to_csv does not create missing directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The loop variable is named `preprocess` so it does not shadow the
# `preprocessing_methods` dict; the cell can therefore be re-run safely.
for preprocessing_name, preprocess in preprocessing_methods.items():
    # Preprocessing does not depend on the vectorizer, so it runs once per
    # method instead of once per (method, vectorizer) pair.
    # NOTE(review): assumes the preprocessing call gives the same result when
    # repeated on data_set - confirm in lib.preprocessing_data.
    data_set_processed = preprocess(data_set)
    processed_tweets = data_set_processed['processed_tweets'].apply(' '.join)

    for vectorizer_name, vectorizer in vectorizers.items():
        # fit_transform already returns the transformed matrix; a second
        # transform() over the same data is not needed.
        feature_matrix = vectorizer.fit_transform(processed_tweets)

        # One column per vocabulary term, one row per tweet. toarray() gives a
        # plain ndarray rather than the legacy np.matrix from todense().
        features_df = pd.DataFrame(
            feature_matrix.toarray(),
            columns=vectorizer.get_feature_names_out(),
            index=processed_tweets.index,
        )
        features_df.to_csv(
            f'{OUTPUT_DIR}/{preprocessing_name}_{vectorizer_name}.csv',
            index=False,
        )

KeyboardInterrupt: 