In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
import os

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))
from lib.preprocessing_data import Preprocessing
from lib.data_preparation import DataPreparation
from sklearn.model_selection import train_test_split
 
import pandas as pd

In [14]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_heatmap(df, title, row_frac=0.005, col_frac=0.02, random_state=None):
    """Draw a heatmap of a random sub-sample of ``df``.

    A fraction of the rows and then a fraction of the columns are sampled,
    and the resulting smaller frame is rendered with ``seaborn.heatmap``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric matrix to visualise. The axis labels assume rows are
        documents and columns are word features -- TODO confirm with callers.
    title : str
        Title shown above the figure.
    row_frac : float, default 0.005
        Fraction of rows to sample.
    col_frac : float, default 0.02
        Fraction of columns to sample.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for both the row and the column
        draw. ``None`` keeps the sampling non-deterministic.

    Raises
    ------
    ValueError
        If the sampled frame has no rows or no columns (the fractions are too
        small for the size of ``df``), since there is nothing to plot.
    """
    sampled_df = (
        df.sample(frac=row_frac, axis=0, random_state=random_state)
          .sample(frac=col_frac, axis=1, random_state=random_state)
    )

    # Fail early with an actionable message instead of an obscure error from
    # the plotting layer when the fractions round down to zero rows/columns.
    if sampled_df.empty:
        raise ValueError(
            f"Sampled frame is empty (input shape {df.shape}, "
            f"row_frac={row_frac}, col_frac={col_frac}); "
            "increase the fractions or pass a larger DataFrame."
        )

    # Explicit figure/axes instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(15, 8), facecolor='lightgray')
    sns.heatmap(sampled_df, cmap="YlGnBu", annot=False, cbar=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Features (Words)")
    ax.set_ylabel("Documents")
    plt.show()

In [15]:
def preprocess_and_vectorize_tweets(preprocessing_name, preprocessing_method, vectorizer_name, vectorizer, df, data_type):
    """Preprocess tweets, vectorize them and write the feature matrix to CSV.

    Parameters
    ----------
    preprocessing_name : str
        Label of the preprocessing variant; used only in the output file name.
    preprocessing_method : callable
        Called as ``preprocessing_method(df)``; its result must expose a
        ``'processed_tweets'`` column whose entries are iterables of string
        tokens (they are joined with a single space).
    vectorizer_name : str
        Label of the vectorizer; used only in the output file name.
    vectorizer : object
        Must provide ``fit_transform``, ``transform`` and
        ``get_feature_names_out``. It is fitted in place when
        ``data_type == 'train'``.
    df : pandas.DataFrame
        Input passed unchanged to ``preprocessing_method``.
    data_type : str
        ``'train'`` fits the vectorizer on the data before transforming; any
        other value only transforms, so the vectorizer must already be fitted.
        The value is also embedded in the output directory name
        (``prepocessed_{data_type}ing_data``).

    Returns
    -------
    None
        The result is written to
        ``../data/prepocessed_{data_type}ing_data/{preprocessing_name}_{vectorizer_name}.csv``
        without the index column.
    """
    data_set = preprocessing_method(df)

    # The vectorizer expects one string per document, so re-join the tokens.
    processed_tweets = data_set['processed_tweets'].apply(lambda row: ' '.join(row))

    # Vectorize exactly once and keep the result; previously the output of
    # fit_transform/transform was thrown away and the corpus transformed again.
    if data_type == 'train':
        transformed_output = vectorizer.fit_transform(processed_tweets)
    else:
        transformed_output = vectorizer.transform(processed_tweets)

    feature_names = vectorizer.get_feature_names_out()

    # toarray() returns a plain ndarray; todense() returns the legacy np.matrix.
    features_df = pd.DataFrame(
        transformed_output.toarray(),
        columns=feature_names,
        index=processed_tweets.index,
    )

    # NOTE(review): remove_duplicates is defined outside this file. If it drops
    # rows, the saved matrix no longer lines up row-for-row with the labels in
    # `df` (and the index is not written below) -- confirm this is intended,
    # especially for the test split.
    features_df = Preprocessing().remove_duplicates(features_df)

    features_df.to_csv(
        f'../data/prepocessed_{data_type}ing_data/{preprocessing_name}_{vectorizer_name}.csv',
        index=False
    )
    

In [16]:
# Load the complete labelled data set.
data_set = DataPreparation().load_data()

# Preprocessing variants to evaluate, keyed by the label used in output file names.
preprocessing_methods = {
    "tokenization": Preprocessing().tokenization,
    "lemmatization": Preprocessing().lemmatization,
    "stemming": Preprocessing().stemming,
    "stemming_with_misspelling": Preprocessing().stemming_with_misspelling,
    "lemmatization_with_misspelling": Preprocessing().lemmatization_with_misspelling,
    "lemmatization_with_stop-words_removal": Preprocessing().lemmatization_with_stopwords_removal,
}

# Text vectorizers to evaluate, keyed by the label used in output file names.
vectorizers = {
    "Bow": CountVectorizer(binary=False),
    "TF-IDF": TfidfVectorizer(),
    "BinaryVectorizer": CountVectorizer(binary=True),
}

# Stratified 80/20 split on the label column with a fixed seed.
train_df, test_df = train_test_split(
    data_set,
    test_size=0.2,
    random_state=1,
    stratify=data_set['label'],
)

# Persist the raw splits.
train_df.to_csv('../data/raw_splits/train.csv', index=False)
test_df.to_csv('../data/raw_splits/test.csv', index=False)

# Every (preprocessing, vectorizer) combination is run on the train split
# first -- which fits the vectorizer -- and then on the test split.
for preprocessing_name, preprocessing_method in preprocessing_methods.items():
    for vectorizer_name, vectorizer in vectorizers.items():
        for split_name, split_df in (('train', train_df), ('test', test_df)):
            preprocess_and_vectorize_tweets(
                preprocessing_name=preprocessing_name,
                preprocessing_method=preprocessing_method,
                vectorizer_name=vectorizer_name,
                vectorizer=vectorizer,
                df=split_df,
                data_type=split_name,
            )



KeyboardInterrupt: 