In [77]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import re

from utils.system import *

In [78]:
def get_top_ngrams(corpus, n=None, ngram_range=(2, 2)):
    """Return the most frequent n-grams in ``corpus`` with their total counts.

    Tokenisation and English stop-word removal are delegated to
    ``CountVectorizer`` with its default settings.

    Args:
        corpus: Iterable of text documents (e.g. a pandas Series of strings).
        n: Number of n-grams to return. ``None`` returns every n-gram found.
        ngram_range: ``(min_n, max_n)`` passed through to ``CountVectorizer``.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by count, highest first.
        Counts are plain Python ``int`` values. N-grams with equal counts keep
        the iteration order of the fitted ``vocabulary_`` mapping.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words='english')

    # Learn the vocabulary and count in a single pass; a separate fit() and
    # transform() would tokenise the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)

    # Column sums give the corpus-wide frequency of each n-gram (one row).
    sum_words = bag_of_words.sum(axis=0)

    # Pair every n-gram with its total; int() unwraps the NumPy scalar so the
    # result prints and serialises as an ordinary integer.
    words_freq = [(word, int(sum_words[0, idx])) for word, idx in vec.vocabulary_.items()]

    # Stable sort, most frequent first.
    words_freq.sort(key=lambda item: item[1], reverse=True)

    return words_freq[:n]

In [79]:
# Read the cleaned dataset from the directory returned by get_data().
clean_data_path = get_data() / 'clean_data.parquet.brotli'
data = pd.read_parquet(clean_data_path)

In [80]:
# Keep only the rows whose overall_label equals 1
# (presumably the "lonely" class - confirm against the labelling scheme).
is_label_one = data['overall_label'] == 1
lonely = data.loc[is_label_one]

In [81]:
# Top 20 unigrams, bigrams and trigrams of the label-1 articles, computed in that order.
lonely_articles = lonely['cleaned_article']
top_unigrams, top_bigrams, top_trigrams = (
    get_top_ngrams(lonely_articles, n=20, ngram_range=(size, size))
    for size in (1, 2, 3)
)

In [71]:
# Show the (bigram, count) pairs extracted above.
bigram_heading = "Top 20 Bigrams: "
print(bigram_heading, top_bigrams)

Top 20 Bigrams:  [('feel like', 1143), ('don know', 867), ('just want', 456), ('don want', 399), ('feel lonely', 292), ('just feel', 245), ('make friends', 240), ('feels like', 198), ('best friend', 196), ('don think', 196), ('high school', 195), ('don really', 189), ('just don', 181), ('don feel', 157), ('want talk', 147), ('feeling lonely', 135), ('makes feel', 129), ('friends don', 128), ('like just', 123), ('just wish', 115)]


In [72]:
# Show the (trigram, count) pairs extracted above.
trigram_heading = "Top 20 Trigrams: "
print(trigram_heading, top_trigrams)

Top 20 Trigrams:  [('just feel like', 65), ('don know anymore', 63), ('just don know', 62), ('don feel like', 56), ('feel like just', 51), ('make new friends', 43), ('don really know', 42), ('feel like don', 41), ('feel like ve', 40), ('meet new people', 37), ('just want talk', 36), ('don know just', 33), ('don know feel', 31), ('just want feel', 29), ('just feel lonely', 29), ('makes feel like', 27), ('friends feel like', 26), ('play video games', 26), ('feel like going', 25), ('just feels like', 25)]


In [73]:
# Build per-article count features for the top bigrams and trigrams.
#
# top_bigrams / top_trigrams arrive as (ngram, count) pairs from get_top_ngrams.
# Entries that are already plain strings are kept as-is so that re-running this
# cell is harmless: blindly indexing [0] a second time would reduce every n-gram
# to its first character and silently leave the feature matrix without columns.
top_bigrams = [entry if isinstance(entry, str) else entry[0] for entry in top_bigrams]
top_trigrams = [entry if isinstance(entry, str) else entry[0] for entry in top_trigrams]

# Combine bigrams and trigrams
top_ngrams = top_bigrams + top_trigrams

# Set copy for constant-time membership tests; the vocabulary scanned below
# holds every 2- and 3-gram in the dataset, so a list lookup per feature is slow.
top_ngram_set = set(top_ngrams)

# Create a new vectorizer that can capture both bigrams and trigrams
# Note: the vocabulary is not restricted here; columns are filtered afterwards
vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english')

# Fit on, and count n-grams for, every article (not only the label-1 subset)
X = vectorizer.fit_transform(data['cleaned_article'])

# Keep only the columns corresponding to the top n-grams, in vocabulary order
filtered_columns = [col for col in vectorizer.get_feature_names_out() if col in top_ngram_set]
filtered_X = X[:, [vectorizer.vocabulary_[col] for col in filtered_columns]]

# Create a DataFrame from the filtered feature matrix, aligned to the source rows
ngram_features = pd.DataFrame(filtered_X.toarray(), columns=filtered_columns, index=data.index)

# Replace the n-gram text with positional names: n_gram_{i+1} is filtered_columns[i]
new_column_names = [f'n_gram_{i+1}' for i in range(ngram_features.shape[1])]
ngram_features.columns = new_column_names

In [74]:
# Write the n-gram count features as a brotli-compressed parquet file
# in the directory returned by get_data().
ngram_output_path = get_data() / 'n_gram.parquet.brotli'
ngram_features.to_parquet(ngram_output_path, compression='brotli')