In [None]:
# required libraries
import pandas as pd
import matplotlib.pyplot as plt
import nltk 
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from nltk import download
# Fetch the NLTK resources this notebook relies on (tokenizer models and stopword lists).
nltk.download('punkt')
nltk.download('punkt_tab')   # recent NLTK releases load 'punkt_tab' (not 'punkt') inside word_tokenize
nltk.download('stopwords')   # required by the stopwords.words('english') / ('spanish') calls further down
from collections import Counter
from nltk.corpus import stopwords
from nltk import ngrams, word_tokenize


In [None]:
# Read the classified comments into the working DataFrame
df = pd.read_csv(filepath_or_buffer='classified_comments.csv')


In [None]:
# Stopword vocabulary used when filtering tokens before n-gram extraction.
# It is the union of four sources: NLTK English, NLTK Spanish, a hand-written
# Russian list, and a few corpus-specific filler words.
stop_words_en = set(stopwords.words('english'))
stop_words_es = set(stopwords.words('spanish'))

# Hand-maintained Russian stopwords
stop_words_ru = {
    'и','в','во','не','что','он','на','я','с','со','как','а','то',
    'все','она','так','его','но','да','ты','к','у','же','вы','за',
    'бы','по','ее','мне','было','вот','от','меня','еще','нет','о',
    'из','ему','теперь','когда','даже','ну','вдруг','ли','если',
    'уже','или','ни','быть','был','него','до','вас','нибудь',
    'опять','уж','вам','ведь','там','потом','себя','ничего','ей',
    'может','они','тут','где','есть','надо','ней','для','мы',
    'тебя','их','чем','была','сам','чтоб','без','будто','чего',
    'раз','тоже','себе','под','будет','ж','тогда','кто','этот',
    'того','потому','этого','какой','совсем','ним','здесь',
    'этом','один','почти','мой','тем','чтобы','нее','сейчас',
    'были','куда','зачем','всех','никогда','можно','при','наконец',
    'два','об','другой','хоть','после','над','больше','тот',
    'через','эти','нас','про','всего','них','какая','много',
    'разве','три','эту','моя','впрочем','хорошо','свою','этой',
    'перед','иногда','лучше','чуть','том','нельзя','такой',
    'им','более','всегда','конечно','всю','между',
}

# Words that are frequent in this dataset but carry no topical signal
custom_stopwords = {
    'game', 'video', 'youtube', 'comment', 'player', "yeah", "im", "na", "yet", "a", "one", "oh", "isnt", "didnt",
    'games', 'play', 'like', "u", "c", "jim", "stopvivekbindra",
}

# Single lookup set consulted by the token filter
stop_words = set.union(stop_words_en, stop_words_es, stop_words_ru, custom_stopwords)

# N-gram extraction for a single comment
def generate_ngrams(text, n):
    """Return the n-grams of one comment after lowercasing and stopword filtering.

    Parameters
    ----------
    text : str or missing value
        Raw comment text. Missing values (NaN / None) and blank strings yield
        an empty list. Non-string values are converted with ``str()`` first.
    n : int
        Size of the n-grams to build.

    Returns
    -------
    list of tuple of str
        Consecutive n-token tuples built from the tokens that are purely
        alphabetic and not present in ``stop_words``.
    """
    if pd.isna(text):
        return []  # Missing comment

    # A CSV column can contain non-string cells (e.g. numbers); coerce them so
    # the string methods below do not raise AttributeError.
    text = str(text)
    if not text.strip():
        return []  # Blank comment

    # Tokenize the lowercased text, then keep alphabetic non-stopword tokens only
    tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

    return list(ngrams(tokens, n))

# Store, for every comment, its list of 2-, 3- and 4-gram tuples
df['2-grams'] = df['comment'].apply(lambda comment: generate_ngrams(comment, 2))
df['3-grams'] = df['comment'].apply(lambda comment: generate_ngrams(comment, 3))
df['4-grams'] = df['comment'].apply(lambda comment: generate_ngrams(comment, 4))

# Flatten each list column to one n-gram per row, discard the NaN rows left by
# comments with no n-grams, and turn each tuple into a space-separated string
# so that it can be counted.
all_2grams = df['2-grams'].explode().dropna().apply(' '.join)
all_3grams = df['3-grams'].explode().dropna().apply(' '.join)
all_4grams = df['4-grams'].explode().dropna().apply(' '.join)

In [20]:
# Tally how often each n-gram string occurs (one Counter per n-gram size)
counter_2grams, counter_3grams, counter_4grams = (
    Counter(gram_series)
    for gram_series in (all_2grams, all_3grams, all_4grams)
)

In [21]:
# Tabulate the 100 most frequent n-grams of each size
df_2grams = pd.DataFrame(
    data=counter_2grams.most_common(100),
    columns=['2-gram', 'Frequency'],
)
df_3grams = pd.DataFrame(
    data=counter_3grams.most_common(100),
    columns=['3-gram', 'Frequency'],
)
df_4grams = pd.DataFrame(
    data=counter_4grams.most_common(100),
    columns=['4-gram', 'Frequency'],
)

# Write each frequency table to its own CSV file (row index omitted)
for top_frame, csv_path in (
    (df_2grams, 'top_2grams.csv'),
    (df_3grams, 'top_3grams.csv'),
    (df_4grams, 'top_4grams.csv'),
):
    top_frame.to_csv(csv_path, index=False)

In [None]:

import matplotlib.pyplot as plt

def make_table_image(df, title, filename):
    """Render a DataFrame as a table figure, save it to ``filename`` and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to draw; its values fill the cells and its column names are used
        as the header row.
    title : str
        Title placed above the table.
    filename : str
        Path the figure is written to.
    """
    # Figure height grows with the number of rows (0.5 inch per row)
    fig, ax = plt.subplots(figsize=(6, len(df) * 0.5))
    try:
        ax.axis('off')  # Only the table should be visible, not the axes frame
        table = ax.table(
            cellText=df.values,
            colLabels=df.columns,
            cellLoc='center',
            loc='center'
        )
        # Fix the font size explicitly instead of letting matplotlib shrink it
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1, 1.5)  # Stretch rows vertically for readability
        ax.set_title(title, fontsize=14, pad=10)
        fig.tight_layout()
        fig.savefig(filename, bbox_inches='tight')
        plt.show()
    finally:
        # Release the figure even if drawing or saving failed, so repeated
        # calls do not accumulate open figures.
        plt.close(fig)

# Render the ten most frequent n-grams of each size as a table image
for ngram_frame, table_title, image_path in (
    (df_2grams, "Top 2-grams", "top_2grams.png"),
    (df_3grams, "Top 3-grams", "top_3grams.png"),
    (df_4grams, "Top 4-grams", "top_4grams.png"),
):
    make_table_image(ngram_frame.head(10), table_title, image_path)


In [23]:
# Persist the comment table, including the added n-gram columns, without the row index
df.to_csv(path_or_buf='n-gram_youtube_comments.csv', index=False)


In [None]:
# Token count for a single comment
def count_words(text):
    """Return the number of tokens ``word_tokenize`` produces for a comment.

    Every token returned by the tokenizer is counted; no stopword or
    alphabetic filtering is applied here.

    Parameters
    ----------
    text : str or missing value
        Raw comment text. Missing values (NaN / None) and blank strings count
        as 0. Non-string values are converted with ``str()`` first.

    Returns
    -------
    int
        Number of tokens in the comment.
    """
    if pd.isna(text):
        return 0  # Missing comment

    # A CSV column can contain non-string cells (e.g. numbers); coerce them so
    # the string methods below do not raise AttributeError.
    text = str(text)
    if not text.strip():
        return 0  # Blank comment

    return len(word_tokenize(text))

# Per-comment token counts, stored alongside the comments
df['word_count'] = df['comment'].map(count_words)

# Sum the per-comment counts to get the corpus-wide total
total_word_count = df.loc[:, 'word_count'].sum()
print(f'Total number of words across all comments: {total_word_count}')

Total number of words across all comments: 1249752
