In [None]:
# 1. Import required libraries
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk import download
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from textblob import TextBlob
# Download the tokenizer models used by word_tokenize(). Older NLTK releases
# load the 'punkt' package, while newer releases (3.9+) load 'punkt_tab'
# instead, so both are fetched to keep the notebook runnable on either.
# nltk.download() reports and skips packages that are already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

In [2]:
# 2. Read the cleaned YouTube comments into a DataFrame.
# The cells below expect the text to live in a column named 'comment'.
df = pd.read_csv(filepath_or_buffer='cleaned_youtube_comments.csv')


In [3]:
# 3. Function to generate n-grams
def generate_ngrams(text, n):
    """Tokenize a comment and return its word n-grams.

    Args:
        text: The comment. Missing values (None / NaN / pd.NA) and blank
            strings yield no n-grams. Any other non-string value (for
            example a numeric cell) is converted with str() first.
        n: Number of consecutive tokens in each n-gram (2 = bigram, ...).

    Returns:
        A list of n-length token tuples in order of appearance. The list is
        empty when the comment is missing or blank.
    """
    if pd.isna(text):
        return []  # Missing comment
    # read_csv may hand back non-string cells (e.g. numbers); calling
    # .strip() on those would raise AttributeError, so normalise to str.
    if not isinstance(text, str):
        text = str(text)
    if not text.strip():
        return []  # Blank comment
    words = word_tokenize(text)
    return list(ngrams(words, n))


In [5]:
# Add one column per n-gram size; each cell holds the list of n-gram tuples
# produced by generate_ngrams for that row's comment. The size is forwarded
# to generate_ngrams through apply()'s positional `args`.
df['2-grams'] = df['comment'].apply(generate_ngrams, args=(2,))
df['3-grams'] = df['comment'].apply(generate_ngrams, args=(3,))
df['4-grams'] = df['comment'].apply(generate_ngrams, args=(4,))


In [6]:
# Flatten every per-comment list so each n-gram sits on its own row.
# explode() leaves NaN for rows whose list was empty; dropna() removes them.
all_2grams = (
    df['2-grams']
    .explode()
    .dropna()
)
all_3grams = (
    df['3-grams']
    .explode()
    .dropna()
)
all_4grams = (
    df['4-grams']
    .explode()
    .dropna()
)

In [7]:
# Convert n-grams from tuple to string format (so they can be counted)
def _ngram_to_text(gram):
    """Join an n-gram tuple into one space-separated string.

    Values that are already strings are returned unchanged. This cell
    reassigns all_2grams / all_3grams / all_4grams from themselves, so
    without the guard a second run would call ' '.join on a string and put
    a space between every character.
    """
    return gram if isinstance(gram, str) else ' '.join(gram)


all_2grams = all_2grams.apply(_ngram_to_text)
all_3grams = all_3grams.apply(_ngram_to_text)
all_4grams = all_4grams.apply(_ngram_to_text)

In [None]:
# Sanity check: show the first five n-gram strings of each size,
# each preceded by its heading on a separate line.
print("First 5 2-grams:", all_2grams.head(), sep="\n")
print("First 5 3-grams:", all_3grams.head(), sep="\n")
print("First 5 4-grams:", all_4grams.head(), sep="\n")

In [9]:
# Tally how often each n-gram string occurs across all comments.
# Counter is collections.Counter from the standard library; it has to be
# imported in the notebook's import cell or this cell fails with NameError
# on a fresh kernel.
counter_2grams = Counter(all_2grams)
counter_3grams = Counter(all_3grams)
counter_4grams = Counter(all_4grams)

In [10]:
# Tabulate the 100 most frequent n-grams of each size as
# (n-gram, Frequency) rows, most frequent first.
df_2grams = pd.DataFrame(
    data=counter_2grams.most_common(100),
    columns=['2-gram', 'Frequency'],
)
df_3grams = pd.DataFrame(
    data=counter_3grams.most_common(100),
    columns=['3-gram', 'Frequency'],
)
df_4grams = pd.DataFrame(
    data=counter_4grams.most_common(100),
    columns=['4-gram', 'Frequency'],
)

# Write each table to its own CSV file, without the row index.
df_2grams.to_csv(path_or_buf='top_2grams.csv', index=False)
df_3grams.to_csv(path_or_buf='top_3grams.csv', index=False)
df_4grams.to_csv(path_or_buf='top_4grams.csv', index=False)

In [None]:
# Sanity check: print the ten most frequent n-grams of each size as a list
# of (n-gram, count) pairs, each list under its own heading.
print("Most common 2-grams:", counter_2grams.most_common(10), sep="\n")

print("Most common 3-grams:", counter_3grams.most_common(10), sep="\n")

print("Most common 4-grams:", counter_4grams.most_common(10), sep="\n")

In [None]:
# Persist the comments together with their per-row n-gram columns.
# The n-gram cells are Python lists of tuples, so CSV stores them as their
# text representation rather than as structured data.
df.to_csv(path_or_buf='n-gram_youtube_comments.csv', index=False)


In [23]:
# Function to count words in a comment
def count_words(text):
    """Return the number of tokens word_tokenize finds in a comment.

    Args:
        text: The comment. Missing values (None / NaN / pd.NA) and blank
            strings count as 0. Any other non-string value (for example a
            numeric cell) is converted with str() before tokenizing.

    Returns:
        The token count as an int. The count is whatever word_tokenize
        produces, so it is a token count rather than a whitespace-split
        word count.
    """
    if pd.isna(text):
        return 0  # Missing comment
    # read_csv may hand back non-string cells (e.g. numbers); calling
    # .strip() on those would raise AttributeError, so normalise to str.
    if not isinstance(text, str):
        text = str(text)
    if not text.strip():
        return 0  # Blank comment
    return len(word_tokenize(text))

# Store each comment's token count in a new 'word_count' column.
df['word_count'] = df['comment'].map(count_words)

# Add up the per-comment counts and report the total for the whole dataset.
total_word_count = df['word_count'].sum()
print(f'Total number of words across all comments: {total_word_count}')

Total number of words across all comments: 1249752
