<a href="https://colab.research.google.com/github/kamantina/projectreport/blob/main/feature_analysis/comment_features_pmi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Load the CSV file with comment data
df = pd.read_csv('/content/drive/MyDrive/CISC7298/comments_202050308_utf8_isEnglish2_2050.csv', encoding='utf-8')

# Ensure every column this notebook reads is present:
#   - 'videoId'      groups comments per video,
#   - 'textOriginal' is the text that gets tokenized,
#   - 'commentId'    identifies a comment when counting term occurrences.
# Checking all of them here fails fast instead of raising a KeyError much later.
required_columns = ['videoId', 'textOriginal', 'commentId']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"The required column(s) {missing_columns} are missing in your CSV.")

# Handle missing values: empty string instead of NaN, and force the str dtype
df['textOriginal'] = df['textOriginal'].fillna('').astype(str)

print("Data loaded successfully.")

Data loaded successfully.


In [None]:
# Display (number of rows, number of columns) of the loaded comment table.
df.shape

(461534, 10)

# Preprocessing

In [None]:
# %pip installs into the environment of the running kernel; the version is
# pinned so that a fresh runtime gets the same release every time.
%pip install -q contractions==0.1.73

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import defaultdict
import contractions


# Download the NLTK stopword corpus (reported as already up-to-date when it is
# present locally); preprocess_text reads it via stopwords.words('english').
nltk.download('stopwords')

def safe_expand_contractions(text):
    """
    Expand contractions in `text` with contractions.fix, never raising.

    When contractions.fix fails, the problem is printed (IndexError is
    reported under its own label, everything else generically) and the
    input text is handed back unchanged.
    """
    try:
        expanded = contractions.fix(text)
    except Exception as err:
        if isinstance(err, IndexError):
            print(f"IndexError in contractions.fix for text: {text} - {err}")
        else:
            print(f"Error in contractions.fix for text: {text} - {err}")
        # Fallback: the caller keeps working with the unmodified text
        return text
    return expanded

# Matches any single ASCII punctuation character, i.e. one of
# !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ . Compiled once rather than per comment.
_ASCII_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")

# English stopword set; filled on first use by _english_stopwords().
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """
    Turn one comment into a list of lowercase tokens.

    Steps: expand contractions, lowercase, delete ASCII punctuation, split on
    whitespace, drop English stopwords. Emojis and other non-ASCII characters
    are left untouched because only ASCII punctuation is removed.

    Parameters
    ----------
    text : object
        The raw comment. Anything that is not a non-blank string (None, NaN,
        numbers, lists, whitespace-only strings, ...) is treated as empty.

    Returns
    -------
    list of str
        The remaining tokens in their original order; [] for empty or
        invalid input.
    """
    # The isinstance test alone rejects None/NaN; calling pd.isna first would
    # raise on list-like input because the resulting array has no single
    # truth value.
    if not isinstance(text, str) or text.strip() == "":
        return []

    # Use the safe contraction expansion function, then lowercase
    text = str(safe_expand_contractions(text)).lower()

    # Punctuation is deleted, not replaced by a space, so "don't" -> "dont"
    # and "a,b" -> "ab".
    text = _ASCII_PUNCTUATION_RE.sub("", text)

    # Simple whitespace tokenization followed by stopword removal. The stopword
    # set is cached because this function runs once per comment.
    stops = _english_stopwords()
    return [token for token in text.split() if token not in stops]

# Tokenize every original comment and keep the token lists in a new 'tokens' column
comment_texts = df['textOriginal']
df['tokens'] = comment_texts.apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


IndexError in contractions.fix for text: VERY SATİSFYİNG AND COMFORTİNG KİNETİC SAND ASMR # 5 : https://youtu.be/xOvJf_RsUYw - string index out of range


# PMI

Extract Candidate Terms

In [None]:
# Maps each candidate term (a unigram, bigram, or trigram, stored as a
# space-joined string) to the set of IDs of the comments that contain it.
# Because the values are sets, a comment is counted at most once per candidate.
candidate_dict = defaultdict(set)

def add_candidates(comment_id, tokens):
    """
    Register one comment under every unigram, bigram, and trigram it contains.

    Each distinct n-gram of `tokens` (n = 1, 2, 3; multi-word n-grams are
    joined with a single space) gets `comment_id` added to its entry in the
    module-level candidate_dict. Repeats inside the comment are collapsed
    first, so each candidate is touched once per comment.
    """
    # Smallest n-gram order first: unigrams, then ordered pairs, then triples
    for order in (1, 2, 3):
        if order == 1:
            phrases = set(tokens)
        else:
            phrases = {" ".join(gram) for gram in ngrams(tokens, order)}
        for phrase in phrases:
            candidate_dict[phrase].add(comment_id)

# Register every comment's n-grams under its commentId (the unique identifier).
# Walking the two columns with zip yields plain values and avoids constructing
# a Series object for every row, which is what df.iterrows() does.
for comment_id, tokens in zip(df['commentId'], df['tokens']):
    add_candidates(comment_id, tokens)

# N in the PMI formula: every comment counts, including those with no tokens.
total_comments = len(df)
print(f"Total comments: {total_comments}")


Total comments: 461534


Filter Candidate Terms by Frequency Criteria

In [None]:
# Frequency window for candidate terms, measured in number of comments:
#   - lower bound: 200 comments, so rare terms are dropped
#   - upper bound: 10% of all comments, so overly common terms are dropped
min_comments = 200
max_comments = total_comments / 10

# term -> number of comments containing it, for terms inside the window
filtered_candidates = {}
for term, comment_ids in candidate_dict.items():
    n_comments = len(comment_ids)
    if n_comments < min_comments or n_comments > max_comments:
        continue
    filtered_candidates[term] = n_comments

print(f"Total candidate terms after frequency filtering: {len(filtered_candidates)}")


Total candidate terms after frequency filtering: 2858


Compute PMI for Each Candidate Relative to Keywords

In [None]:
# Define the keywords for which we want to generate associated word lists.
# (They must be lowercase because preprocessing lowercases every token.)
keywords = ['focus', 'relax', 'sleep']

# keyword -> {candidate term -> PMI score}
pmi_scores = {kw: {} for kw in keywords}

# PMI(k, w) = ln( count(k and w) * N / (count(k) * count(w)) )
# where count(k) and count(w) are the numbers of unique comments containing the
# keyword / the candidate, count(k and w) is the number of comments containing
# both, and N is the total number of comments.
for kw in keywords:
    if kw not in candidate_dict:
        print(f"Warning: keyword '{kw}' not found in the data.")
        continue

    # Looked up once per keyword instead of once per candidate term.
    kw_comment_ids = candidate_dict[kw]
    kw_count = len(kw_comment_ids)

    for term, term_count in filtered_candidates.items():
        # Both sides are sets of comment IDs, so the co-occurrence count is the
        # size of their intersection. .get() is used so that a term missing
        # from candidate_dict is not inserted into the defaultdict.
        co_occurrence = len(kw_comment_ids.intersection(candidate_dict.get(term, ())))

        # Terms that never co-occur with the keyword get no score at all
        # (their PMI would be -infinity).
        if co_occurrence > 0:
            pmi = np.log((co_occurrence * total_comments) / (kw_count * term_count))
            pmi_scores[kw][term] = pmi


Selecting the Top n% Candidate Terms by PMI for Each Keyword

In [None]:
# Share of the scored terms kept per keyword (0.04 = top 4%).
top_fraction = 0.04

# keyword -> list of (term, PMI) pairs, best first
top_candidate_terms = {}
for kw, scores in pmi_scores.items():
    if not scores:
        continue
    # Sort by PMI in descending order and break ties alphabetically by term.
    # Without the tie-break, equal scores keep dict insertion order, which
    # comes from iterating sets of strings and so can change between runs;
    # the cut-off below could then select different terms each time.
    sorted_terms = sorted(scores.items(), key=lambda item: (-item[1], item[0]))
    # Keep the top fraction of the terms, but always at least one
    top_n = max(1, int(top_fraction * len(sorted_terms)))
    top_candidate_terms[kw] = sorted_terms[:top_n]

# # Display the top candidate terms for each keyword along with their PMI scores
# for kw, terms in top_candidate_terms.items():
#     print(f"\nTop candidate terms for keyword '{kw}':")
#     for term, score in terms:
#         print(f"{term}: {score:.4f}")

In [None]:
import csv

# Write the selected terms to CSV, one row per (keyword, term, PMI score) triple;
# scores are written with four decimal places.
with open('top_candidate_terms.csv', 'w', newline='', encoding='utf-8') as out_file:
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(['Keyword', 'Term', 'PMI_Score'])  # Header row
    csv_writer.writerows(
        [kw, term, f"{score:.4f}"]
        for kw, terms in top_candidate_terms.items()
        for term, score in terms
    )

Calculate the feature percentage for each video

In [None]:
def _count_phrase(tokens, phrase_tokens):
    """Count non-overlapping, token-aligned occurrences of phrase_tokens in tokens."""
    width = len(phrase_tokens)
    last_start = len(tokens) - width
    hits = 0
    start = 0
    while start <= last_start:
        if tokens[start:start + width] == phrase_tokens:
            hits += 1
            start += width  # skip past the match so occurrences never overlap
        else:
            start += 1
    return hits


# Utility function to count candidate term occurrences in token lists.
# This function handles both unigrams and multi-word phrases.
def count_candidate_occurrences(tokens, candidate_terms):
    """
    Count how often the candidate terms occur in one tokenized comment.

    Parameters
    ----------
    tokens : sequence of str
        Tokens of a single comment, in order.
    candidate_terms : iterable of str
        Single words or space-separated multi-word phrases.

    Returns
    -------
    int
        Sum over all terms of their occurrence counts. A single word counts
        once per equal token. A phrase counts once per non-overlapping run of
        consecutive tokens equal to the phrase's words. Empty terms count 0.

    Notes
    -----
    Phrases are matched on whole tokens. Matching on the space-joined string
    instead would also hit inside longer tokens (e.g. "good sleep" inside
    "feelgood sleeping"), which is not an occurrence of the phrase.
    A phrase and the words it contains are counted independently, so the
    total may exceed len(tokens).
    """
    token_list = list(tokens)
    # Joined form is only used as a fast pre-check for phrases.
    joined_tokens = " ".join(token_list)

    count = 0
    for term in candidate_terms:
        term_tokens = term.split()
        if not term_tokens:
            continue
        if len(term_tokens) == 1:
            # For single words: count equal tokens.
            count += token_list.count(term_tokens[0])
        elif " ".join(term_tokens) in joined_tokens:
            # The substring test is a necessary condition for a token-aligned
            # match, so the slower token scan runs only when it can succeed.
            count += _count_phrase(token_list, term_tokens)
    return count

# Computes the "feature percentage" of candidate terms over a list of comments.
# Comments arrive already tokenized, so the token lists are used as they are.
def compute_feature_percentage(video_tokens_list, candidate_terms):
    """
    Return candidate-term occurrences divided by the total number of tokens.

    `video_tokens_list` holds one token list per comment. The result is a
    fraction (multiply by 100 for a percent); 0 is returned when the comments
    contain no tokens at all.
    """
    token_total = 0
    candidate_hits = 0

    for comment_tokens in video_tokens_list:
        token_total += len(comment_tokens)
        candidate_hits += count_candidate_occurrences(comment_tokens, candidate_terms)

    # Guard against division by zero for videos without any tokens
    if not token_total > 0:
        return 0
    return candidate_hits / token_total

# keyword -> list of its candidate terms (PMI scores dropped).
topic_terms = {
    keyword: [term for term, _score in term_list]
    for keyword, term_list in top_candidate_terms.items()
}

# Series indexed by videoId; each value is a list of token lists (one per comment).
video_groups = df.groupby('videoId')['tokens'].apply(list)

# keyword -> {videoId -> feature percentage of that keyword's terms in the video}
video_feature_percentages = {
    keyword: {
        videoId: compute_feature_percentage(tokens_list, candidate_list)
        for videoId, tokens_list in video_groups.items()
    }
    for keyword, candidate_list in topic_terms.items()
}



In [None]:
# Turn the nested dictionary into a DataFrame: outer keys (keywords) become the
# columns and inner keys (video IDs) become the index. The index is then named
# 'videoId' and moved into an ordinary column.
features_df = (
    pd.DataFrame(video_feature_percentages)
    .rename_axis("videoId")
    .reset_index()
)

features_df.head()

In [None]:
# Read the video data CSV file into df_video
video_csv_path = '/content/drive/MyDrive/CISC7298/videos_202500308_utf8_filtered_isEnglish2_2050_LIWC.csv'
df_video = pd.read_csv(video_csv_path, encoding='utf-8')

In [None]:
# Left-join the feature columns of features_df onto df_video by 'videoId';
# every row of df_video is kept.
df_video = pd.merge(df_video, features_df, on="videoId", how="left")

df_video.head()

In [None]:
# Write the video table, now carrying the focus / relax / sleep features, to CSV
output_csv_path = 'videos_202500308_utf8_filtered_isEnglish2_2050_LIWC_pmi200-10-4.csv'
df_video.to_csv(output_csv_path, index=False)