In [None]:
##### PARAGRAM ANALYSIS #####
# Step-by-step guide for the Paragram analysis.
# We will break this into manageable steps:
#   1. Preprocess the comments
#   2. Train Word2Vec embeddings (or use pretrained Word2Vec / GloVe embeddings)
#   3. Calculate semantic similarity (compare sentences)
#   4. Cluster or group similar comments
#   5. Visualize and analyze the results

In [3]:
import pandas as pd
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
# NOTE(review): newer NLTK releases look up the 'punkt_tab' resource inside
# word_tokenize; if tokenizing raises LookupError, download that resource as
# well -- confirm against the installed NLTK version.

# Load your cleaned dataset (assuming the file is named 'cleaned_youtube_comments.csv')
df = pd.read_csv('cleaned_youtube_comments.csv')

# Tokenize the comments in the 'comment' column (assuming they are already cleaned).
# read_csv turns empty cells into NaN (a float) and word_tokenize only accepts
# text, so tokenize a text-coerced copy; the 'comment' column itself is left
# untouched. A missing comment becomes an empty token list.
comment_text = df['comment'].fillna('').astype(str)
df['tokenized_comments'] = comment_text.apply(nltk.word_tokenize)  # Tokenizing without converting to lowercase

# Train the Word2Vec model using the tokenized comments.
# A plain list of token lists is passed so gensim can iterate the corpus
# several times (vocabulary scan + training epochs) without pandas overhead.
# NOTE(review): with workers=4 training is multi-threaded, so the learned
# vectors can differ slightly between runs; gensim documents workers=1 plus a
# fixed PYTHONHASHSEED for fully reproducible training.
model = Word2Vec(sentences=df['tokenized_comments'].tolist(), vector_size=200, window=10, min_count=5, workers=4)

# Save the trained model for later use
model.save("word2vec_model")


[nltk_data] Downloading package punkt to /Users/psylviana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Make pandas print frames in full: every column, untruncated cell text,
# and a wide enough line that long comments are not wrapped early.
for option_name, option_value in (
    ('display.max_columns', None),   # show all columns
    ('display.max_colwidth', None),  # never truncate cell text
    ('display.width', 1000),         # total character width of the display
):
    pd.set_option(option_name, option_value)

In [4]:
import numpy as np
# Load the previously trained Word2Vec model from the "word2vec_model" file
# written by the training cell. This rebinds `model`, replacing whatever
# instance an earlier cell left in memory.
model = Word2Vec.load("word2vec_model")

# Function to get the vector representation of a comment
def get_comment_vector(comment, model):
    """Average the embeddings of a comment's in-vocabulary tokens.

    Parameters
    ----------
    comment : iterable of str, or None / NaN
        The tokens of one comment. A missing comment (``None`` or a float
        such as NaN) is treated as having no tokens.
    model : object
        Either a trained model that exposes its vectors as ``model.wv`` or
        the keyed-vector lookup itself. The lookup must support
        ``token in lookup`` and ``lookup[token]``; ``model.vector_size``
        gives the embedding length.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors of all tokens found in the
        vocabulary, or a zero vector of length ``model.vector_size`` when
        no token is known (or the comment is missing/empty).
    """
    # Accept the full model or its bare keyed vectors interchangeably.
    keyed_vectors = getattr(model, "wv", model)

    # A missing value is not iterable; treat it like an empty comment so it
    # falls through to the same zero-vector fallback as unknown-only comments.
    if comment is None or isinstance(comment, float):
        tokens = ()
    else:
        tokens = comment

    # Get the word vectors for each word in the comment; out-of-vocabulary
    # words are skipped.
    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if vectors:
        return np.mean(vectors, axis=0)  # Average the word vectors
    return np.zeros(model.vector_size)  # Return a zero vector if no word is in the model

# Embed every comment: each token list is mapped to the vector returned by
# get_comment_vector, with the loaded model passed as its second argument.
df['comment_vector'] = df['tokenized_comments'].apply(get_comment_vector, args=(model,))


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Work on a reproducible 100-comment subset so the pairwise matrix stays small
sample_df = df.sample(n=100, random_state=42)  # Adjust n as needed

# One embedding per sampled comment -> square matrix of pairwise cosine similarities
sample_vectors = sample_df['comment_vector'].tolist()
similarity_matrix = cosine_similarity(sample_vectors)

# Example: row 0 holds the similarity of the first sampled comment to every
# comment in the sample (its own entry included)
first_comment_similarities = similarity_matrix[0]
print(first_comment_similarities)


In [8]:
from sklearn.cluster import KMeans
import pandas as pd

# Draw a reproducible 100-comment subset so clustering runs quickly
sample_df = df.sample(n=100, random_state=42)  # Adjust n as needed

# How many groups to look for (tune this for your dataset)
num_clusters = 5  # For example, adjust based on your data
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Fit on the sampled comment vectors and record each comment's cluster id
sample_vectors = sample_df['comment_vector'].tolist()
cluster_labels = kmeans.fit_predict(sample_vectors)
sample_df['cluster'] = cluster_labels

# Preview the cluster assignments for the first few sampled comments
print(sample_df[['comment', 'cluster']].head())



                                                                                                                comment  cluster
32066  i waited for the day this video would pop up in my sub feed and yet it took me by surprise im so happy right now        3
10203                                                                                                               036        1
6395                                                                after 10 years this will become old so enjoy it now        3
5665                                                                                                 137 hits hard geez        1
27419                                                                                                    jumpscare 2822        1


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Step 1: Apply KMeans clustering to the comment vectors.
# Requires the 'comment_vector' column (one embedding per comment) to exist.

# Choose the number of clusters (adjust based on your dataset)
num_clusters = 8  # Adjust this based on your data
# NOTE(review): n_init is left at scikit-learn's default, which differs
# between releases; set it explicitly if labels must match across
# environments -- confirm against the installed version.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Apply KMeans clustering to the comment vectors of the whole dataset
df['cluster'] = kmeans.fit_predict(list(df['comment_vector']))

# Step 2: Reduce dimensionality using PCA for visualization.
# Plot a reproducible subset to keep the figure readable and memory low;
# the size is capped at the frame length so small datasets do not raise.
sample_df = df.sample(n=min(1000, len(df)), random_state=42)  # Adjust n as needed

# PCA for dimensionality reduction (reduce to 2D for visualization)
pca = PCA(n_components=2)
pca_results = pca.fit_transform(list(sample_df['comment_vector']))
pc1_share, pc2_share = pca.explained_variance_ratio_

# Step 3: Create a scatter plot of the PCA results, colored by cluster.
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(
    pca_results[:, 0],
    pca_results[:, 1],
    c=sample_df['cluster'],
    cmap='viridis',
    # Fix the colour range to all cluster ids so a cluster keeps its colour
    # even when the sample happens to miss some clusters.
    vmin=0,
    vmax=num_clusters - 1,
)
# Cluster ids are categorical: show one labelled tick per cluster.
fig.colorbar(points, ax=ax, ticks=list(range(num_clusters)), label="Cluster")
ax.set_xlabel(f"PC1 ({pc1_share:.1%} of variance)")
ax.set_ylabel(f"PC2 ({pc2_share:.1%} of variance)")
ax.set_title("PCA Visualization of Comment Clusters")
plt.show()


In [16]:
# Prerequisite: df['comment_vector'] must already hold one embedding per
# comment (e.g. built from Word2Vec) before this cell is run.

# Number of clusters to fit (tune for your data)
num_clusters = 8  # You can adjust this value

# Fit KMeans on every comment embedding and store the label on the DataFrame
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
all_vectors = df['comment_vector'].tolist()
df['cluster'] = kmeans.fit_predict(all_vectors)

# Keep just the comment text alongside its cluster label
cluster_table = df.loc[:, ['comment', 'cluster']]

# Preview the first 20 rows of the cluster table
print(cluster_table.iloc[:20])

                                                                                                                                                                             comment  cluster
0                                 if you enjoy gamespub videos a comment like or sub would be highly appreciated it means a lot for us you can also follow us on twitter for updates        5
1                                                                                                                                                                obrigado pela ajuda        2
2   i am german and i hated franz kafkas books at school in my childhood when you are 12 years old you really dont get whats going on but seems german education system wants you to        5
3                                                                                           thank you for your help on the ammo puzzle honestly didnt know where to go with that one        5
4                                                 