In [1]:
# Standard Library Imports
import pandas as pd
import re
import time
import datetime
from collections import defaultdict, Counter

# Third-party Library Imports
import requests
from bs4 import BeautifulSoup
import praw
import nltk
import numpy as np
import matplotlib.pyplot as plt
import concurrent.futures
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import bigrams
from sklearn.feature_extraction.text import CountVectorizer
import spacy

In [2]:
# Never truncate cell contents when a DataFrame is rendered, so long "Post Text" values show in full
pd.options.display.max_colwidth = None

In [3]:
# Location of the CSV file holding the collected Reddit posts
csv_file_path = 'reddits.csv'

# Load the whole file; df_reddit_post is the raw, unfiltered DataFrame
df_reddit_post = pd.read_csv(filepath_or_buffer=csv_file_path)

# Number of rows that were loaded
df_reddit_post.shape[0]

2489

In [4]:
# Define a function to check if a post is an image-only post
def is_image_only_post(post):
    """Return True when a post has no text body and its URL points at an image file.

    Parameters
    ----------
    post : mapping or pandas.Series
        One post; must provide the keys "Post Text" and "Post URL".

    Returns
    -------
    bool
        True only if the text is missing/blank AND the URL path ends with a
        known image extension.

    Raises
    ------
    KeyError
        If "Post Text" or "Post URL" is absent from ``post``.
    """
    text = post["Post Text"]
    url = post["Post URL"]

    # A missing value may arrive as NaN (a float), which is truthy, so a plain
    # `not text` would treat it as "has text". Only a non-blank string counts
    # as real text.
    has_text = isinstance(text, str) and bool(text.strip())
    if has_text:
        return False

    # A missing URL (NaN/None) cannot be an image link and has no .endswith().
    if not isinstance(url, str):
        return False

    # Ignore any query string / fragment and compare case-insensitively so
    # "a.JPG" and "a.jpg?width=640" are both recognised.
    url_path = url.split('?', 1)[0].split('#', 1)[0].lower()
    return url_path.endswith(('.jpg', '.jpeg', '.png', '.gif', '.gifv', '.webp'))

# Fail early, with a readable message, when the loaded data lacks the columns
# that is_image_only_post reads; otherwise the problem only surfaces as a bare
# KeyError raised from inside DataFrame.apply.
required_columns = ["Post Text", "Post URL"]
missing_columns = [name for name in required_columns if name not in df_reddit_post.columns]
if missing_columns:
    raise KeyError(
        f"df_reddit_post is missing required column(s) {missing_columns}; "
        f"available columns are {list(df_reddit_post.columns)}"
    )

# Apply the function row by row to create a Boolean mask (True = image-only post)
image_only_mask = df_reddit_post.apply(is_image_only_post, axis=1)

# Filter out image-only posts (the original row index is kept, so it may have gaps)
filtered_df_reddit_post = df_reddit_post[~image_only_mask]
print(f"Total number of posts after filter collected: {len(filtered_df_reddit_post)}")

KeyError: 'Post Text'

In [None]:
# Create a new DataFrame with the selected columns.
# .copy() makes it independent of filtered_df_reddit_post: later cells assign
# columns on this frame, and doing that on a slice of another frame can trigger
# pandas' SettingWithCopyWarning.
selected_columns = ["Title", "Post Text", "Subreddit"]
filtered_columns_df_reddit_post = filtered_df_reddit_post[selected_columns].copy()
# Save the data to a CSV file (row index is not written)
filtered_columns_df_reddit_post.to_csv("reddit_posts (no image-only post).csv", index=False)
# Preview the first rows
filtered_columns_df_reddit_post.head()

In [None]:
# Lowercase every object-dtype (text) column of the frame in place
object_column_names = [
    name
    for name in filtered_columns_df_reddit_post.columns
    if filtered_columns_df_reddit_post[name].dtype == 'object'
]
for name in object_column_names:
    lowered = filtered_columns_df_reddit_post[name].str.lower()
    filtered_columns_df_reddit_post[name] = lowered

# Show the lowercased DataFrame
filtered_columns_df_reddit_post

In [None]:
# Fetch the NLTK data packages this notebook relies on.
# "stopwords" is read later through stopwords.words("english").
nltk.download("stopwords")
# Presumably the data behind WordNetLemmatizer — TODO confirm.
nltk.download('wordnet')
# NOTE(review): 'omw-1.4' and 'punkt' are not referenced directly in the visible
# cells; confirm they are still required before keeping these downloads.
nltk.download('omw-1.4')
nltk.download('punkt')

In [None]:
# English stopwords, held in a set for fast membership checks
stop_words = set(stopwords.words("english"))

# Function to filter out stopwords
def remove_stopwords(text):
    """Return ``text`` with English stopwords removed.

    The string is split on whitespace, each word is compared (lowercased)
    against ``stop_words``, and the survivors are re-joined with single
    spaces. Non-string values such as NaN are returned unchanged.
    """
    if not isinstance(text, str):
        return text  # NaN / non-text values pass straight through
    kept_words = (token for token in text.split() if token.lower() not in stop_words)
    return " ".join(kept_words)

# Add a stopword-free counterpart of each raw text column (the frame is extended in place)
for raw_name, cleaned_name in (
    ("Title", "Stopword Dropped Title"),
    ("Post Text", "Stopword Dropped Post Text"),
):
    filtered_columns_df_reddit_post[cleaned_name] = filtered_columns_df_reddit_post[raw_name].apply(remove_stopwords)

# Build an independent frame that no longer carries the raw text columns
without_raw_text = filtered_columns_df_reddit_post.drop(columns=["Title", "Post Text"])
dropped_filtered_columns_dataframe = without_raw_text.copy()

# Show the frame without the raw text columns
dropped_filtered_columns_dataframe

In [None]:
# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

# Create a function to process and modify text
def process_text(text):
    """Expand/normalise two special-case words, then re-join spaCy tokens with spaces.

    * "i'm"    -> "i am"
    * "emma's" -> "emma"

    The substitutions are done on the raw string BEFORE tokenization. The
    previous per-token comparison against "i'm" / "emma's" looks like it could
    never match, because spaCy's English tokenizer is documented to split
    contractions and possessives into separate tokens (e.g. "i" + "'m") —
    see the spaCy tokenization docs.

    Parameters
    ----------
    text : str or any
        Text to process. Non-string values (e.g. NaN) are returned unchanged.

    Returns
    -------
    str or any
        The token texts joined by single spaces, or ``text`` itself when it is
        not a string.
    """
    if not isinstance(text, str):
        return text

    # Match both the straight and the typographic apostrophe, case-insensitively.
    expanded = re.sub(r"\bi['’]m\b", "i am", text, flags=re.IGNORECASE)
    expanded = re.sub(r"\bemma['’]s\b", "emma", expanded, flags=re.IGNORECASE)

    # Only token texts are needed, so run the tokenizer alone (make_doc)
    # instead of the full pipeline.
    return " ".join(token.text for token in nlp.make_doc(expanded))

# Run process_text over every object-dtype (text) column of the frame, in place
text_like_columns = [
    name
    for name in dropped_filtered_columns_dataframe.columns
    if dropped_filtered_columns_dataframe[name].dtype == 'object'
]
for name in text_like_columns:
    processed = dropped_filtered_columns_dataframe[name].apply(process_text)
    dropped_filtered_columns_dataframe[name] = processed

# Show the processed DataFrame
dropped_filtered_columns_dataframe

In [None]:
# Columns whose punctuation is stripped
columns_to_clean = ['Stopword Dropped Post Text', 'Stopword Dropped Title']

# Anything that is neither a word character nor whitespace is deleted
punctuation_pattern = r'[^\w\s]'
for name in columns_to_clean:
    without_punctuation = dropped_filtered_columns_dataframe[name].str.replace(punctuation_pattern, '', regex=True)
    dropped_filtered_columns_dataframe[name] = without_punctuation

# Show the cleaned DataFrame
dropped_filtered_columns_dataframe

In [None]:
# This cell prepares the helpers used to split each title and post into unigrams and bigrams

# One shared Porter stemmer instance
stemmer = PorterStemmer()

# Function to stem text
def stem_text(text):
    """Stem every whitespace-separated word of ``text`` with the Porter stemmer.

    Returns the stemmed words joined by single spaces. Values that are not
    strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text  # leave non-text values untouched
    return " ".join(map(stemmer.stem, text.split()))

# One shared WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

# Function to lemmatize text
def lemmatize_text(text):
    """Lemmatize every whitespace-separated word of ``text``.

    Returns the lemmatized words joined by single spaces. Values that are not
    strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text  # leave non-text values untouched
    return " ".join(map(lemmatizer.lemmatize, text.split()))

# Function to split text into unigrams and bigrams
def split_text(text):
    """Split ``text`` on whitespace into unigrams and bigrams.

    Returns a 2-tuple ``(unigrams, bigrams)``: ``unigrams`` is the list of
    words and ``bigrams`` is the list of ``(word, next_word)`` tuples for each
    pair of consecutive words. Non-string input yields two empty lists.
    """
    if not isinstance(text, str):
        return [], []

    tokens = text.split()
    # Pair each word with its successor; fewer than two words gives no pairs
    consecutive_pairs = [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]
    return tokens, consecutive_pairs

# Derived text columns: (new column, source column, transformation), added in this order
derived_text_columns = [
    ("Stemmed Title", "Stopword Dropped Title", stem_text),
    ("Stemmed Post Text", "Stopword Dropped Post Text", stem_text),
    ("Lemmatized Title", "Stopword Dropped Title", lemmatize_text),
    ("Lemmatized Post Text", "Stopword Dropped Post Text", lemmatize_text),
]
for target_name, source_name, transform in derived_text_columns:
    dropped_filtered_columns_dataframe[target_name] = dropped_filtered_columns_dataframe[source_name].apply(transform)

# Unigram/bigram columns: split_text returns a pair per row, expanded into two columns
ngram_columns = [
    (["Unigrams Title", "Bigrams Title"], "Stopword Dropped Title"),
    (["Unigrams Post Text", "Bigrams Post Text"], "Stopword Dropped Post Text"),
]
for target_names, source_name in ngram_columns:
    split_pairs = dropped_filtered_columns_dataframe[source_name].apply(split_text)
    dropped_filtered_columns_dataframe[target_names] = split_pairs.apply(pd.Series)

# The stopword-dropped source columns are no longer needed
dropped_filtered_columns_dataframe = dropped_filtered_columns_dataframe.drop(
    columns=["Stopword Dropped Title", "Stopword Dropped Post Text"]
)

# Lowercase the stemmed and lemmatized text columns
text_columns = ["Stemmed Title", "Stemmed Post Text", "Lemmatized Title", "Lemmatized Post Text"]
for name in text_columns:
    dropped_filtered_columns_dataframe[name] = dropped_filtered_columns_dataframe[name].str.lower()

# Show the updated DataFrame
dropped_filtered_columns_dataframe

In [None]:
# Combine the token lists of "Unigrams Title" and "Unigrams Post Text" into one
# list per post, then join each list into a single space-separated string
dropped_filtered_columns_dataframe['Combined Unigrams'] = (
    dropped_filtered_columns_dataframe['Unigrams Title']
    + dropped_filtered_columns_dataframe['Unigrams Post Text']
)
dropped_filtered_columns_dataframe['Combined Unigrams'] = dropped_filtered_columns_dataframe['Combined Unigrams'].apply(' '.join)

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the combined unigrams (one row per post, one column per unigram)
unigram_counts = vectorizer.fit_transform(dropped_filtered_columns_dataframe['Combined Unigrams'])

# Get the feature names (unigrams)
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame to store the counts.
# NOTE(review): .toarray() materialises a dense posts x vocabulary matrix —
# confirm this fits in memory for larger datasets.
unigram_counts_df = pd.DataFrame(unigram_counts.toarray(), columns=feature_names)

# Add the subreddit label of each post. The counts frame has a fresh 0..n-1
# index while the source frame keeps the index left over from earlier row
# filtering, so assigning the Series directly would align on index labels and
# attach labels to the wrong rows (or NaN). Rows are in the same order, so
# assign positionally instead.
unigram_counts_df['Subreddit'] = dropped_filtered_columns_dataframe['Subreddit'].to_numpy()

# Group the DataFrame by "Subreddit" and sum the counts
grouped_unigram_counts = unigram_counts_df.groupby('Subreddit').sum()

# Unique subreddits in order of first appearance; missing labels are skipped
# because groupby drops them and .loc would raise KeyError
unique_subreddits = dropped_filtered_columns_dataframe['Subreddit'].dropna().unique()

# Number of most common unigrams to keep per subreddit
top_n = 10  # Replace with the desired number

# Dictionary to store the top unigram counts for each subreddit
subreddit_unigram_counts = {}

# Loop through each unique subreddit
for subreddit_name in unique_subreddits:
    # Total count of every unigram within this subreddit
    specific_unigram_count = grouped_unigram_counts.loc[subreddit_name]

    # Keep only the top N most common unigrams
    top_n_unigrams = specific_unigram_count.nlargest(top_n)
    subreddit_unigram_counts[subreddit_name] = top_n_unigrams

# Print each subreddit with its top N unigrams
for subreddit_name, top_n_unigrams in subreddit_unigram_counts.items():
    print(f"Subreddit: {subreddit_name}")
    print(top_n_unigrams)
    print()

In [None]:
# One column per subreddit, one row per unigram that made any subreddit's top list
df_unigrams = pd.DataFrame(subreddit_unigram_counts)

# Grouped bar chart, labelled through the Axes object returned by pandas
ax = df_unigrams.plot(kind='bar', figsize=(10, 6))
ax.set_title('Top Unigrams in Subreddits')
ax.set_xlabel('Unigrams')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Tighten the layout and render the figure
plt.tight_layout()
plt.show()

In [None]:
# Initialize the CountVectorizer for bigrams and count them per post.
# NOTE(review): 'Combined Unigrams' is the title and body joined into one
# string, so a bigram can span the title/body boundary — confirm this is acceptable.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
bigram_counts = bigram_vectorizer.fit_transform(dropped_filtered_columns_dataframe['Combined Unigrams'])
bigram_feature_names = bigram_vectorizer.get_feature_names_out()

# Create a DataFrame to store the bigram counts (one row per post).
# NOTE(review): .toarray() materialises a dense matrix — confirm it fits in memory.
bigram_counts_df = pd.DataFrame(bigram_counts.toarray(), columns=bigram_feature_names)

# Add the subreddit label of each post. The counts frame has a fresh 0..n-1
# index while the source frame keeps the index left over from earlier row
# filtering, so a direct Series assignment would align on index labels and
# mislabel rows. Rows are in the same order, so assign positionally.
bigram_counts_df['Subreddit'] = dropped_filtered_columns_dataframe['Subreddit'].to_numpy()

# Group the DataFrame by 'Subreddit' and sum the counts
grouped_bigram_counts = bigram_counts_df.groupby('Subreddit').sum()

# Unique subreddits in order of first appearance; missing labels are skipped
# because groupby drops them and .loc would raise KeyError
unique_subreddits = dropped_filtered_columns_dataframe['Subreddit'].dropna().unique()

# Number of most common bigrams to keep per subreddit
top_n = 10  # Replace with the desired number

# Dictionary to store the top bigram counts for each subreddit
subreddit_bigram_counts = {}

# Loop through each unique subreddit
for subreddit_name in unique_subreddits:
    # Total count of every bigram within this subreddit
    specific_bigram_count = grouped_bigram_counts.loc[subreddit_name]

    # Keep only the top N most common bigrams
    top_n_bigrams = specific_bigram_count.nlargest(top_n)
    subreddit_bigram_counts[subreddit_name] = top_n_bigrams

# Print each subreddit with its top N bigrams
for subreddit_name, top_n_bigrams in subreddit_bigram_counts.items():
    print(f"Subreddit: {subreddit_name}")
    print(top_n_bigrams)
    print()

In [None]:
# One column per subreddit, one row per bigram that made any subreddit's top list
df_bigram = pd.DataFrame(subreddit_bigram_counts)

# Grouped bar chart, labelled through the Axes object returned by pandas
ax = df_bigram.plot(kind='bar', figsize=(10, 6))
ax.set_title('Top Bigrams in Subreddits')
ax.set_xlabel('Bigrams')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Tighten the layout and render the figure
plt.tight_layout()
plt.show()

In [None]:
# Initialize the CountVectorizer for trigrams and count them per post.
# NOTE(review): 'Combined Unigrams' is the title and body joined into one
# string, so a trigram can span the title/body boundary — confirm this is acceptable.
trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))
trigram_counts = trigram_vectorizer.fit_transform(dropped_filtered_columns_dataframe['Combined Unigrams'])
trigram_feature_names = trigram_vectorizer.get_feature_names_out()

# Create a DataFrame to store the trigram counts (one row per post).
# NOTE(review): .toarray() materialises a dense matrix — confirm it fits in memory.
trigram_counts_df = pd.DataFrame(trigram_counts.toarray(), columns=trigram_feature_names)

# Add the subreddit label of each post. The counts frame has a fresh 0..n-1
# index while the source frame keeps the index left over from earlier row
# filtering, so a direct Series assignment would align on index labels and
# mislabel rows. Rows are in the same order, so assign positionally.
trigram_counts_df['Subreddit'] = dropped_filtered_columns_dataframe['Subreddit'].to_numpy()

# Group the DataFrame by 'Subreddit' and sum the counts
grouped_trigram_counts = trigram_counts_df.groupby('Subreddit').sum()

# Unique subreddits in order of first appearance; missing labels are skipped
# because groupby drops them and .loc would raise KeyError
unique_subreddits = dropped_filtered_columns_dataframe['Subreddit'].dropna().unique()

# Number of most common trigrams to keep per subreddit
top_n = 10  # Replace with the desired number

# Dictionary to store the top trigram counts for each subreddit
subreddit_trigram_counts = {}

# Loop through each unique subreddit
for subreddit_name in unique_subreddits:
    # Total count of every trigram within this subreddit
    specific_trigram_count = grouped_trigram_counts.loc[subreddit_name]

    # Keep only the top N most common trigrams
    top_n_trigrams = specific_trigram_count.nlargest(top_n)
    subreddit_trigram_counts[subreddit_name] = top_n_trigrams

# Print each subreddit with its top N trigrams
for subreddit_name, top_n_trigrams in subreddit_trigram_counts.items():
    print(f"Subreddit: {subreddit_name}")
    print(top_n_trigrams)
    print()

In [None]:
# One column per subreddit, one row per trigram that made any subreddit's top list
df_trigram = pd.DataFrame(subreddit_trigram_counts)

# Grouped bar chart, labelled through the Axes object returned by pandas
ax = df_trigram.plot(kind='bar', figsize=(10, 6))
ax.set_title('Top Trigrams in Subreddits')
ax.set_xlabel('Trigrams')
ax.set_ylabel('Frequency')
# NOTE(review): only the horizontal alignment is set here; unlike the unigram
# and bigram charts no rotation is passed, so the tick labels keep whatever
# rotation the bar plot gave them — confirm this difference is intended.
plt.setp(ax.get_xticklabels(), ha='right')

# Tighten the layout and render the figure
plt.tight_layout()
plt.show()