In [1]:
# Standard Library Imports
import pandas as pd
import re
import time
import datetime
from collections import defaultdict, Counter

# Third-party Library Imports
import requests
from bs4 import BeautifulSoup
import praw
import nltk
import numpy as np
import matplotlib.pyplot as plt
import concurrent.futures
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import bigrams
from sklearn.feature_extraction.text import CountVectorizer
import spacy

In [2]:
# Set pandas display options to show the entire content of the "Post Text" column
pd.set_option('display.max_colwidth', None)

In [3]:
# Specify the path to your CSV file
csv_file_path = 'reddit_posts.csv'

# Read the CSV file into a DataFrame
df_reddit_post = pd.read_csv(csv_file_path)

# Now, 'df' contains your data from the CSV file

len(df_reddit_post)

3963

In [4]:
# Define a function to check if a post is an image-only post
def is_image_only_post(post):
    # Check if the post's content (Post Text) is empty (no text) and the URL is an image or an image-hosting site
    return not post["Post Text"] and post["Post URL"].endswith(('.jpg', '.jpeg', '.png', '.gif', '.gifv', '.webp'))

# Apply the function to the DataFrame to create a Boolean mask
image_only_mask = df_reddit_post.apply(is_image_only_post, axis=1)

# Filter out image-only posts
filtered_df_reddit_post = df_reddit_post[~image_only_mask]
print(f"Total number of posts after filter collected: {len(filtered_df_reddit_post)}")

Total number of posts after filter collected: 3963


In [5]:
# Create a new DataFrame with the selected columns
selected_columns = ["Title", "Post Text", "Subreddit"]
filtered_columns_df_reddit_post = filtered_df_reddit_post[selected_columns]
# Save the data to a CSV file
filtered_columns_df_reddit_post.to_csv("reddit_posts (no image-only post).csv", index=False)
filtered_columns_df_reddit_post.head()

Unnamed: 0,Title,Post Text,Subreddit
0,Daily Fasting Check-in!,"* **Type** of fast (water, juice, smoking, etc.)\n* **Context** of fast (start, end, day x of y, etc.)\n* **Length** of fast (8 hours, 3 days, etc.)\n* **Why?** What you hope to accomplish with your fast\n* **Notes** How is it going so far? Any concerns? Insights to share?\n\nBe sure to check back often as comments get posted throughout the day. Sort comments by ""new"" to be sure the newer comments get some love as well.",intermittentfasting
1,F/29/5’8” [246 > 134 = 112lbs]. I’ve been maintaining for a couple months now. AMA about IF and weight loss,,intermittentfasting
2,50 lbs down - 3 months,Started IF on June 19th and have now passed the 50lb mark this morning. Started on 16:8 eating breakfast and lunch only. Slowly moved up to 19:5. Exercise 2-3 days a week with either rowing or walking. I am using a eufy scale to weigh each morning and then manually measure every Sunday. I don't have a strict diet but I am trying to avoid as much sugar as possible. \n\nSW: 305\nCW: 255\nGW: 220,intermittentfasting
3,222 to 195 in 6 months,"Besides the weight I have zero aches and pains, can breathe better and my medical blood work went from 4 red flags to 0.",intermittentfasting
4,So far it seems to be working!,"[22F - 4'10"" - SW: 143.4, CW: 138.2, GW: 115]\n\nI started 16:8 with a calorie deficit on 9/13/23. I last weighed myself on 9/19/23. I went from 143.4lbs to 138.2lbs in almost a week. I’m probably below 138lbs now, but I’m not going to weigh myself again until the end of the month. With my height, 5lbs seems to make a big difference. \n\nI’m sure a lot of it was water weight and bloating. I’m still happy though, I feel better and it gives me even more motivation to stick with it.",intermittentfasting


In [6]:
# Iterate through all columns and convert text data to lowercase
for column in filtered_columns_df_reddit_post.columns:
    if filtered_columns_df_reddit_post[column].dtype == 'object':
        filtered_columns_df_reddit_post[column] = filtered_columns_df_reddit_post[column].str.lower()

# Print the resulting DataFrame
filtered_columns_df_reddit_post

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_columns_df_reddit_post[column] = filtered_columns_df_reddit_post[column].str.lower()


Unnamed: 0,Title,Post Text,Subreddit
0,daily fasting check-in!,"* **type** of fast (water, juice, smoking, etc.)\n* **context** of fast (start, end, day x of y, etc.)\n* **length** of fast (8 hours, 3 days, etc.)\n* **why?** what you hope to accomplish with your fast\n* **notes** how is it going so far? any concerns? insights to share?\n\nbe sure to check back often as comments get posted throughout the day. sort comments by ""new"" to be sure the newer comments get some love as well.",intermittentfasting
1,f/29/5’8” [246 > 134 = 112lbs]. i’ve been maintaining for a couple months now. ama about if and weight loss,,intermittentfasting
2,50 lbs down - 3 months,started if on june 19th and have now passed the 50lb mark this morning. started on 16:8 eating breakfast and lunch only. slowly moved up to 19:5. exercise 2-3 days a week with either rowing or walking. i am using a eufy scale to weigh each morning and then manually measure every sunday. i don't have a strict diet but i am trying to avoid as much sugar as possible. \n\nsw: 305\ncw: 255\ngw: 220,intermittentfasting
3,222 to 195 in 6 months,"besides the weight i have zero aches and pains, can breathe better and my medical blood work went from 4 red flags to 0.",intermittentfasting
4,so far it seems to be working!,"[22f - 4'10"" - sw: 143.4, cw: 138.2, gw: 115]\n\ni started 16:8 with a calorie deficit on 9/13/23. i last weighed myself on 9/19/23. i went from 143.4lbs to 138.2lbs in almost a week. i’m probably below 138lbs now, but i’m not going to weigh myself again until the end of the month. with my height, 5lbs seems to make a big difference. \n\ni’m sure a lot of it was water weight and bloating. i’m still happy though, i feel better and it gives me even more motivation to stick with it.",intermittentfasting
...,...,...,...
3958,anyone else have a fear of becoming obese?,"i know that fear of gaining weight is a pretty much universal thing for anorexia, but does anyone else have a fear of actual obesity? like i’m genuinely terrified that once i start eating more i just won’t stop. i’ll just keep eating and eating and never stop until i’m 300 pounds because i lost all my self control when i stopped restricting. plus hearing about how many people go from anorexia to binge eating disorder doesn’t help either. i know it’s not like that for everyone but what if it is for me? i’m so terrified of it that i literally have nightmares about it, and imaging it happening makes me feel sick with anxiety. i just hope i’m not the only one cus i feel crazy",anorexianervosa
3959,i am so alone.,"i’m a 16 year old male who’s been suffering from anorexia for two years now, entering my 3rd year and have been fake recovered to some extent like 5 times now but every time has me going back down the same path i was on when restricting.\n\ni’m approaching the weight i was at when i was at my worst and everyday i have to deal with constantly being body shamed by my family/friends who have no idea what i’m going through no matter how many attempts i’ve given to explain fully about what i’m going through.\n\nbecause i’m a boy i don’t get a lot of recognition for these struggles, i’m just “the skinny one who doesn’t eat a lot” in peoples eyes.\n\non top of this i live in a place where there’s virtually nowhere to go for eating disorders, there’s no programs or people to help me. i feel like i’m approaching the end of my road and at this point i am ready to stop fighting and give up. i want to eat, i want to recover, but i feel like it’s a forever cycle i’m stuck on of restricting, hitting rock bottom, binging, and then restricting again.",anorexianervosa
3960,chocolate ice cream!,"ok, so it's not a lot, and i didn't use to be as scared of liquid calories (i count ice cream towards liquid calories) as i am now. but i'm eating chocolate ice cream! and i'm so proud of myself! even if i do feel sick (because of motion sickness), i'm still going to eat this entire scoop of ice cream!",anorexianervosa
3961,do you experience oddly specific challenges right after recovering that tempt you back into your eating disorder?,"do y’all find that as soon as you get to a good place in recovery something comes along that wants to push you right back? whether it’s a triggering comment, a break up, a new relationship, someone close to you going on a diet, or some other triggering situation? \n\ni know that challenges will come but it’s just crazy how quickly and specific they seem to be to your own personal weaknesses. like for me as soon as i was getting better my now boyfriend asked me out. which was great but also what starting my eating disorder in the first place years ago so it was soooo hard to not relapse (& i kinda did a bit). \n\ni’d love to hear your stories with this if it’s similar! i just feel like i’ve noticed this theme.",anorexianervosa


In [7]:
nltk.download("stopwords")
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\weege\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\weege\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\weege\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\weege\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Get the list of English stopwords
stop_words = set(stopwords.words("english"))

# Function to filter out stopwords
def remove_stopwords(text):
    if isinstance(text, str):  # Check if the value is a string (not NaN)
        words = text.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return " ".join(filtered_words)
    else:
        return text  # Return the original value if it's NaN

# Apply the function to the "Title" and "Post Text" columns using .loc
filtered_columns_df_reddit_post["Stopword Dropped Title"] = filtered_columns_df_reddit_post["Title"].apply(remove_stopwords)
filtered_columns_df_reddit_post["Stopword Dropped Post Text"] = filtered_columns_df_reddit_post["Post Text"].apply(remove_stopwords)

# Create a clean copy of the DataFrame with the dropped columns
dropped_filtered_columns_dataframe = filtered_columns_df_reddit_post.drop(columns=["Title", "Post Text"]).copy()

# Display the DataFrame with the dropped columns
dropped_filtered_columns_dataframe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_columns_df_reddit_post["Stopword Dropped Title"] = filtered_columns_df_reddit_post["Title"].apply(remove_stopwords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_columns_df_reddit_post["Stopword Dropped Post Text"] = filtered_columns_df_reddit_post["Post Text"].apply(remove_stopwords)


Unnamed: 0,Subreddit,Stopword Dropped Title,Stopword Dropped Post Text
0,intermittentfasting,daily fasting check-in!,"* **type** fast (water, juice, smoking, etc.) * **context** fast (start, end, day x y, etc.) * **length** fast (8 hours, 3 days, etc.) * **why?** hope accomplish fast * **notes** going far? concerns? insights share? sure check back often comments get posted throughout day. sort comments ""new"" sure newer comments get love well."
1,intermittentfasting,f/29/5’8” [246 > 134 = 112lbs]. i’ve maintaining couple months now. ama weight loss,
2,intermittentfasting,50 lbs - 3 months,started june 19th passed 50lb mark morning. started 16:8 eating breakfast lunch only. slowly moved 19:5. exercise 2-3 days week either rowing walking. using eufy scale weigh morning manually measure every sunday. strict diet trying avoid much sugar possible. sw: 305 cw: 255 gw: 220
3,intermittentfasting,222 195 6 months,"besides weight zero aches pains, breathe better medical blood work went 4 red flags 0."
4,intermittentfasting,far seems working!,"[22f - 4'10"" - sw: 143.4, cw: 138.2, gw: 115] started 16:8 calorie deficit 9/13/23. last weighed 9/19/23. went 143.4lbs 138.2lbs almost week. i’m probably 138lbs now, i’m going weigh end month. height, 5lbs seems make big difference. i’m sure lot water weight bloating. i’m still happy though, feel better gives even motivation stick it."
...,...,...,...
3958,anorexianervosa,anyone else fear becoming obese?,"know fear gaining weight pretty much universal thing anorexia, anyone else fear actual obesity? like i’m genuinely terrified start eating won’t stop. i’ll keep eating eating never stop i’m 300 pounds lost self control stopped restricting. plus hearing many people go anorexia binge eating disorder doesn’t help either. know it’s like everyone me? i’m terrified literally nightmares it, imaging happening makes feel sick anxiety. hope i’m one cus feel crazy"
3959,anorexianervosa,alone.,"i’m 16 year old male who’s suffering anorexia two years now, entering 3rd year fake recovered extent like 5 times every time going back path restricting. i’m approaching weight worst everyday deal constantly body shamed family/friends idea i’m going matter many attempts i’ve given explain fully i’m going through. i’m boy don’t get lot recognition struggles, i’m “the skinny one doesn’t eat lot” peoples eyes. top live place there’s virtually nowhere go eating disorders, there’s programs people help me. feel like i’m approaching end road point ready stop fighting give up. want eat, want recover, feel like it’s forever cycle i’m stuck restricting, hitting rock bottom, binging, restricting again."
3960,anorexianervosa,chocolate ice cream!,"ok, lot, use scared liquid calories (i count ice cream towards liquid calories) now. i'm eating chocolate ice cream! i'm proud myself! even feel sick (because motion sickness), i'm still going eat entire scoop ice cream!"
3961,anorexianervosa,experience oddly specific challenges right recovering tempt back eating disorder?,"y’all find soon get good place recovery something comes along wants push right back? whether it’s triggering comment, break up, new relationship, someone close going diet, triggering situation? know challenges come it’s crazy quickly specific seem personal weaknesses. like soon getting better boyfriend asked out. great also starting eating disorder first place years ago soooo hard relapse (& kinda bit). i’d love hear stories it’s similar! feel like i’ve noticed theme."


In [9]:
# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

# Create a function to process and modify text
def process_text(text):
    if isinstance(text, str):
        doc = nlp(text)
        modified_tokens = []
        for token in doc:
            if token.text.lower() == "i'm":
                modified_tokens.extend(["i", "am"])
            elif token.text.lower() == "emma's":
                modified_tokens.append("emma")
            else:
                modified_tokens.append(token.text)
        return " ".join(modified_tokens)
    else:
        return text

# Loop through all columns and apply the processing function
for column in dropped_filtered_columns_dataframe.columns:
    if dropped_filtered_columns_dataframe[column].dtype == 'object':
        dropped_filtered_columns_dataframe[column] = dropped_filtered_columns_dataframe[column].apply(process_text)

# Print the resulting DataFrame
dropped_filtered_columns_dataframe

Unnamed: 0,Subreddit,Stopword Dropped Title,Stopword Dropped Post Text
0,intermittentfasting,daily fasting check - in !,"* * * type * * fast ( water , juice , smoking , etc . ) * * * context * * fast ( start , end , day x y , etc . ) * * * length * * fast ( 8 hours , 3 days , etc . ) * * * why ? * * hope accomplish fast * * * notes * * going far ? concerns ? insights share ? sure check back often comments get posted throughout day . sort comments "" new "" sure newer comments get love well ."
1,intermittentfasting,f/29/5’8 ” [ 246 > 134 = 112lbs ] . i ’ve maintaining couple months now . ama weight loss,
2,intermittentfasting,50 lbs - 3 months,started june 19th passed 50 lb mark morning . started 16:8 eating breakfast lunch only . slowly moved 19:5 . exercise 2 - 3 days week either rowing walking . using eufy scale weigh morning manually measure every sunday . strict diet trying avoid much sugar possible . sw : 305 cw : 255 gw : 220
3,intermittentfasting,222 195 6 months,"besides weight zero aches pains , breathe better medical blood work went 4 red flags 0 ."
4,intermittentfasting,far seems working !,"[ 22f - 4'10 "" - sw : 143.4 , cw : 138.2 , gw : 115 ] started 16:8 calorie deficit 9/13/23 . last weighed 9/19/23 . went 143.4lbs 138.2lbs almost week . i ’m probably 138lbs now , i ’m going weigh end month . height , 5lbs seems make big difference . i ’m sure lot water weight bloating . i ’m still happy though , feel better gives even motivation stick it ."
...,...,...,...
3958,anorexianervosa,anyone else fear becoming obese ?,"know fear gaining weight pretty much universal thing anorexia , anyone else fear actual obesity ? like i ’m genuinely terrified start eating wo n’t stop . i ’ll keep eating eating never stop i ’m 300 pounds lost self control stopped restricting . plus hearing many people go anorexia binge eating disorder does n’t help either . know it ’s like everyone me ? i ’m terrified literally nightmares it , imaging happening makes feel sick anxiety . hope i ’m one cus feel crazy"
3959,anorexianervosa,alone .,"i ’m 16 year old male who ’s suffering anorexia two years now , entering 3rd year fake recovered extent like 5 times every time going back path restricting . i ’m approaching weight worst everyday deal constantly body shamed family / friends idea i ’m going matter many attempts i ’ve given explain fully i ’m going through . i ’m boy do n’t get lot recognition struggles , i ’m “ the skinny one does n’t eat lot ” peoples eyes . top live place there ’s virtually nowhere go eating disorders , there ’s programs people help me . feel like i ’m approaching end road point ready stop fighting give up . want eat , want recover , feel like it ’s forever cycle i ’m stuck restricting , hitting rock bottom , binging , restricting again ."
3960,anorexianervosa,chocolate ice cream !,"ok , lot , use scared liquid calories ( i count ice cream towards liquid calories ) now . i 'm eating chocolate ice cream ! i 'm proud myself ! even feel sick ( because motion sickness ) , i 'm still going eat entire scoop ice cream !"
3961,anorexianervosa,experience oddly specific challenges right recovering tempt back eating disorder ?,"y’ all find soon get good place recovery something comes along wants push right back ? whether it ’s triggering comment , break up , new relationship , someone close going diet , triggering situation ? know challenges come it ’s crazy quickly specific seem personal weaknesses . like soon getting better boyfriend asked out . great also starting eating disorder first place years ago soooo hard relapse ( & kinda bit ) . i ’d love hear stories it ’s similar ! feel like i ’ve noticed theme ."


In [10]:
# Remove punctuation from specific columns
columns_to_clean = ['Stopword Dropped Post Text', 'Stopword Dropped Title']

for column in columns_to_clean:
    dropped_filtered_columns_dataframe[column] = dropped_filtered_columns_dataframe[column].str.replace(r'[^\w\s]', '', regex=True)

# Display the DataFrame
dropped_filtered_columns_dataframe

Unnamed: 0,Subreddit,Stopword Dropped Title,Stopword Dropped Post Text
0,intermittentfasting,daily fasting check in,type fast water juice smoking etc context fast start end day x y etc length fast 8 hours 3 days etc why hope accomplish fast notes going far concerns insights share sure check back often comments get posted throughout day sort comments new sure newer comments get love well
1,intermittentfasting,f2958 246 134 112lbs i ve maintaining couple months now ama weight loss,
2,intermittentfasting,50 lbs 3 months,started june 19th passed 50 lb mark morning started 168 eating breakfast lunch only slowly moved 195 exercise 2 3 days week either rowing walking using eufy scale weigh morning manually measure every sunday strict diet trying avoid much sugar possible sw 305 cw 255 gw 220
3,intermittentfasting,222 195 6 months,besides weight zero aches pains breathe better medical blood work went 4 red flags 0
4,intermittentfasting,far seems working,22f 410 sw 1434 cw 1382 gw 115 started 168 calorie deficit 91323 last weighed 91923 went 1434lbs 1382lbs almost week i m probably 138lbs now i m going weigh end month height 5lbs seems make big difference i m sure lot water weight bloating i m still happy though feel better gives even motivation stick it
...,...,...,...
3958,anorexianervosa,anyone else fear becoming obese,know fear gaining weight pretty much universal thing anorexia anyone else fear actual obesity like i m genuinely terrified start eating wo nt stop i ll keep eating eating never stop i m 300 pounds lost self control stopped restricting plus hearing many people go anorexia binge eating disorder does nt help either know it s like everyone me i m terrified literally nightmares it imaging happening makes feel sick anxiety hope i m one cus feel crazy
3959,anorexianervosa,alone,i m 16 year old male who s suffering anorexia two years now entering 3rd year fake recovered extent like 5 times every time going back path restricting i m approaching weight worst everyday deal constantly body shamed family friends idea i m going matter many attempts i ve given explain fully i m going through i m boy do nt get lot recognition struggles i m the skinny one does nt eat lot peoples eyes top live place there s virtually nowhere go eating disorders there s programs people help me feel like i m approaching end road point ready stop fighting give up want eat want recover feel like it s forever cycle i m stuck restricting hitting rock bottom binging restricting again
3960,anorexianervosa,chocolate ice cream,ok lot use scared liquid calories i count ice cream towards liquid calories now i m eating chocolate ice cream i m proud myself even feel sick because motion sickness i m still going eat entire scoop ice cream
3961,anorexianervosa,experience oddly specific challenges right recovering tempt back eating disorder,y all find soon get good place recovery something comes along wants push right back whether it s triggering comment break up new relationship someone close going diet triggering situation know challenges come it s crazy quickly specific seem personal weaknesses like soon getting better boyfriend asked out great also starting eating disorder first place years ago soooo hard relapse kinda bit i d love hear stories it s similar feel like i ve noticed theme


In [17]:
# Combine the "Stopword Dropped Title" and "Stopword Dropped Post Text" columns into a single column
dropped_filtered_columns_dataframe['Combined Text'] = dropped_filtered_columns_dataframe['Stopword Dropped Title'].fillna('') + ' ' + dropped_filtered_columns_dataframe['Stopword Dropped Post Text'].fillna('')  # Replace NaN with empty string ''

# Initialize the CountVectorizer for unigrams, bigrams, and trigrams
ngram_range = (1, 3)  # Change to (1, 1) for unigrams, (2, 2) for bigrams, or (1, 2) for unigrams and bigrams
vectorizer = CountVectorizer(ngram_range=ngram_range)

# Fit and transform the combined text into unigrams, bigrams, and trigrams
ngram_counts = vectorizer.fit_transform(dropped_filtered_columns_dataframe['Combined Text'])
ngram_feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame to store the n-gram counts
ngram_counts_df = pd.DataFrame(ngram_counts.toarray(), columns=ngram_feature_names)

# Add the 'Subreddit' column back to the DataFrame
ngram_counts_df['Subreddit'] = dropped_filtered_columns_dataframe['Subreddit']

# Group the DataFrame by 'Subreddit' and sum the counts
grouped_ngram_counts = ngram_counts_df.groupby('Subreddit').sum()

# Get a list of unique subreddits
unique_subreddits = dropped_filtered_columns_dataframe['Subreddit'].unique()

# Dictionary to store n-gram counts for each subreddit
subreddit_ngram_counts = {}

# Loop through each unique subreddit
for subreddit_name in unique_subreddits:
    # Get the specific n-gram count for the subreddit
    specific_ngram_count = grouped_ngram_counts.loc[subreddit_name]
    
    # To get the top N most common n-grams for a subreddit, you can use:
    top_n = 10  # Replace with the desired number
    top_n_ngrams = specific_ngram_count.nlargest(top_n)
    
    # Store the top N n-grams in the dictionary
    subreddit_ngram_counts[subreddit_name] = top_n_ngrams

# Now, subreddit_ngram_counts contains the combined unigram, bigram, and trigram data for creating a bar chart or further analysis.

In [16]:
# Loop through each subreddit and its n-grams
for subreddit_name, ngram_counts in subreddit_ngram_counts.items():
    print(f"Subreddit: {subreddit_name}")
    
    # Print unigrams
    print("Unigrams:")
    unigrams = [ngram for ngram in ngram_counts.index if len(ngram.split()) == 1]
    for unigram in unigrams:
        count = ngram_counts[unigram]
        print(f"{unigram}: {count}")
    
    # Print bigrams
    print("\nBigrams:")
    bigrams = [ngram for ngram in ngram_counts.index if len(ngram.split()) == 2]
    for bigram in bigrams:
        count = ngram_counts[bigram]
        print(f"{bigram}: {count}")
    
    # Print trigrams
    print("\nTrigrams:")
    trigrams = [ngram for ngram in ngram_counts.index if len(ngram.split()) == 3]
    for trigram in trigrams:
        count = ngram_counts[trigram]
        print(f"{trigram}: {count}")
    
    print()

Subreddit: intermittentfasting
Unigrams:
weight: 815
fasting: 764
ve: 652
fast: 645
day: 567
nt: 531
it: 472
eating: 467
days: 447
eat: 411

Bigrams:

Trigrams:

Subreddit: anorexianervosa
Unigrams:
nt: 2333
like: 1819
feel: 1423
it: 1262
weight: 1152
do: 1129
ve: 1077
eating: 1031
know: 1030
eat: 1003

Bigrams:

Trigrams:



In [None]:
# Convert the dictionary to a DataFrame
df_unigrams = pd.DataFrame(subreddit_unigram_counts)

# Plot the data as a bar chart
ax = df_unigrams.plot(kind='bar', figsize=(10, 6))
plt.title('Top Unigrams in Subreddits')
plt.xlabel('Unigrams')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Initialize the CountVectorizer for bigrams
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
bigram_counts = bigram_vectorizer.fit_transform(dropped_filtered_columns_dataframe['Combined Unigrams'])
bigram_feature_names = bigram_vectorizer.get_feature_names_out()

# Create a DataFrame to store the bigram counts
bigram_counts_df = pd.DataFrame(bigram_counts.toarray(), columns=bigram_feature_names)

# Add the 'Subreddit' column back to the DataFrame
bigram_counts_df['Subreddit'] = dropped_filtered_columns_dataframe['Subreddit']

# Group the DataFrame by 'Subreddit' and sum the counts
grouped_bigram_counts = bigram_counts_df.groupby('Subreddit').sum()

# Get a list of unique subreddits
unique_subreddits = dropped_filtered_columns_dataframe['Subreddit'].unique()

# Dictionary to store bigram counts for each subreddit
subreddit_bigram_counts = {}

# Loop through each unique subreddit
for subreddit_name in unique_subreddits:
    # Get the specific bigram count for the subreddit
    specific_bigram_count = grouped_bigram_counts.loc[subreddit_name]
    
    # To get the top N most common bigrams for a subreddit, you can use:
    top_n = 10  # Replace with the desired number
    top_n_bigrams = specific_bigram_count.nlargest(top_n)
    
    # Store the top N bigrams in the dictionary
    subreddit_bigram_counts[subreddit_name] = top_n_bigrams

# Loop through each subreddit and its top N bigrams
for subreddit_name, top_n_bigrams in subreddit_bigram_counts.items():
    print(f"Subreddit: {subreddit_name}")
    print(top_n_bigrams)
    print()

In [None]:
# Convert the dictionary to a DataFrame
df_bigram = pd.DataFrame(subreddit_bigram_counts)

# Plot the data as a bar chart
ax = df_bigram.plot(kind='bar', figsize=(10, 6))
plt.title('Top Bigrams in Subreddits')
plt.xlabel('Bigrams')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Initialize the CountVectorizer for trigrams
trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))
trigram_counts = trigram_vectorizer.fit_transform(dropped_filtered_columns_dataframe['Combined Unigrams'])
trigram_feature_names = trigram_vectorizer.get_feature_names_out()

# Create a DataFrame to store the trigram counts
trigram_counts_df = pd.DataFrame(trigram_counts.toarray(), columns=trigram_feature_names)

# Add the 'Subreddit' column back to the DataFrame
trigram_counts_df['Subreddit'] = dropped_filtered_columns_dataframe['Subreddit']

# Group the DataFrame by 'Subreddit' and sum the counts
grouped_trigram_counts = trigram_counts_df.groupby('Subreddit').sum()

# Get a list of unique subreddits
unique_subreddits = dropped_filtered_columns_dataframe['Subreddit'].unique()

# Dictionary to store trigram counts for each subreddit
subreddit_trigram_counts = {}

# Loop through each unique subreddit
for subreddit_name in unique_subreddits:
    # Get the specific trigram count for the subreddit
    specific_trigram_count = grouped_trigram_counts.loc[subreddit_name]
    
    # To get the top N most common trigrams for a subreddit, you can use:
    top_n = 10  # Replace with the desired number
    top_n_trigrams = specific_trigram_count.nlargest(top_n)
    
    # Store the top N trigrams in the dictionary
    subreddit_trigram_counts[subreddit_name] = top_n_trigrams

# Loop through each subreddit and its top N trigrams
for subreddit_name, top_n_trigrams in subreddit_trigram_counts.items():
    print(f"Subreddit: {subreddit_name}")
    print(top_n_trigrams)
    print()

In [None]:
# Convert the dictionary to a DataFrame
df_trigram = pd.DataFrame(subreddit_trigram_counts)

# Plot the data as a bar chart
ax = df_trigram.plot(kind='bar', figsize=(10, 6))
plt.title('Top Trigrams in Subreddits')
plt.xlabel('Trigrams')
plt.ylabel('Frequency')
plt.xticks(ha='right')

# Show the plot
plt.tight_layout()
plt.show()