[Click here to see the notebook used to scrap the data](Project%203%20Scrapping%20(caa%20250923%202059).ipynb)

In [1]:
# Standard Library Imports
import pandas as pd
import re
import datetime

# Third-party Library Imports
import requests
from bs4 import BeautifulSoup
import praw
import nltk
import numpy as np
import matplotlib.pyplot as plt
import concurrent.futures
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import time
import itertools
from collections import defaultdict, Counter
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import bigrams
from sklearn.feature_extraction.text import CountVectorizer
import string

# Custom Functions or Classes (if applicable)

In [13]:
#Opening the newly created csv file

reddit_df = pd.read_csv('reddit_posts_datetime.csv')

#Checking the columns for the primary datatype
data_types1 = reddit_df.dtypes

print(data_types1)

unique_subreddits = reddit_df['Subreddit'].unique()
unique_post_types = reddit_df['Post Type'].unique()
unique_id = reddit_df['ID'].unique()

print(unique_subreddits)
print(unique_post_types)
print(len(unique_id))

reddit_df

Title             object
Post Text         object
ID                object
Score              int64
Total Comments     int64
Post URL          object
Subreddit         object
Post Type         object
Time uploaded     object
dtype: object
['intermittentfasting' 'AnorexiaNervosa']
['new' 'hot' 'top']
3960


Unnamed: 0,Title,Post Text,ID,Score,Total Comments,Post URL,Subreddit,Post Type,Time uploaded
0,Does taking flavoured creatine break a fast?,"Taking one scoop, roughly 3g. It has sucralose...",16shh83,1,0,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 07:57:13
1,I lost 120 lbs.......she lost 80. One meal a d...,,16shbmz,6,1,https://i.redd.it/cft42u8lso151.jpg,intermittentfasting,new,2023-09-26 07:46:54
2,Does fasting out of spite work?,We’ll see in 4 weeks when I go to a wedding wh...,16sfrlc,0,2,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 06:10:27
3,Daily Fasting Check-in!,"* **Type** of fast (water, juice, smoking, etc...",16sfl07,1,0,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 06:00:31
4,90 Days of Intermittent Fasting - IT WORKS!,"Hi Everyone, \n\nToday was the 90th day of my ...",16sdl2e,17,8,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 04:10:24
...,...,...,...,...,...,...,...,...,...
3955,anyone else have a fear of becoming obese?,i know that fear of gaining weight is a pretty...,kwdigq,88,24,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,2021-01-13 10:02:54
3956,I am so alone.,I’m a 16 year old male who’s been suffering fr...,kq3m1u,87,15,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,2021-01-04 07:11:00
3957,Chocolate Ice Cream!,"Ok, so it's not a lot, and I didn't use to be ...",kc3nau,87,16,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,2020-12-13 03:51:10
3958,do you experience oDdLy specific challenges RI...,Do y’all find that as SOON as you get to a goo...,k4lpsn,88,17,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,2020-12-01 14:20:46


In [3]:
#Lowercasing and snake_casing the header

reddit_df.columns = [col.lower().replace(" ", "_") for col in reddit_df.columns]
reddit_df.head()

Unnamed: 0,title,post_text,id,score,total_comments,post_url,subreddit,post_type,time_uploaded
0,Does taking flavoured creatine break a fast?,"Taking one scoop, roughly 3g. It has sucralose...",16shh83,1,0,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 07:57:13
1,I lost 120 lbs.......she lost 80. One meal a d...,,16shbmz,6,1,https://i.redd.it/cft42u8lso151.jpg,intermittentfasting,new,2023-09-26 07:46:54
2,Does fasting out of spite work?,We’ll see in 4 weeks when I go to a wedding wh...,16sfrlc,0,2,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 06:10:27
3,Daily Fasting Check-in!,"* **Type** of fast (water, juice, smoking, etc...",16sfl07,1,0,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 06:00:31
4,90 Days of Intermittent Fasting - IT WORKS!,"Hi Everyone, \n\nToday was the 90th day of my ...",16sdl2e,17,8,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 04:10:24


We needed as much text for our analysis as possible, including the title. This is due to some people posting only having a title and an image. Such as this: 

<a href="https://www.reddit.com/r/intermittentfasting/comments/16shbmz/i_lost_120_lbsshe_lost_80_one_meal_a_day_from/">
    <figure>
        <img src="Reddit_post_with_image_and_title_with_no_text_example.png" width="250" height="250" />
        <figcaption>A typical image-only reddit post</figcaption>
    </figure>
</a>

In [4]:
# Create new column 'title_&_text', an addition of words from the 'title' and 'post_text' columns
# To get more words and gather more sentiments

reddit_df['title_and_text'] = reddit_df['title'] + ' ' + reddit_df['post_text']
reddit_df.head()

Unnamed: 0,title,post_text,id,score,total_comments,post_url,subreddit,post_type,time_uploaded,title_and_text
0,Does taking flavoured creatine break a fast?,"Taking one scoop, roughly 3g. It has sucralose...",16shh83,1,0,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 07:57:13,Does taking flavoured creatine break a fast? T...
1,I lost 120 lbs.......she lost 80. One meal a d...,,16shbmz,6,1,https://i.redd.it/cft42u8lso151.jpg,intermittentfasting,new,2023-09-26 07:46:54,
2,Does fasting out of spite work?,We’ll see in 4 weeks when I go to a wedding wh...,16sfrlc,0,2,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 06:10:27,Does fasting out of spite work? We’ll see in 4...
3,Daily Fasting Check-in!,"* **Type** of fast (water, juice, smoking, etc...",16sfl07,1,0,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 06:00:31,Daily Fasting Check-in! * **Type** of fast (wa...
4,90 Days of Intermittent Fasting - IT WORKS!,"Hi Everyone, \n\nToday was the 90th day of my ...",16sdl2e,17,8,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,new,2023-09-26 04:10:24,90 Days of Intermittent Fasting - IT WORKS! Hi...


In [5]:
# Count the blank cells in both the "post_text" and the "title" columns
null_count1 = reddit_df["post_text"].isnull().sum()
null_count2 = reddit_df["title"].isnull().sum()

# Display the count of blank cells
print("Number of blank cells in post_text:", null_count1)
print("Number of blank cells in title:", null_count2)

Number of blank cells in post_text: 1464
Number of blank cells in title: 0


In [6]:
#nltk.download("stopwords")
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('punkt')
#Please uncomment the above if you haven't downloaded these libraries.

In [7]:
# Get the list of English stopwords
stop_words = set(stopwords.words("english"))

# Function to filter out stopwords and handle blank cells
def remove_stopwords(text, title):
    if pd.isna(text) or text.strip() == "":
        return title  # Use the title content if text is blank
    
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

# Apply the function to the "title_&_text" column using .loc
reddit_df["stopword_dropped_title_and_text"] = reddit_df.apply(lambda row: remove_stopwords(row["title_and_text"], row["title"]), axis=1)

# Remove duplicate content in "stopword_dropped_title_and_text" column
reddit_df["stopword_dropped_title_and_text"] = reddit_df["stopword_dropped_title_and_text"].where(reddit_df["stopword_dropped_title_and_text"] != reddit_df["title"], reddit_df["stopword_dropped_title_and_text"].str.split().str[1:].str.join(" "))

# Create a clean copy of the DataFrame with the dropped columns
reddit_stopword_df = reddit_df.drop(columns=["title", "post_text", "title_and_text", "post_url", "id"]).copy()

# Display the DataFrame with the dropped columns
reddit_stopword_df.head()

Unnamed: 0,score,total_comments,subreddit,post_type,time_uploaded,stopword_dropped_title_and_text
0,1,0,intermittentfasting,new,2023-09-26 07:57:13,taking flavoured creatine break fast? Taking o...
1,6,1,intermittentfasting,new,2023-09-26 07:46:54,lost 120 lbs.......she lost 80. One meal a day...
2,0,2,intermittentfasting,new,2023-09-26 06:10:27,fasting spite work? We’ll see 4 weeks go weddi...
3,1,0,intermittentfasting,new,2023-09-26 06:00:31,Daily Fasting Check-in! * **Type** fast (water...
4,17,8,intermittentfasting,new,2023-09-26 04:10:24,90 Days Intermittent Fasting - WORKS! Hi Every...


In [14]:
#Checking datatype after creation of reddit_stopword_df

data_types2 = reddit_stopword_df.dtypes

print(data_types2)

score                               int64
total_comments                      int64
subreddit                          object
post_type                          object
time_uploaded                      object
stopword_dropped_title_and_text    object
dtype: object


In [8]:
# Count the blank cells in the "title_and_text" column
null_count3 = reddit_df["stopword_dropped_title_and_text"].isnull().sum()

# Display the count of blank cells
print("Number of blank cells in stopword_dropped_title_and_text:", null_count3)

Number of blank cells in stopword_dropped_title_and_text: 0


In [9]:
# Define a function that does the following cleaning steps:
## removes punctuation,
## tokenize,
## lowercase,
## removes stopwords,
## stemming,
## lemmatizing

# And returns stemmed text in a column and lemmatized text in the next column

ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

def clean(text):

    remove_punct = "".join([char for char in text if char not in string.punctuation])

    tokenize = re.split('\W+', remove_punct)
    
    lowercase = [word.lower() for word in tokenize]
    
    custom_stopwords = ['eating', 'much', 'make', 'keep', 'something', 
                        'dont', 'last', 'eat', 'back', 'years', 
                        've', 'well', 'body', 'lost', 'still', 
                        'going', 'weight', 'good', 'want', 'every', 
                        'got', 'anyone', 'im', 'fat', 'days', 
                        'need', 'calories', 'like', 'would', 'today', 
                        'started', 'time', 'see', 'first', 'get', 
                        'think', 'months', 'one', 'take', 'know', 
                        'feeling', 'food', 'go', 'since', 'it', 
                        'way', 'ive', 'day', 'help', 'don', 
                        'trying', 'feel', 'really', 'also', 'even', 
                        'better', 'lot']
        
    all_stopwords = stopwords.words('english') + custom_stopwords
    no_stopwords = [word for word in lowercase if word not in all_stopwords]
    
    stemmed = [ps.stem(word) for word in no_stopwords]
    
    lemmatized = [wn.lemmatize(word) for word in no_stopwords]
    
    return (stemmed, lemmatized)

# Create a new DataFrame to store the cleaned data
cleaned_df = reddit_stopword_df.copy()

# Apply the 'clean' function to your original DataFrame
cleaned_df[['title_text_stemmed', 'title_text_lemmatized']] = reddit_stopword_df['stopword_dropped_title_and_text'].apply(clean).apply(pd.Series)

#saving the dataframe as another CSV file
cleaned_df.to_csv("reddit_cleaned_final.csv", index=False)

cleaned_df.head()

Unnamed: 0,score,total_comments,subreddit,post_type,time_uploaded,stopword_dropped_title_and_text,title_text_stemmed,title_text_lemmatized
0,1,0,intermittentfasting,new,2023-09-26 07:57:13,taking flavoured creatine break fast? Taking o...,"[take, flavour, creatin, break, fast, take, sc...","[taking, flavoured, creatine, break, fast, tak..."
1,6,1,intermittentfasting,new,2023-09-26 07:46:54,lost 120 lbs.......she lost 80. One meal a day...,"[120, lbsshe, 80, meal, ]","[120, lbsshe, 80, meal, ]"
2,0,2,intermittentfasting,new,2023-09-26 06:10:27,fasting spite work? We’ll see 4 weeks go weddi...,"[fast, spite, work, 4, week, wed, bh, sister, ...","[fasting, spite, work, 4, week, wedding, bh, s..."
3,1,0,intermittentfasting,new,2023-09-26 06:00:31,Daily Fasting Check-in! * **Type** fast (water...,"[daili, fast, checkin, type, fast, water, juic...","[daily, fasting, checkin, type, fast, water, j..."
4,17,8,intermittentfasting,new,2023-09-26 04:10:24,90 Days Intermittent Fasting - WORKS! Hi Every...,"[90, intermitt, fast, work, hi, everyon, 90th,...","[90, intermittent, fasting, work, hi, everyone..."


In [12]:
# Count the blank cells in both the "title_text_stemmed" and "title_text_lemmatized" columns
null_count4 = cleaned_df["title_text_stemmed"].isnull().sum()
null_count5 = cleaned_df["title_text_lemmatized"].isnull().sum()

# Display the count of blank cells
print("Number of blank cells in title_text_stemmed:", null_count4)
print("Number of blank cells in title_text_lemmatized:", null_count5)

Number of blank cells in title_text_stemmed: 0
Number of blank cells in title_text_lemmatized: 0


In [15]:
#Checking datatype after creation of cleaned_df

data_types3 = cleaned_df.dtypes

print(data_types3)

score                               int64
total_comments                      int64
subreddit                          object
post_type                          object
time_uploaded                      object
stopword_dropped_title_and_text    object
title_text_stemmed                 object
title_text_lemmatized              object
dtype: object
