[Click here to see the notebook used to scrap the data](Project%203%20Cleaning%20(caa%20250923%202018).ipynb)

In [1]:
# Standard Library Imports
import pandas as pd
import re
import datetime

# Third-party Library Imports
import requests
from bs4 import BeautifulSoup
import praw
import nltk
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import time
import itertools
from collections import defaultdict
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.util import bigrams

# Custom Functions or Classes (if applicable)

In [5]:
#Opening the newly created csv file

reddit_no_null = pd.read_csv('reddit_no_empty_rows.csv')

#Checking the columns for the primary datatype
data_types = reddit_no_null.dtypes

print(data_types)

unique_subreddits = reddit_no_null['Subreddit'].unique()
unique_post_types = reddit_no_null['Post Type'].unique()

print(unique_subreddits)
print(unique_post_types)

reddit_no_null

Title              object
Post Text          object
ID                 object
Score               int64
Total Comments      int64
Post URL           object
Subreddit          object
Post Type          object
Time uploaded     float64
dtype: object
['intermittentfasting' 'AnorexiaNervosa']
['hot' 'new' 'top']


Unnamed: 0,Title,Post Text,ID,Score,Total Comments,Post URL,Subreddit,Post Type,Time uploaded
0,Daily Fasting Check-in!,"* **Type** of fast (water, juice, smoking, etc...",16rk06v,3,0,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,hot,1.695622e+09
1,I did my first pullup,"After losing 15kg this year (33 pounds), I can...",16rn8qu,80,11,https://v.redd.it/ajlxmiz2edqb1,intermittentfasting,hot,1.695634e+09
2,"Yeah, I'm proud of myself for this. 30lbs gone...","First picture is from today, second picture fr...",16rl3su,76,6,https://www.reddit.com/gallery/16rl3su,intermittentfasting,hot,1.695626e+09
3,Face gains - March>Sept,5'7 - SW 255 CW217 GW 175 - started 18:6 now o...,16rg4uf,125,7,https://i.redd.it/pmos097ncbqb1.jpg,intermittentfasting,hot,1.695609e+09
4,Week 21 Accountability Update: Rumors of my de...,Sex: Male\n\nAge: 45\n\nStart date: 5-1-2023\n...,16roort,14,2,https://i.redd.it/eb2t0r8dtdqb1.jpg,intermittentfasting,hot,1.695639e+09
...,...,...,...,...,...,...,...,...,...
2483,anyone else have a fear of becoming obese?,i know that fear of gaining weight is a pretty...,kwdigq,87,24,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,1.610532e+09
2484,I am so alone.,I’m a 16 year old male who’s been suffering fr...,kq3m1u,87,15,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,1.609744e+09
2485,Chocolate Ice Cream!,"Ok, so it's not a lot, and I didn't use to be ...",kc3nau,87,16,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,1.607831e+09
2486,do you experience oDdLy specific challenges RI...,Do y’all find that as SOON as you get to a goo...,k4lpsn,85,17,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,1.606832e+09


In [6]:
#Lowercasing the header

reddit_no_null.columns = reddit_no_null.columns.str.lower()
reddit_no_null

Unnamed: 0,title,post text,id,score,total comments,post url,subreddit,post type,time uploaded
0,Daily Fasting Check-in!,"* **Type** of fast (water, juice, smoking, etc...",16rk06v,3,0,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,hot,1.695622e+09
1,I did my first pullup,"After losing 15kg this year (33 pounds), I can...",16rn8qu,80,11,https://v.redd.it/ajlxmiz2edqb1,intermittentfasting,hot,1.695634e+09
2,"Yeah, I'm proud of myself for this. 30lbs gone...","First picture is from today, second picture fr...",16rl3su,76,6,https://www.reddit.com/gallery/16rl3su,intermittentfasting,hot,1.695626e+09
3,Face gains - March>Sept,5'7 - SW 255 CW217 GW 175 - started 18:6 now o...,16rg4uf,125,7,https://i.redd.it/pmos097ncbqb1.jpg,intermittentfasting,hot,1.695609e+09
4,Week 21 Accountability Update: Rumors of my de...,Sex: Male\n\nAge: 45\n\nStart date: 5-1-2023\n...,16roort,14,2,https://i.redd.it/eb2t0r8dtdqb1.jpg,intermittentfasting,hot,1.695639e+09
...,...,...,...,...,...,...,...,...,...
2483,anyone else have a fear of becoming obese?,i know that fear of gaining weight is a pretty...,kwdigq,87,24,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,1.610532e+09
2484,I am so alone.,I’m a 16 year old male who’s been suffering fr...,kq3m1u,87,15,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,1.609744e+09
2485,Chocolate Ice Cream!,"Ok, so it's not a lot, and I didn't use to be ...",kc3nau,87,16,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,1.607831e+09
2486,do you experience oDdLy specific challenges RI...,Do y’all find that as SOON as you get to a goo...,k4lpsn,85,17,https://www.reddit.com/r/AnorexiaNervosa/comme...,AnorexiaNervosa,top,1.606832e+09


In this case, only the title, post text, the subreddit columns, and the time uploaded, were kept for analysis.

In [7]:
# Create a new DataFrame with the selected columns
selected_columns = ["title", "post text", "subreddit", "time uploaded"]
less_columns_df = reddit_no_null[selected_columns]
# Save the data to a CSV file
less_columns_df.to_csv("reddit_posts (filtered columns).csv", index=False)
less_columns_df

Unnamed: 0,title,post text,subreddit,time uploaded
0,Daily Fasting Check-in!,"* **Type** of fast (water, juice, smoking, etc...",intermittentfasting,1.695622e+09
1,I did my first pullup,"After losing 15kg this year (33 pounds), I can...",intermittentfasting,1.695634e+09
2,"Yeah, I'm proud of myself for this. 30lbs gone...","First picture is from today, second picture fr...",intermittentfasting,1.695626e+09
3,Face gains - March>Sept,5'7 - SW 255 CW217 GW 175 - started 18:6 now o...,intermittentfasting,1.695609e+09
4,Week 21 Accountability Update: Rumors of my de...,Sex: Male\n\nAge: 45\n\nStart date: 5-1-2023\n...,intermittentfasting,1.695639e+09
...,...,...,...,...
2483,anyone else have a fear of becoming obese?,i know that fear of gaining weight is a pretty...,AnorexiaNervosa,1.610532e+09
2484,I am so alone.,I’m a 16 year old male who’s been suffering fr...,AnorexiaNervosa,1.609744e+09
2485,Chocolate Ice Cream!,"Ok, so it's not a lot, and I didn't use to be ...",AnorexiaNervosa,1.607831e+09
2486,do you experience oDdLy specific challenges RI...,Do y’all find that as SOON as you get to a goo...,AnorexiaNervosa,1.606832e+09


In [8]:
#nltk.download("stopwords")
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('punkt')
#Please uncomment the above if you haven't downloaded these libraries.

In [11]:
# Get the list of English stopwords
stop_words = set(stopwords.words("english"))

# Function to filter out stopwords
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

# Apply the function to the "Title" and "Post Text" columns using .loc
# change all the long-names
less_columns_df.loc[:, "stopword_dropped_title"] = less_columns_df["title"].apply(remove_stopwords)
less_columns_df.loc[:, "stopword_dropped_post_text"] = less_columns_df["post text"].apply(remove_stopwords)

# Create a clean copy of the DataFrame with the dropped columns
no_stopwords_df = less_columns_df.drop(columns=["title", "post text"]).copy()

# Display the DataFrame with the dropped columns
no_stopwords_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  less_columns_df.loc[:, "stopword_dropped_title"] = less_columns_df["title"].apply(remove_stopwords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  less_columns_df.loc[:, "stopword_dropped_post_text"] = less_columns_df["post text"].apply(remove_stopwords)


Unnamed: 0,subreddit,time uploaded,stopword_dropped_title,stopword_dropped_post_text
0,intermittentfasting,1.695622e+09,Daily Fasting Check-in!,"* **Type** fast (water, juice, smoking, etc.) ..."
1,intermittentfasting,1.695634e+09,first pullup,"losing 15kg year (33 pounds), finally pull bod..."
2,intermittentfasting,1.695626e+09,"Yeah, I'm proud this. 30lbs gone since July!","First picture today, second picture July!"
3,intermittentfasting,1.695609e+09,Face gains - March>Sept,5'7 - SW 255 CW217 GW 175 - started 18:6 omad ...
4,intermittentfasting,1.695639e+09,Week 21 Accountability Update: Rumors demise g...,Sex: Male Age: 45 Start date: 5-1-2023 Start w...
...,...,...,...,...
2483,AnorexiaNervosa,1.610532e+09,anyone else fear becoming obese?,know fear gaining weight pretty much universal...
2484,AnorexiaNervosa,1.609744e+09,alone.,I’m 16 year old male who’s suffering anorexia ...
2485,AnorexiaNervosa,1.607831e+09,Chocolate Ice Cream!,"Ok, lot, use scared liquid calories (I count i..."
2486,AnorexiaNervosa,1.606832e+09,experience oDdLy specific challenges RIGHT rec...,y’all find SOON get good place recovery someth...
