In [97]:
import numpy as np
import pandas as pd
import requests
from datetime import datetime

In [98]:
with open('pass.txt', 'r') as f:
    lines = f.readlines()

pwd = lines[0].strip()
CLIENT_ID = lines[1].strip()
SECRET_ID = lines[2].strip()

In [99]:
data = {
    'grant_type': 'password',
    'username': 'eJechtion4',
    'password': pwd
}
headers = {"User-Agent": "MyAPI/1.0.0"}
auth = requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_ID)

res = requests.post('https://www.reddit.com/api/v1/access_token',
auth = auth, data = data, headers = headers)

TOKEN = res.json()["access_token"]
headers["Authorization"] = f'bearer {TOKEN}'


In [100]:
res = requests.get("https://oauth.reddit.com/r/AskReddit/hot", headers = headers, params= {'limit': '100'})

In [101]:
# Extract relevant data from API response
data = []
for post in res.json()["data"]["children"]:
    post_date = datetime.utcfromtimestamp(post["data"]["created_utc"]).strftime('%Y-%m-%d %H:%M:%S')
    data.append({
        "post_id": post["data"]["id"],
        "post_title": post["data"]["title"],
        "post_text": post["data"]["selftext"],
        "post_date": post_date,
        "author": post["data"]["author"],
        "subreddit": post["data"]["subreddit"],
        "upvotes": post["data"]["ups"],
        "downvotes": post["data"]["downs"],
        "num_comments": post["data"]["num_comments"]
    })

# Create pandas DataFrame from extracted data
df = pd.DataFrame(data)

In [102]:
# Extract relevant data from API response
data_hot = []
for post_hot in res_hot.json()["data"]["children"]:
    post_date = datetime.utcfromtimestamp(post["data"]["created_utc"]).strftime('%Y-%m-%d %H:%M:%S')
    data.append({
        "post_id": post["data"]["id"],
        "post_title": post["data"]["title"],
        "post_text": post["data"]["selftext"],
        "post_date": post_date,
        "author": post["data"]["author"],
        "subreddit": post["data"]["subreddit"],
        "upvotes": post["data"]["ups"],
        "downvotes": post["data"]["downs"],
        "num_comments": post["data"]["num_comments"]
    })

# Create pandas DataFrame from extracted data
df = pd.DataFrame(data)

In [103]:
df.drop_duplicates(subset=['post_title', 'post_text'], inplace=True) #drop duplicates


In [104]:
df

Unnamed: 0,post_id,post_title,post_text,post_date,author,subreddit,upvotes,downvotes,num_comments
0,11kyi0b,Which one of these charities should AskReddit ...,It's time for the community to vote on which c...,2023-03-07 12:49:22,-eDgAR-,AskReddit,55,0,6
1,11m0jsp,What Instantly Ruins A Burger For You?,,2023-03-08 16:19:54,TheKeyMaster365,AskReddit,7703,0,13401
2,11ly1b6,(Serious) what’s something that mentally and/o...,,2023-03-08 14:40:28,Vanguard2002,AskReddit,11099,0,8837
3,11lynyj,What profession do you find unhealthy?,,2023-03-08 15:06:19,kk-sahinul,AskReddit,1497,0,2311
4,11lcyer,How did somebody you know ruin their life in o...,,2023-03-07 22:05:08,TunaSaladWithBeans,AskReddit,22277,0,11656
...,...,...,...,...,...,...,...,...,...
96,11m9qpr,What is working data entry like?,,2023-03-08 22:07:09,Independent_Grab_200,AskReddit,5,0,6
97,11lxpb6,What are some important things to know when ge...,,2023-03-08 14:25:32,EnableInsanity,AskReddit,14,0,46
98,11m9gey,"Firefighters or police officers, what's your h...",,2023-03-08 21:56:11,Galaxy_Wolf_16,AskReddit,3,0,2
99,11m5z5s,What do you think about people who constantly ...,,2023-03-08 19:43:56,SuspiciousPaperclip,AskReddit,5,0,17


# NLP Start


In [105]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [107]:
# define stop words
stop_words = set(stopwords.words('english'))

# instantiate stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# define preprocessing pipeline
def preprocess_text(text):
    # tokenize text
    tokens = word_tokenize(text)

    # remove stop words
    tokens = [token for token in tokens if token not in stop_words]

    # stem and lemmatize tokens
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]

    # return preprocessed text as a string
    return " ".join(lemmatized_tokens)




In [108]:
# apply preprocessing pipeline to text data
df["preprocessed_text"] = df["post_title"].apply(preprocess_text)

In [109]:
df["preprocessed_text"] # check if stemming worked

0      which one chariti askreddit support 15th anniv...
1                 what instantli ruin a burger for you ?
2         ( seriou ) ’ someth mental and/or emot broke ?
3                          what profess find unhealthi ?
4                  how somebodi know ruin life one day ?
                             ...                        
96                           what work data entri like ?
97             what import thing know get first tattoo ?
98             firefight polic offic , 's horror stori ?
99            what think peopl constantli use buzzword ?
100                         what last thing make cring ?
Name: preprocessed_text, Length: 101, dtype: object

# Vectorize data


In [110]:
from sklearn.feature_extraction.text import CountVectorizer

# create a CountVectorizer object
vectorizer = CountVectorizer(stop_words='english', max_features=1000)

# fit and transform the preprocessed_text column
X = vectorizer.fit_transform(df["preprocessed_text"])

# create a new dataframe with the countvectorizer output
cv_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# concatenate the cv_df dataframe with the original dataframe
df_final = pd.concat([df, cv_df], axis=1)



In [111]:
df_final

Unnamed: 0,post_id,post_title,post_text,post_date,author,subreddit,upvotes,downvotes,num_comments,preprocessed_text,...,way,weird,weirdest,whi,wish,woman,work,worker,world,worst
0,11kyi0b,Which one of these charities should AskReddit ...,It's time for the community to vote on which c...,2023-03-07 12:49:22,-eDgAR-,AskReddit,55,0,6,which one chariti askreddit support 15th anniv...,...,0,0,0,0,0,0,0,0,0,0
1,11m0jsp,What Instantly Ruins A Burger For You?,,2023-03-08 16:19:54,TheKeyMaster365,AskReddit,7703,0,13401,what instantli ruin a burger for you ?,...,0,0,0,0,0,0,0,0,0,0
2,11ly1b6,(Serious) what’s something that mentally and/o...,,2023-03-08 14:40:28,Vanguard2002,AskReddit,11099,0,8837,( seriou ) ’ someth mental and/or emot broke ?,...,0,0,0,0,0,0,0,0,0,0
3,11lynyj,What profession do you find unhealthy?,,2023-03-08 15:06:19,kk-sahinul,AskReddit,1497,0,2311,what profess find unhealthi ?,...,0,0,0,0,0,0,0,0,0,0
4,11lcyer,How did somebody you know ruin their life in o...,,2023-03-07 22:05:08,TunaSaladWithBeans,AskReddit,22277,0,11656,how somebodi know ruin life one day ?,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,11m9qpr,What is working data entry like?,,2023-03-08 22:07:09,Independent_Grab_200,AskReddit,5,0,6,what work data entri like ?,...,0,0,0,0,0,0,1,0,0,0
97,11lxpb6,What are some important things to know when ge...,,2023-03-08 14:25:32,EnableInsanity,AskReddit,14,0,46,what import thing know get first tattoo ?,...,0,0,0,0,0,0,0,0,0,0
98,11m9gey,"Firefighters or police officers, what's your h...",,2023-03-08 21:56:11,Galaxy_Wolf_16,AskReddit,3,0,2,"firefight polic offic , 's horror stori ?",...,0,0,0,0,0,0,0,0,0,0
99,11m5z5s,What do you think about people who constantly ...,,2023-03-08 19:43:56,SuspiciousPaperclip,AskReddit,5,0,17,what think peopl constantli use buzzword ?,...,0,0,0,0,0,0,0,0,0,0
