In [1]:
import praw
import json
import pandas as pd
import os
import glob
import numpy as np

#### Getting started with PRAW
Before praw can be used to scrape data, it needs us to authenticate ourselves. To do this, we need to create a read-only overall Reddit 'instance' and provide it with three pieces of information: your client_id , client_secret, and user_agent (plus your username and password).

In [4]:
#create a (general) reddit instance
reddit = praw.Reddit(
    client_id="****",
    client_secret="****",
    user_agent="****",
    username='****',
    password='****!')

### Creating a Subreddit 'Instance'

To begin, we must create a subreddit 'instance' (similar to when we created the more general Reddit 'instance') by defining which subreddit we want to scrape. Here, I only care about the AITA subreddit, so I'll create a single instance named `aita`.

In [6]:
#creating my subreddit "instance"
aita = reddit.subreddit('AmItheAsshole')

### Before you Scrape

1. **Be aware of limits**: Although you can set `limit=None`, the maximum is actually 1000, which is a limitation set by Reddit directly. According to PRAW, you can try to get more than 1,000 by using the search function. 

2. **Pick your sub-category**: For each Subreddit, different categories of posts can be selectively scraped: best, hot, new, controversial, top, and rising. Here, I am only interested in collecting two categories that we will scrape on a monthly basis: new and top.

3. **Prepare for inconsistencies**: PRAW docs notes that you may see a discrepancy between what praw returns and what is actually viewable on the reddit page, because it counts deleted, removed, and spam comments.

4. **Data Handling**: If you're saving your data in a as a .JSON object, be sure to cast your "Redditor" objects as strings (e.g., do str(comment.author) for example) because otherwise, JSON doesn't know what that object is and it'll break, losing all that precious scraped data!

In [6]:
out_path ='/Users/f004p74/Documents/dartmouth/projects/conflict-taxonomy/aita-posts/'

In [29]:
aita_top = aita.top(limit=1000)

In [30]:
#we ignore the AutoModerator because it's unreleated to the post's content
skip_list = ['AutoModerator']

for submission in aita_top:
    praw_dict = {}
    submission_id = submission.id
    
    praw_dict["Submission"] = {'Title': submission.title,
                                'Sub ID': submission.id,
                               'URL': submission.url,
                              'Body': submission.selftext}
    
    with open(out_path+submission_id+'_top.json', 'w') as fp:
        json.dump(praw_dict, fp)

In [27]:
aita_hot = aita.hot(limit=1000)

In [28]:
#we ignore the AutoModerator because it's unreleated to the post's content
skip_list = ['AutoModerator']

for submission in aita_hot:
    praw_dict = {}
    submission_id = submission.id
    
    praw_dict["Submission"] = {'Title': submission.title,
                                'Sub ID': submission.id,
                               'URL': submission.url,
                              'Body': submission.selftext}
    
    with open(out_path+submission_id+'_hot.json', 'w') as fp:
        json.dump(praw_dict, fp)

In [7]:
file_list = os.listdir(out_path)

In [8]:
data = []

for file in file_list:
    try:
        with open(out_path+file) as f:
            json_dict = json.load(f)

        data.append(json_dict["Submission"]['Body'])
    except:
        continue

### Text Processing & Analysis

In [2]:
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import FreqDist
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
import gensim
from gensim.utils import simple_preprocess

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/f004p74/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/f004p74/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/f004p74/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
contractions = { "ain't": "am not","aren't": "are not","can't": "cannot",
"can't've": "cannot have","'cause": "because","could've": "could have","couldn't": "could not",
"couldn't've": "could not have","didn't": "did not","doesn't": "does not","don't": "do not",
"hadn't": "had not","hadn't've": "had not have","hasn't": "has not","haven't": "have not",
"he'd": "he would","he'd've": "he would have","he'll": "he will","he's": "he is",
"how'd": "how did","how'll": "how will","how's": "how is","i'd": "i would",
"i'll": "i will","i'm": "i am","i've": "i have","isn't": "is not","it'd": "it would",
"it'll": "it will","it's": "it is","let's": "let us","ma'am": "madam","mayn't": "may not",
"might've": "might have","mightn't": "might not","must've": "must have","mustn't": "must not",
"needn't": "need not","oughtn't": "ought not","shan't": "shall not","sha'n't": "shall not",
"she'd": "she would","she'll": "she will","she's": "she is","should've": "should have",
"shouldn't": "should not","that'd": "that would","that's": "that is","there'd": "there had",
"there's": "there is","they'd": "they would","they'll": "they will","they're": "they are",
"they've": "they have","wasn't": "was not","we'd": "we would","we'll": "we will","we're": "we are",
"we've": "we have","weren't": "were not","what'll": "what will","what're": "what are",
"what's": "what is","what've": "what have","where'd": "where did","where's": "where is","who'll": "who will",
"who's": "who is","won't": "will not","wouldn't": "would not","you'd": "you would",
"you'll": "you will","you're": "you are"}

stopwords_list = stopwords.words('english')
stopwords_list.extend(['aita', 'asshole', 'reddit','subreddit','aitah','post',
                       'poster','link','original','lurker'])

In [9]:
clean_list = []

for i in data:
    text = i.lower() # convert all text to lowercase
    text = text.split() #separates the block of text to individual words
    new_text = []
    for word in text: # converts contractions to separate words
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    
    
    text = " ".join(new_text)
    
    # Remove special characters and punctuation
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\\', ' ', text)
    #text = re.sub(r'\'', ' ', text) 

    # Tokenize each word
    text = nltk.WordPunctTokenizer().tokenize(text)

    # Lemmatize each word
    text = [nltk.stem.WordNetLemmatizer().lemmatize(token, pos='v') for token in text if len(token)>1]
    
    # Remove stop words
    text = [x for x in text if x not in stopwords_list]
    
    # Convert the list back into a string.
    text = ' '.join(map(str, text))
    
    clean_list.append(text)
    

In [10]:
all_words = []
for i in clean_list:
    separated = i.split()
    for word in separated:
        all_words.append(word)

In [None]:
#separating out different categories:

#social labels
relationships = ['family','mom','dad','grandma','grandpa','grandmother','grandfather',
                'aunt','uncle','cousin','sister','brother','girlfriend','boyfriend',
                'fiancee','husband','wife','partner','colleague','boss','manager','coworker',
                'teammate','classmate','mother','father','daughter','son','baby','child',
                'parent','kid','friend','buddy']

#emotion words
states = pd.read_csv("/Users/f004p74/Documents/dartmouth/projects/c-tom-reddit/mental_states.csv")
traits = pd.read_csv("/Users/f004p74/Documents/dartmouth/projects/c-tom-reddit/mental_traits.csv")

state_list = list(states["States"])
trait_list = list(traits["Traits"])

### word frequencies

In [11]:
FreqDist(all_words).most_common(100)

[('say', 5220),
 ('get', 4823),
 ('tell', 3905),
 ('go', 3883),
 ('would', 2776),
 ('want', 2764),
 ('like', 2647),
 ('make', 2372),
 ('ask', 2316),
 ('time', 2152),
 ('know', 2080),
 ('think', 1900),
 ('take', 1830),
 ('one', 1678),
 ('call', 1653),
 ('come', 1634),
 ('family', 1610),
 ('work', 1461),
 ('mom', 1417),
 ('start', 1389),
 ('could', 1377),
 ('leave', 1371),
 ('even', 1306),
 ('really', 1279),
 ('try', 1278),
 ('kid', 1277),
 ('back', 1263),
 ('feel', 1239),
 ('husband', 1214),
 ('parent', 1199),
 ('since', 1195),
 ('talk', 1169),
 ('give', 1165),
 ('sister', 1159),
 ('also', 1158),
 ('home', 1110),
 ('wife', 1072),
 ('see', 1071),
 ('us', 1066),
 ('house', 1061),
 ('need', 1031),
 ('years', 979),
 ('never', 956),
 ('live', 955),
 ('people', 942),
 ('keep', 935),
 ('day', 920),
 ('friends', 916),
 ('dad', 916),
 ('pay', 903),
 ('look', 897),
 ('still', 892),
 ('daughter', 873),
 ('friend', 851),
 ('things', 836),
 ('year', 826),
 ('much', 810),
 ('well', 805),
 ('brother',

In [7]:
import gensim.corpora as corpora

id2word = gensim.corpora.Dictionary()


NameError: name 'gensim' is not defined

In [None]:
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

In [50]:
# Create Corpus: Term Document Frequency
dataset = [d.split() for d in clean_list]
id2word = gensim.corpora.Dictionary(dataset)

corpus = []
for text in dataset:
    new_text = id2word.doc2bow(text)
    corpus.append(new_text)

In [51]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           iterations=50,
                                           num_topics=3)

In [55]:
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

ModuleNotFoundError: No module named 'pyLDAvis'

In [42]:
from gensim.models import CoherenceModel

# Compute Coherence Score
number_of_topics = []
coherence_score = []
for i in range(1,6):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           iterations=50,
                                           num_topics=i)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=reviews['Review_Clean_List'], dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    number_of_topics.append(i)
    coherence_score.append(coherence_lda);

NameError: name 'reviews' is not defined

In [None]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

In [None]:
#import pyLDAvis
# import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis
    