# Data Collection

In [1]:
import requests
import time
import pandas as pd
from random import randint

In [2]:
# JSON listing endpoint for r/depression, used for the exploratory request below.
url_1 = "https://www.reddit.com/r/depression.json"

In [3]:
# Custom User-agent header sent with every request; `headers` is also read
# as a global by reddit_scrape() further down.
headers = {"User-agent" : "Sam He"}
res = requests.get(url_1, headers=headers)
# Show the HTTP status code of the test request.
res.status_code

200

In [4]:
# Decode the response body as JSON and display it to inspect the listing
# structure (posts live under data -> children, the paging cursor under data -> after).
depress_json = res.json()
depress_json

{'data': {'after': 't3_jho0v7',
  'before': None,
  'children': [{'data': {'all_awardings': [{'award_sub_type': 'GROUP',
       'award_type': 'global',
       'awardings_required_to_grant_benefits': 3,
       'coin_price': 300,
       'coin_reward': 250,
       'count': 1,
       'days_of_drip_extension': 0,
       'days_of_premium': 0,
       'description': 'THIS right here! Join together to give multiple This awards and see the award evolve in its display and shower benefits for the recipient. For every 3 This awards given to a post or comment, the author will get 250 coins.',
       'end_date': None,
       'giver_coin_reward': None,
       'icon_format': None,
       'icon_height': 2048,
       'icon_url': 'https://i.redd.it/award_images/t5_22cerq/vu6om0xnb7e41_This.png',
       'icon_width': 2048,
       'id': 'award_68ba1ee3-9baf-4252-be52-b808c1e8bdc4',
       'is_enabled': True,
       'is_new': False,
       'name': 'This',
       'penny_donate': None,
       'penny_price': No

In [5]:
# function to scrape reddit page

def reddit_scrape(url_string, number_of_scrapes, output_list, request_headers=None, timeout=30):
    """Download consecutive listing pages from a Reddit JSON endpoint.

    Each successful response's ``data.children`` entries are appended to
    ``output_list`` (mutated in place) and its ``data.after`` cursor is sent
    as the ``after`` parameter of the next request.

    Parameters
    ----------
    url_string : str
        URL of the ``.json`` listing to request.
    number_of_scrapes : int
        Maximum number of pages (batches) to request.
    output_list : list
        List that receives the raw post dicts.
    request_headers : dict, optional
        HTTP headers for each request. When omitted, the notebook-level
        ``headers`` dict is used.
    timeout : float, optional
        Seconds to wait for each response before ``requests`` raises.

    Returns
    -------
    None
        Progress and summary counts are printed.
    """
    if request_headers is None:
        # Keep the original behaviour of reading the notebook-level `headers`.
        request_headers = headers

    after = None
    for batch in range(number_of_scrapes):
        if batch == 0:
            print("SCRAPING {}\n--------------------------------------------------".format(url_string))
            print("<<<SCRAPING COMMENCED>>>")
            print("Downloading Batch {} of {}...".format(1, number_of_scrapes))
        elif (batch + 1) % 5 == 0:
            print("Downloading Batch {} of {}...".format(batch + 1, number_of_scrapes))

        params = {} if after is None else {"after": after}
        res = requests.get(url_string, params=params, headers=request_headers, timeout=timeout)
        if res.status_code != 200:
            # Report the failing status and stop; what was collected so far is kept.
            print(res.status_code)
            break

        listing = res.json()["data"]
        output_list.extend(listing["children"])
        after = listing["after"]
        if after is None:
            # No cursor means there is no next page. Continuing would send a
            # request without `after` and download the first page again,
            # filling output_list with duplicates.
            break

        if batch < number_of_scrapes - 1:
            # Random pause between requests; pointless after the final batch.
            time.sleep(randint(1, 6))

    print("<<<SCRAPING COMPLETED>>>")
    print("Number of posts downloaded: {}".format(len(output_list)))
    print("Number of unique posts: {}".format(len({p["data"]["name"] for p in output_list})))

In [None]:
# Collect up to 50 listing pages from r/depression into depress_scraped
# (reddit_scrape appends to the list in place).
depress_scraped = []
reddit_scrape("https://www.reddit.com/r/depression.json", 50, depress_scraped)

In [None]:
def create_unique_list(original_scrape_list, new_list_name):
    """Append the first occurrence of each post to ``new_list_name``.

    Posts are identified by ``post["data"]["name"]``. For every name not yet
    seen in ``original_scrape_list`` the post's ``"data"`` dict is appended to
    ``new_list_name`` (mutated in place); later posts with the same name are
    skipped. Items already present in ``new_list_name`` are not inspected.

    Parameters
    ----------
    original_scrape_list : list of dict
        Raw listing children, each holding a ``"data"`` dict with a ``"name"``.
    new_list_name : list
        Output list that receives the unique ``"data"`` dicts.

    Returns
    -------
    None
        The resulting length of ``new_list_name`` is printed.
    """
    # A set gives constant-time membership checks; the original list-based
    # lookup made the whole pass quadratic in the number of posts.
    seen_names = set()
    for post in original_scrape_list:
        post_data = post["data"]
        post_name = post_data["name"]
        if post_name not in seen_names:
            seen_names.add(post_name)
            new_list_name.append(post_data)
    print("LIST NOW CONTAINS {} UNIQUE SCRAPED POSTS".format(len(new_list_name)))

In [None]:
# De-duplicate the r/depression scrape by post name.
depress_scraped_unique = []
create_unique_list(depress_scraped, depress_scraped_unique)

In [None]:
# Build a DataFrame from the unique post dicts and label every row
# is_suicide = 0 (r/depression).
depression = pd.DataFrame(depress_scraped_unique)
depression["is_suicide"] = 0
depression.head()

In [None]:
# Collect up to 50 listing pages from r/SuicideWatch into suicide_scraped.
suicide_scraped = []
reddit_scrape("https://www.reddit.com/r/SuicideWatch.json", 50, suicide_scraped)

In [None]:
# De-duplicate the r/SuicideWatch scrape by post name.
suicide_scraped_unique = []
create_unique_list(suicide_scraped, suicide_scraped_unique)

In [None]:
# Build a DataFrame from the unique post dicts and label every row
# is_suicide = 1 (r/SuicideWatch).
suicide_watch = pd.DataFrame(suicide_scraped_unique)
suicide_watch["is_suicide"] = 1
suicide_watch.head()

# Data Cleaning

In [None]:
!pip install wordninja
!pip install scattertext && python -m spacy.en.download

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 100)
sns.set_style("darkgrid")

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

import wordninja

%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
from scipy.stats import rankdata, hmean, norm
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
display(HTML("<style>.container { width:98% !important; }</style>"))

In [None]:
# Load the two per-subreddit tables from CSV, replacing the in-memory frames
# built in the Data Collection section.
# NOTE(review): no cell visible in this notebook writes these two files —
# confirm where the scraped frames were saved.
depression = pd.read_csv('../data/depression.csv')
suicide_watch = pd.read_csv('../data/suicide_watch.csv')

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): this runs after the read_csv cell above, which reads from
# relative ../data paths — confirm whether the mount is needed, and if so
# whether it should come first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Raise the column display limit to 500 so head() is not truncated column-wise.
pd.set_option('display.max_columns', 500)
depression.head()

In [None]:
# Preview the six columns kept for the analysis (r/depression).
depression[["title", "selftext", "author",  "num_comments", "is_suicide","url"]].head(3)

In [None]:
# Preview the same six columns for r/SuicideWatch.
suicide_watch[["title", "selftext", "author",  "num_comments", "is_suicide","url"]].head(3)

In [None]:
# (rows, columns) of the selected r/depression columns.
depression[["title", "selftext", "author",  "num_comments", "is_suicide","url"]].shape

In [None]:
# (rows, columns) of the selected r/SuicideWatch columns.
suicide_watch[["title", "selftext", "author",  "num_comments", "is_suicide","url"]].shape

In [None]:
# Print one example r/depression post (row label 118) and show its length
# in characters.
print(depression["selftext"][118])
len(depression["selftext"][118])

In [None]:
# Print one example r/SuicideWatch post (row label 118) and show its length
# in characters.
print(suicide_watch["selftext"][118])
len(suicide_watch["selftext"][118])

In [None]:
# Keep the same six columns from both subreddits and stack them row-wise
# into one frame with a fresh 0..n-1 index.
selected_columns = ["title", "selftext", "author", "num_comments", "is_suicide", "url"]
dep_columns = depression[selected_columns]
sui_columns = suicide_watch[selected_columns]
combined_data = pd.concat(
    [dep_columns, sui_columns],
    axis=0,
    ignore_index=True,
)
combined_data

In [None]:
# Persist the combined table (without the index column) for later reuse.
combined_data.to_csv('../data/combined_data.csv', index = False)

In [None]:
# Column dtypes and non-null counts before cleaning.
combined_data.info()

In [None]:
# Preview up to ten rows whose selftext is missing.
combined_data[combined_data["selftext"].isnull()].head(10)

In [None]:
# Count the missing-selftext rows per is_suicide label.
combined_data["is_suicide"][combined_data["selftext"].isnull()].value_counts()

In [None]:
# Replace missing post bodies with the placeholder "emptypost".
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a selected column acts on an intermediate object and is not guaranteed
# to update combined_data under pandas Copy-on-Write.
combined_data["selftext"] = combined_data["selftext"].fillna("emptypost")

In [None]:
# Check the placeholder: rows whose selftext is exactly "emptypost".
combined_data[combined_data["selftext"].isin(["emptypost"])].head()

In [None]:
# Re-check non-null counts after filling selftext.
combined_data.info()

In [None]:
def processing_text(series_to_process):
    """Lower-case, tokenise, lemmatise and stop-word-filter each text.

    Parameters
    ----------
    series_to_process : iterable of str
        Texts to clean (a pandas Series or a plain list).

    Returns
    -------
    list of str
        One cleaned string per input text, in input order: the surviving
        lemmatised tokens joined by single spaces.
    """
    tokenizer = RegexpTokenizer(r'(\w+)')
    lemmatizer = WordNetLemmatizer()
    # Load the stop-word list once. The original re-read it from the corpus
    # for every single token, which dominated the running time.
    english_stop_words = set(stopwords.words("english"))

    new_list = []
    # Iterate over the values directly rather than looking up labels
    # 0..n-1, so a Series with a non-default index is handled as well.
    for raw_text in series_to_process:
        # Tokens are runs of word characters, i.e. punctuation is dropped.
        words_only = tokenizer.tokenize(raw_text.lower())
        words_only_lem = [lemmatizer.lemmatize(word) for word in words_only]
        words_without_stop = [word for word in words_only_lem if word not in english_stop_words]
        new_list.append(" ".join(words_without_stop))
    return new_list

In [None]:
import nltk
# Fetch only the NLTK resources the cleaning functions read. Calling
# nltk.download() with no argument opens the interactive downloader, which
# stalls an unattended "Run All".
for nltk_resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource, quiet=True)

# Add cleaned versions of the post body and the title.
combined_data["selftext_clean"] = processing_text(combined_data["selftext"])
combined_data["title_clean"] = processing_text(combined_data["title"])
pd.set_option("display.max_colwidth", 100)
combined_data.head(8)

In [None]:
# Widen the column display to compare raw and cleaned post bodies side by side.
pd.set_option("display.max_colwidth", 1000)
combined_data[["selftext","selftext_clean"]].head(2)

In [None]:
# Compare raw and cleaned titles (column width back to 100 characters).
pd.set_option("display.max_colwidth", 100)
combined_data[["title","title_clean"]].head(5)

In [None]:
# Try wordninja on the first ten author names and print original -> split form.
first_ten_authors = [combined_data["author"][row] for row in range(10)]
author_test = [" ".join(wordninja.split(name)) for name in first_ten_authors]
test_dict = dict(zip(first_ten_authors, author_test))
print(test_dict)

In [None]:
def processing_author_names(series_to_process):
    """Split each author name into words, then clean it like any other text.

    Every name is passed through ``wordninja.split`` and re-joined with
    spaces; the result is cleaned by ``processing_text`` (lower-casing,
    tokenising, lemmatising, stop-word removal). The original repeated that
    cleaning code line for line instead of calling the shared function.

    Parameters
    ----------
    series_to_process : iterable of str
        Author names (a pandas Series or a plain list).

    Returns
    -------
    list of str
        One cleaned, space-separated string per author name, in input order.
    """
    # Iterate over the values directly so a non-default index cannot cause a
    # failed label lookup.
    author_split = [" ".join(wordninja.split(author)) for author in series_to_process]
    return processing_text(author_split)

In [None]:
# Add a cleaned, word-split version of each author name.
combined_data["author_clean"]= processing_author_names(combined_data["author"])

#CHECKING ON author_clean
pd.set_option("display.max_colwidth", 100)
combined_data[["author","author_clean"]].tail(10)

In [None]:
# Confirm the three *_clean columns were added.
combined_data.info()

In [None]:
# Missing values per column after cleaning.
combined_data.isnull().sum()

In [None]:
# Split the cleaned text columns by subreddit label.
is_suicide_row = combined_data["is_suicide"] == 1
is_depression_row = combined_data["is_suicide"] == 0

suicide_posts = combined_data.loc[is_suicide_row, "selftext_clean"]
suicide_titles = combined_data.loc[is_suicide_row, "title_clean"]
suicide_authors = combined_data.loc[is_suicide_row, "author_clean"]

depression_posts = combined_data.loc[is_depression_row, "selftext_clean"]
depression_titles = combined_data.loc[is_depression_row, "title_clean"]
depression_authors = combined_data.loc[is_depression_row, "author_clean"]

In [None]:
#DEFINING A FUNCTION TO VISUALISE MOST USED WORDS
def plot_most_used_words(category_string, data_series, palette, image_mask):
    """Draw a word cloud of the 40 and a bar chart of the 20 most frequent words.

    Word frequencies come from a ``CountVectorizer`` (English stop words
    removed) fitted on ``data_series``.

    Parameters
    ----------
    category_string : str
        Label inserted into the word-cloud title and the bar-chart axis label.
    data_series : iterable of str
        Documents to count words in.
    palette : str
        Used both as the WordCloud ``colormap`` and the seaborn ``palette``.
    image_mask : str or file-like
        Image opened with ``PIL.Image.open`` and used as the word-cloud mask.

    Returns
    -------
    None
        Two matplotlib figures are produced. Note that the seaborn style is
        switched to "white" as a side effect.
    """
    cvec = CountVectorizer(stop_words='english')
    word_count_matrix = cvec.fit_transform(data_series)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back on older installations.
    if hasattr(cvec, "get_feature_names_out"):
        vocabulary = cvec.get_feature_names_out()
    else:
        vocabulary = cvec.get_feature_names()
    # Sum the sparse matrix column-wise instead of materialising a dense
    # documents x vocabulary DataFrame just to add up its columns.
    total_words = pd.Series(np.asarray(word_count_matrix.sum(axis=0)).ravel(), index=vocabulary)
    ranked_words = total_words.sort_values(ascending=False)

    #<<<WORDCLOUD>>>
    # WordCloud.generate takes raw text, so each of the top 40 words is
    # repeated as many times as it was counted.
    top_40_words = ranked_words.head(40)
    long_string = " ".join((word + " ") * int(count) for word, count in top_40_words.items())
    mask = np.array(Image.open(image_mask))
    wordcloud = WordCloud(repeat=True, collocations=False, min_font_size=2, max_font_size=80,
                          max_words=1000, background_color="white", colormap=palette,
                          mask=mask).generate(long_string)
    plt.figure(figsize=(20, 5), dpi=300)
    plt.title('\nTop Words used in {}\n'.format(category_string), fontsize=22)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    #<<<BARPLOT>>>
    top_20_words_df = ranked_words.head(20).rename("count").to_frame()
    sns.set_style("white")
    plt.figure(figsize=(15, 8), dpi=300)
    sns.barplot(y=top_20_words_df.index, x="count", data=top_20_words_df, palette=palette)

    plt.xlabel("Count", fontsize=9)
    plt.ylabel('Common Words in {}'.format(category_string), fontsize=9)
    plt.yticks(rotation=-5)

In [None]:
# Number of rows whose cleaned author name contains "throwaway".
combined_data["author_clean"].str.contains("throwaway").sum()

In [None]:
search_values = ["mr", "man", "boy", "guy", "dude"]
# author_clean holds space-separated words, so match whole words only.
# A bare "mr|man|..." alternation also matches inside longer words
# (e.g. "man" inside "woman" or "human"), inflating the count.
search_pattern = r"\b(?:" + "|".join(search_values) + r")\b"
depression_authors.str.contains(search_pattern).sum()

In [None]:
search_values = ["mr", "man", "boy", "guy", "dude"]
# author_clean holds space-separated words, so match whole words only.
# A bare "mr|man|..." alternation also matches inside longer words
# (e.g. "man" inside "woman" or "human"), inflating the count.
search_pattern = r"\b(?:" + "|".join(search_values) + r")\b"
suicide_authors.str.contains(search_pattern).sum()

In [None]:
search_values = ["mr", "man", "boy", "guy", "dude"]
# author_clean holds space-separated words, so match whole words only.
# A bare "mr|man|..." alternation also matches inside longer words
# (e.g. "man" inside "woman" or "human"), inflating the count.
search_pattern = r"\b(?:" + "|".join(search_values) + r")\b"
combined_data["author_clean"].str.contains(search_pattern).sum()

In [None]:
search_values = ["ms", "woman", "girl", "gal", "lady"]
# author_clean holds space-separated words, so match whole words only.
# A bare "ms|woman|..." alternation also matches inside longer words
# (e.g. "ms" inside "arms", "gal" inside "legal"), inflating the count.
search_pattern = r"\b(?:" + "|".join(search_values) + r")\b"
combined_data["author_clean"].str.contains(search_pattern).sum()

In [None]:
# List the cleaned r/SuicideWatch author names that contain "girl".
suicide_authors[suicide_authors.str.contains("girl")]

In [None]:
# Number of cleaned r/depression author names containing at least one digit.
depression_authors[depression_authors.str.contains(r"\d")].count()

In [None]:
# Number of cleaned r/SuicideWatch author names containing at least one digit.
suicide_authors[suicide_authors.str.contains(r"\d")].count()

In [None]:
# Cleaned author names containing the substring "420".
combined_data["author_clean"][combined_data["author_clean"].str.contains(r"420")]

In [None]:
# Total number of rows, for putting the counts above into proportion.
len(combined_data["author_clean"])

In [None]:
# The 20 authors with the most posts across both subreddits.
combined_data["author"].value_counts().head(20)

In [None]:
# Show every post by a hand-picked set of authors, ordered by author name.
pd.set_option("display.max_colwidth", 1000)
authors_of_interest = ['Vivid-Smile', 'throaway8297338', 'snakesnack148', 'outakuslayer69',
                       'enk9898', 'SQLwitch']
written_by_selected = combined_data["author"].isin(authors_of_interest)
combined_data.loc[written_by_selected, ["is_suicide", "author", "title", "selftext"]].sort_values("author")

In [None]:
# Show the posts whose author field is the literal string "[deleted]".
pd.set_option("display.max_colwidth", 1000)
combined_data[["is_suicide","author","title", "selftext"]][combined_data["author"].isin(["[deleted]"])].sort_values("author")

In [None]:
# Posts per author, most frequent first.
author_post_counts = combined_data["author"].value_counts()
# Tabular form with explicit column names. The names produced by
# value_counts().reset_index() differ between pandas 1.x ("index"/"author")
# and 2.x ("author"/"count"), so the original filter compared author strings
# with `> 1` on pandas 2.x.
df_author_counts = author_post_counts.rename_axis("author").reset_index(name="post_count")
# Filter on the counts Series directly so the result is version-independent.
authors_posting_more_than_once = list(author_post_counts[author_post_counts > 1].index)
authors_posting_more_than_once

In [None]:
pd.set_option("display.max_colwidth", 100)
# Per-author means of the numeric columns for authors with more than one post.
# numeric_only=True is required on pandas 2.x, where mean() raises on the
# text columns instead of silently dropping them.
more_than_once_mean_df = (
    combined_data[combined_data["author"].isin(authors_posting_more_than_once)]
    .groupby("author")
    .mean(numeric_only=True)
    .reset_index()
)
# A mean is_suicide strictly between 0 and 1 means the author has posts
# labelled 0 and posts labelled 1, i.e. posted in both subreddits.
double_posters_mask_0 = more_than_once_mean_df["is_suicide"] != 0
double_posters_mask_1 = more_than_once_mean_df["is_suicide"] != 1.0
# Combine the masks with & rather than chaining two boolean selections,
# which makes pandas re-align the second mask and emit a warning.
double_posters = more_than_once_mean_df[double_posters_mask_0 & double_posters_mask_1].sort_values("num_comments", ascending=False)
print(len(double_posters))
top_double_posters_list= list(double_posters["author"].head(7))
top_double_posters_list

In [None]:
# Show the posts written by the top double posters, ordered by author name.
pd.set_option("display.max_colwidth", 1000)
by_top_double_poster = combined_data["author"].isin(top_double_posters_list)
combined_data.loc[
    by_top_double_poster,
    ["is_suicide", "author", "title", "selftext", "url"],
].sort_values("author")

In [None]:
# Post length in characters. The vectorised .str.len() replaces a Python loop
# that looked up every row by label 0..n-1, and yields NaN rather than raising
# if a value is missing.
combined_data["selftext_length"] = combined_data["selftext"].str.len()

In [None]:
# Title length in characters. The vectorised .str.len() replaces a Python loop
# that looked up every row by label 0..n-1, and yields NaN rather than raising
# if a value is missing.
combined_data["title_length"] = combined_data["title"].str.len()

In [None]:
# Mean title and post lengths per subreddit label.
from_depression = combined_data["is_suicide"] == 0
from_suicide_watch = combined_data["is_suicide"] == 1
title_lengths = combined_data["title_length"]
post_lengths = combined_data["selftext_length"]

ave_length_dep_title = title_lengths[from_depression].mean()
ave_length_sui_title = title_lengths[from_suicide_watch].mean()
ave_length_dep_post = post_lengths[from_depression].mean()
ave_length_sui_post = post_lengths[from_suicide_watch].mean()

for report_line, average in (
    ("Average length of a r/depression title: {}", ave_length_dep_title),
    ("Average length of a r/SuicideWatch title: {}", ave_length_sui_title),
    ("Average length of a r/depression post: {}", ave_length_dep_post),
    ("Average length of a r/SuicideWatch post: {}", ave_length_sui_post),
):
    print(report_line.format(average))

In [None]:
# Scatter of post length per author, coloured by subreddit label and sized by
# the same length value.
sns.set_style("white")
plt.figure(figsize = (18, 12))
sns.scatterplot(data =combined_data,
               y = "selftext_length", 
               x = "author",
               hue = 'is_suicide', 
               palette = "magma_r",
               size = 'selftext_length',
               sizes=(20, 150));
plt.title("Length of Posts");
plt.xlabel("Authors");
# selftext_length is a character count (len of the post string), not a word count.
plt.ylabel("Number of characters");
plt.xticks(rotation=65);

In [None]:
# Concatenate the cleaned author name, post body and title (space-separated)
# into a single text field per row.
combined_data["megatext_clean"]=combined_data["author_clean"] + " " + combined_data["selftext_clean"]+ " " +combined_data["title_clean"]

In [None]:
# Take an explicit copy: adding columns to a bare slice of combined_data
# triggers SettingWithCopyWarning and leaves it ambiguous which frame is modified.
scatter_data = combined_data[["megatext_clean", "is_suicide"]].copy()
# Human-readable category labels for the plot.
scatter_data["category"] = scatter_data["is_suicide"].map({0: "Depression", 1: "Suicide"})
scatter_data.tail()

In [None]:
nlp = st.whitespace_nlp_with_sentences
# Total whitespace-separated word count per category. The original computed
# this and discarded it: it was not the cell's last expression, so nothing
# was displayed.
words_per_category = scatter_data.groupby("category")["megatext_clean"].apply(
    lambda texts: texts.str.split().str.len().sum()
)
print(words_per_category)
# Parse every document with scattertext's whitespace tokenizer.
scatter_data['parsed'] = scatter_data.megatext_clean.apply(nlp)
scatter_data.tail()

In [None]:
# Build a scattertext corpus from the parsed documents, grouped by the
# "category" column.
corpus = st.CorpusFromParsedDocuments(scatter_data, category_col="category", parsed_col="parsed").build()

In [None]:
# Final overview of combined_data with all derived columns.
combined_data.info()