Text Analysis
==
This project is a text analysis of the r/survivor subreddit's opinions on each Survivor season. The goal of the visualization is to identify the main descriptors for each season. The text analysis was inspired by [github/walkerkq](https://github.com/walkerkq/textmining_southpark/).

All data sourced from [r/survivor](https://www.reddit.com/r/survivor/)

In [8]:
import praw
import re
import pandas as pd
from praw.models import MoreComments
import numpy as np
from datetime import datetime, timezone
import json

# Track date of data pull (local time, formatted like "Jan 01 2020 12:00:00")
pull_date = datetime.now().strftime("%b %d %Y %H:%M:%S")

# Create connection to Reddit via PRAW.
# Credentials are read from the "praw" section of settings.json; the file is
# opened in a context manager so the handle is closed as soon as it is parsed.
with open('./settings.json') as fp:
    settings = json.load(fp).get('praw')
reddit = praw.Reddit(client_id=settings.get('client_id'),
                     client_secret=settings.get('client_secret'),
                     user_agent=settings.get('user_agent'),
                     username=settings.get('username'),
                     password=settings.get('password'))
subreddit = reddit.subreddit('survivor')

# seasons.json is looked up later as seasons[<season number as string>] to
# obtain the season's display name.
with open('./seasons.json') as fp:
    seasons = json.load(fp)

# Gates the flair-search scrape and the "LARGE DATA SET" export further down.
processTopPosts = False

In [9]:
# Map submission id -> season number for the 'top'-sorted flair search results
# of every season (only populated when processTopPosts is enabled).
top_submissions = {}
if processTopPosts:
    for nbr, season in seasons.items():
        # Season names may contain line breaks; the flair query uses spaces.
        flair_query = "flair:" + season.replace("\n", " ")
        top_submissions.update(
            (sub.id, nbr) for sub in subreddit.search(flair_query, sort='top')
        )

# Unspoiled posts
# Submission ids that are later read with getComments(r_id, True): only the
# replies beneath the submitter's own top-level comments are collected, and the
# season is parsed from that top-level comment's text. The trailing comments
# show examples of how the season header is written in each post.
wssyw_ids = ['10tpq9', # e.g. S06: The Amazon, hyperlinked
             '26viy4', # e.g. Season 28: Cagayan
             '37c7zu', # e.g. S15: China
             '3xp433', 
             '4kzd4g', 
             '5jwzib', 
             '6ga0ty', # [Season 7: Pearl Islands](link to countdown)
             '8p0ye9', 
             'btu8iu'  # Note: Still posting countdown
            ]
# Season number (as a string) -> submission id.
# NOTE(review): not referenced anywhere else in the visible notebook;
# presumably the per-season spoiler threads — confirm before relying on it.
wssyw_spoilers = {'1':'c48snf',
                  '2':'c0mplt',
                  '3':'c1c9cs',
                  '4':'c1r9tn',
                  '5':'bwr435',
                  '6':'c3slsk',
                  '7':'c8c3ov',
                  '8':'bx4u7z',
                  '9':'c2jn8g',
                  '10':'c24mgl',
                  '11':'c0ep8t',
                  '12':'c5xgnr',
                  '13':'c12yug',
                  '14':'by9sit',
                  '15':'c8bzdu',
                  '16':'c6b0ce',
                  '17':'c5pfk8',
                  '18':'c8c1g8',
                  '19':'bymej8',
                  '20':'c82s4d',
                  '21':'bztmw5',
                  '22':'bv9dna',
                  '23':'bzequv',
                  '24':'bwee06',
                  '25':'c6bmtt'}

In [None]:
import nltk
import ssl

# If this Python build exposes ssl._create_unverified_context, install it as
# the factory for the default HTTPS context before downloading NLTK data.
# NOTE(review): this replaces the process-wide default HTTPS context with an
# unverified one, so certificate checks are skipped for everything that uses
# the default context afterwards, not only for the NLTK download.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No unverified-context helper available: keep the default context.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Called without arguments.
# NOTE(review): presumably this opens NLTK's interactive downloader rather
# than fetching a fixed set of resources — confirm.
nltk.download()

In [None]:
# Create dataframe of all comments

# Season header with the number directly before the colon,
# e.g. "S06: The Amazon" or "Season 28: Cagayan".
_SEASON_BEFORE_COLON = re.compile(r"(\d+):(?: [\w-]+|\.+)+")
# Data error of "S:00 Example Name": the number directly follows the colon.
_SEASON_AFTER_COLON = re.compile(r":(\d+)(?: [\w-]+|\.+)+")


def get_reply_comments(submission):
    """Collect the replies beneath the submitter's own top-level comments.

    Top-level comments whose ``is_submitter`` flag is false are skipped. For
    the remaining ones the season number is parsed from the comment body and
    attached to every reply returned by ``replies.list()``.

    The season number is taken from a regex capture group, so headers that
    match the dotted form (e.g. ``"6:..."``) no longer raise ``ValueError``.
    A number that is missing from ``seasons`` yields ``season=None`` instead
    of raising ``KeyError``.

    Args:
        submission: object exposing ``comments``, an iterable of top-level
            comments with ``is_submitter``, ``body`` and ``replies``.

    Returns:
        A list of dicts with the keys ``season_nbr``, ``season``,
        ``comment_id``, ``comment`` and ``score``. ``season_nbr`` and
        ``season`` are ``None`` when no season header is found.
    """
    rows = []
    for top_level_comment in submission.comments:
        if not top_level_comment.is_submitter:
            continue
        season = None
        season_name = None
        # Both patterns are tried in order; when both match, the
        # "number after the colon" form wins, as it did before.
        for pattern in (_SEASON_BEFORE_COLON, _SEASON_AFTER_COLON):
            match = pattern.search(top_level_comment.body)
            if match:
                season = int(match.group(1))
                season_name = seasons.get(str(season))
        for comment in top_level_comment.replies.list():
            rows.append({'season_nbr': season,
                         'season': season_name,
                         'comment_id': comment.id,
                         'comment': comment.body,
                         'score': comment.score})
    return rows

# Get comments for each WSSYW Countdown submission
def getComments(r_id, top_level_comment=False, season_key=0):
    """Fetch one submission and return its comments as a list of row dicts.

    Args:
        r_id: Reddit submission id.
        top_level_comment: when true, only the replies beneath the
            submitter's own top-level comments are returned and the season is
            parsed from those comments (see ``get_reply_comments``).
        season_key: season number used to label every comment when
            ``top_level_comment`` is false. It is looked up in ``seasons`` by
            its string form, so both ``"7"`` and ``7`` work; an unknown key
            (including the default ``0``) yields ``season=None`` instead of
            raising ``KeyError``.

    Returns:
        A list of dicts with the keys ``season_nbr``, ``season``,
        ``comment_id``, ``comment`` and ``score``.
    """
    submission = reddit.submission(id=r_id)
    # Expand the "load more comments" placeholders before walking the tree.
    submission.comments.replace_more(limit=2500, threshold=0)
    if top_level_comment:
        return list(get_reply_comments(submission))

    season_nbr = int(season_key)
    season_name = seasons.get(str(season_key))
    return [{'season_nbr': season_nbr,
             'season': season_name,
             'comment_id': comment.id,
             'comment': comment.body,
             'score': comment.score}
            for comment in submission.comments.list()]

In [None]:
#### LARGE DATA SET ####
# Comments of every flair-search submission, labelled with that submission's
# season number. Only runs when processTopPosts is enabled.
if processTopPosts:
    d_all = []
    for r_id, season_nbr in top_submissions.items():
        # extend() instead of a list comprehension that was run for its side effect
        d_all.extend(getComments(r_id, False, season_nbr))
    df_raw_all = pd.DataFrame(d_all)

    # Save csv for reference (row index is not written)
    df_raw_all.to_csv('top_post_comments.csv', index=False, header=True)

#### UNSPOILED DATA ####
# Replies beneath the submitter's top-level comments in each countdown post.
d = []
for r_id in wssyw_ids:
    d.extend(getComments(r_id, True))

df_raw_wssyw = pd.DataFrame(d)

# Save csv for reference (row index is not written)
df_raw_wssyw.to_csv('unspoiled_comments.csv', index=False, header=True)

In [10]:
# Reload the saved scrape and keep only comments with a strictly positive score
df_raw = pd.read_csv("unspoiled_comments.csv")
has_positive_score = df_raw['score'] > 0
df_raw_clean = df_raw[has_positive_score]

# df_raw = df_raw_sp.append(df_raw_wssyw).sort_values('season_nbr').reindex()
# df_raw = df_raw_wssyw

In [271]:
# word_tokenize is used below, so it is imported here together with
# sent_tokenize; previously this cell relied on another cell having imported it.
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import OrderedDict

# Scratch check of the duplicate-sentence filter: print the token count of
# every sentence, then the sentences that survive the filter (first occurrence
# of a sentence, or any sentence shorter than 4 tokens).
s = "'' think deserve mention breath ri caramoan ."
t = sent_tokenize(s)
seen = set()
print([len(word_tokenize(sent)) for sent in t])
print([x for x in t if (x not in seen and not seen.add(x)) or (len(word_tokenize(x)) < 4)])

[4, 3, 4, 3, 3, 2, 2, 2]
['testing the best .', 'testing one .', 'testing the best.', 'testing one.', 'testing twice.', 'test.', 'test.', 'test.']


In [338]:
# Show the stop-word list with the negations 'not' and 'dont' filtered out
negations = ['not', 'dont']
print([word for word in stop if word not in negations])

['i', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 

In [272]:
# Stop words dropped by remove_stop(). Order is kept exactly as before.
stop = [
    # pronouns, determiners and question words
    'i', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd",
    'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those',
    # forms of "be" and "have", articles
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'a', 'an', 'the',
    # conjunctions and prepositions
    'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    # adverbs
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'some', 'such',
    # contraction fragments and leftovers
    's', 't', 'will', 'just', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain',
]
from textblob import Word
from collections import OrderedDict
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize
from nltk.tokenize import sent_tokenize

# Contraction lookup used by lem_words(); the file is read inside a context
# manager so the handle is closed once the JSON has been parsed.
with open('./contractions.json') as fp:
    contractions = json.load(fp)
lem = WordNetLemmatizer()

def get_wordnet_pos(sent):
    """Return one WordNet POS constant per token of *sent*.

    The sentence is tokenized and POS-tagged; the first letter of each tag
    selects adjective, noun, verb or adverb. Any other tag falls back to noun.
    """
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    tagged_tokens = pos_tag(word_tokenize(sent))
    return [
        wordnet_by_initial.get(tagged[1][0], wordnet.NOUN)
        for tagged in tagged_tokens
    ]

def remove_copy_paste(x):
    """Drop repeated sentences from *x*, keeping short repeats.

    The first occurrence of every sentence is kept. A later repeat is kept
    only when it has fewer than 8 tokens. The surviving sentences are joined
    with single spaces.
    """
    kept = []
    seen = set()
    for sentence in sent_tokenize(x):
        if sentence not in seen:
            seen.add(sentence)
            kept.append(sentence)
        elif len(word_tokenize(sentence)) < 8:
            kept.append(sentence)
    return " ".join(kept)
def to_lower(x):
    """Lowercase *x* and collapse every run of whitespace to a single space."""
    return " ".join(map(str.lower, x.split()))
def remove_stop(x):
    """Return *x* without the whitespace-separated tokens listed in ``stop``.

    Matching is exact (case-sensitive); the remaining tokens are joined with
    single spaces.
    """
    kept = []
    for token in x.split():
        if token in stop:
            continue
        kept.append(token)
    return " ".join(kept)
def lem_words(x):
    """Expand contractions in *x* and lemmatize every token.

    Apostrophes are stripped from each whitespace-separated word and the
    result is replaced by its entry in ``contractions`` when one exists. The
    expanded text is then tokenized, and each token is lemmatized with the
    POS returned for it by ``get_wordnet_pos``.
    """
    expanded_words = []
    for raw_word in x.split():
        bare_word = raw_word.replace("'", '')
        expanded_words.append(contractions.get(bare_word, bare_word))
    contr = " ".join(expanded_words)

    token_pos_pairs = zip(word_tokenize(contr), get_wordnet_pos(contr))
    return " ".join(lem.lemmatize(token, pos) for token, pos in token_pos_pairs)

# Clean every comment: lowercase -> expand contractions and lemmatize ->
# strip punctuation and digits -> drop stop words.
col = (
    df_raw_clean['comment']
    .apply(to_lower)
    .apply(lem_words)
    # The pattern is a regular expression, so regex=True must be explicit:
    # pandas >= 2.0 treats the pattern as a literal string by default.
    .str.replace(r'[^\w\s]|[\d]', '', regex=True)
    .apply(remove_stop)
)
# .values drops the index so the cleaned text is assigned by position.
df_clean_tmp = df_raw_clean.assign(comment_clean=col.values)

In [273]:
# One row per season: all cleaned comments of that season joined into a
# single space-separated string.
season_text = df_clean_tmp.groupby('season_nbr')['comment_clean'].apply(" ".join)

df_comb = pd.DataFrame(columns=['comment'])
df_comb['comment'] = season_text
# Expose the group key as a regular column as well.
df_comb['season_nbr'] = df_comb.index

df_clean = df_comb

In [270]:
# Inspect the value at row position 23, column position 0 of df_comb
# (column 0 is 'comment', the first column created above).
df_comb.iloc[23,0]



In [252]:
import spacy
from spacy.lang.en import English

from nltk import ne_chunk, pos_tag, word_tokenize

def getPropNouns(data):
    """Return the lowercased first token of every PERSON chunk in *data*.

    The text is tokenized, POS-tagged and passed through ``ne_chunk``; only
    chunks that carry a label equal to "PERSON" contribute to the result.
    """
    tree = ne_chunk(pos_tag(word_tokenize(data)))
    return [
        node[0][0].lower()
        for node in tree
        if hasattr(node, 'label') and node.label() == "PERSON"
    ]

def getIgnoreWords(data):
    """Return the lowercased tokens of *data* that are not tagged as nouns.

    A token counts as a non-noun when the first letter of its POS tag maps to
    adjective, verb or adverb; every other tag is treated as a noun.
    """
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return [
        tagged[0].lower()
        for tagged in pos_tag(word_tokenize(data))
        if pos_by_initial.get(tagged[1][0], wordnet.NOUN) != wordnet.NOUN
    ]
        
# Every non-noun token found in the raw comments (all comments joined into one text)
non_nouns = getIgnoreWords(" ".join(df_raw['comment']))

# PERSON entities per comment, then a tally of how often each one occurs
df_raw['entity'] = df_raw['comment'].apply(getPropNouns)
propNounsTemp = {}
for row in df_raw['entity']:
    for item in row:
        if item in propNounsTemp:
            propNounsTemp[item] += 1
        else:
            propNounsTemp[item] = 1

In [253]:
# Keep entities seen more than 3 times that never occur as a non-noun token.
# non_nouns is a list with one entry per non-noun token of the raw comments,
# so it is turned into a set once instead of being scanned for every candidate.
non_noun_lookup = set(non_nouns)
propNouns = [
    key
    for key, count in propNounsTemp.items()
    if count > 3 and key not in non_noun_lookup
]

In [254]:
# Token frequencies over all seasons, computed once and reused below
word_counts = pd.Series(' '.join(df_clean['comment']).split()).value_counts()
# Tokens that occur at most once
rare_words = word_counts[word_counts <= 1].index.values.tolist()
# The 100 most frequent tokens
freq_words = word_counts[:100].index.values.tolist()

In [323]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

drop_terms = rare_words
# vec = CountVectorizer(ngram_range=(1,4), analyzer='word')
# TF-IDF over 1- to 3-grams, one document per season
vec = TfidfVectorizer(ngram_range=(1,3), analyzer='word', max_df=0.99)
X = vec.fit_transform(df_clean['comment'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
df_tmp = pd.DataFrame(X.toarray(), index=df_clean['season_nbr'], columns=feature_names)
# Transpose to a term-by-season matrix
tdm = pd.DataFrame(df_tmp.transpose())
tdm = tdm[~tdm.index.isin(drop_terms)]
tdm.index.name = 'word'
# Total weight per term across all seasons (computed before the non-season
# columns below are added)
tdm['sum'] = tdm.sum(axis=1)
# Among the 26 highest-weighted terms of season 24, drop those whose whole
# weight comes from season 24. The mask is built on the sliced frame so it is
# aligned with the rows it filters.
top_24 = tdm.sort_values(24.0, ascending=False)[:26]
only_in_24 = top_24[top_24[24.0] == top_24['sum']].index.tolist()
tdm = tdm[~tdm.index.isin(only_in_24)]
# TextBlob polarity / subjectivity of each term
blob = [TextBlob(word) for word in tdm.index.tolist()]
tdm['sentiment'] = [float(b.sentiment.polarity) for b in blob]
tdm['subjective'] = [float(b.sentiment.subjectivity) for b in blob]
tdm.sort_values('sum', ascending=False).head(25)

  


season_nbr,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,32.0,33.0,34.0,35.0,36.0,37.0,38.0,sum,sentiment,subjective
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
twist,0.001229,0.005154,0.041085,0.008981,0.032081,0.028532,0.148138,0.003715,0.007844,0.026867,...,0.076192,0.014642,0.096248,0.070499,0.08794,0.005823,0.134861,1.792758,0.0,0.0
favorite,0.016387,0.045832,0.067062,0.023956,0.0256,0.039729,0.03474,0.04764,0.06237,0.073508,...,0.059658,0.02003,0.024688,0.0,0.0,0.029872,0.005535,1.521394,0.5,1.0
winner,0.007372,0.037798,0.033615,0.005388,0.076637,0.024456,0.021163,0.007429,0.045102,0.057316,...,0.084213,0.036604,0.044687,0.016269,0.0,0.005823,0.021578,1.469009,0.0,0.0
story,0.022115,0.015463,0.00747,0.061068,0.01604,0.012228,0.038093,0.02786,0.06079,0.06269,...,0.076192,0.048805,0.017187,0.010846,0.009257,0.023292,0.0,1.309145,0.0,0.0
know,0.057746,0.022335,0.02241,0.035922,0.030298,0.02038,0.029628,0.05572,0.017649,0.023285,...,0.026066,0.019522,0.034374,0.021692,0.018514,0.0,0.021578,1.202057,0.0,0.0
island,0.013866,0.007051,0.00958,0.005528,0.016457,0.029274,0.185281,0.015245,0.008048,0.012864,...,0.008229,0.005007,0.003527,0.0,0.056984,0.011949,0.0,1.095331,0.0,0.0
rank,0.024573,0.025771,0.026145,0.025146,0.023169,0.026494,0.026806,0.035289,0.023532,0.028658,...,0.032081,0.024403,0.006875,0.005423,0.013885,0.0,0.005394,1.087317,-0.8,0.9
well,0.030254,0.017628,0.00958,0.022114,0.0256,0.050184,0.030398,0.040018,0.06237,0.027565,...,0.039086,0.04757,0.045848,0.0,0.023744,0.0,0.005535,1.07831,0.0,0.0
want,0.051602,0.044671,0.01494,0.034126,0.017823,0.01019,0.025395,0.046433,0.025493,0.035823,...,0.028071,0.036604,0.037812,0.021692,0.0,0.011646,0.010789,1.071544,0.0,0.0
come,0.049145,0.013745,0.033615,0.02335,0.023169,0.034646,0.028217,0.066864,0.01961,0.019703,...,0.032081,0.024403,0.020625,0.021692,0.018514,0.0,0.021578,1.05002,0.0,0.0


In [325]:
def df_empty(columns, dtypes, index=None):
    """Build a DataFrame whose columns are created from empty typed Series.

    Args:
        columns: column names, in order.
        dtypes: one dtype per column; must have the same length as *columns*.
        index: optional index passed to the DataFrame constructor.

    Returns:
        The new DataFrame.
    """
    assert len(columns)==len(dtypes)
    frame = pd.DataFrame(index=index)
    for position, name in enumerate(columns):
        frame[name] = pd.Series(dtype=dtypes[position])
    return frame
def _sentiment_label(polarity):
    """Map a polarity score to one of the five tone labels used for colouring."""
    if polarity >= 0.5:
        return 'very positive'
    if polarity > 0.15:
        return 'positive'
    if polarity <= -0.5:
        return 'very negative'
    if polarity < -0.05:
        return 'negative'
    return 'neutral'


# np.str was only an alias of the builtin str and has been removed from NumPy,
# so the builtin is used for the text columns.
ranked_columns = ['season', 'season_name', 'rank', 'word', 'sentiment']
ranked_dtypes = [np.float64, str, np.float64, str, str]

# For every season column of tdm, keep its 26 highest-weighted terms together
# with their 0-based rank and tone label.
ranked = df_empty(columns=ranked_columns, dtypes=ranked_dtypes)
for col in tdm.drop(columns=['sentiment','subjective','sum']).columns:
    # Sort once per season and reuse it for both the words and their sentiment
    ordered = tdm.sort_values(col, ascending=False)
    df_new = df_empty(columns=ranked_columns, dtypes=ranked_dtypes)
    df_new['word'] = ordered.index.tolist()
    df_new['rank'] = df_new.index
    df_new['rank'] = pd.to_numeric(df_new['rank'])
    df_new['season'] = float(col)
    df_new['season_name'] = seasons.get(str(int(col)))
    df_new['sentiment'] = ordered['sentiment'].apply(_sentiment_label).tolist()
    ranked = pd.concat([ranked, df_new[:26]])
ranked

Unnamed: 0,season,season_name,rank,word,sentiment
0,1.0,Borneo,0.0,borneo,neutral
1,1.0,Borneo,1.0,different,neutral
2,1.0,Borneo,2.0,order,neutral
3,1.0,Borneo,3.0,watch season,neutral
4,1.0,Borneo,4.0,television,neutral
...,...,...,...,...,...
21,38.0,Edge of\nExtinction,21.0,returnees,neutral
22,38.0,Edge of\nExtinction,22.0,idol,neutral
23,38.0,Edge of\nExtinction,23.0,fest,neutral
24,38.0,Edge of\nExtinction,24.0,low ranking,neutral


In [326]:
from plotnine import *

# Create graphic
# Colours shared by the theme below
color_background = "white"
color_text = "#22211d"

# Black-and-white base theme with every override collected in one theme() call
my_theme = theme_bw(base_size=15) + theme(
    # Background colours
    panel_background=element_rect(fill=color_background, color=color_background),
    plot_background=element_rect(fill=color_background, color=color_background),
    panel_border=element_rect(color=color_background),
    strip_background=element_rect(fill=color_background, color=color_background),

    # Grid: no horizontal grid lines, no axis ticks
    panel_grid_major_y=element_blank(),
    panel_grid_minor_y=element_blank(),
    axis_ticks=element_blank(),

    # Legend is hidden
    legend_position="none",

    # Title and axis labels
    plot_title=element_text(color=color_text, size=20, weight="bold"),
    axis_title_x=element_text(size=10, color="black", weight="bold"),
    axis_title_y=element_text(size=10, color="black", weight="bold"),
    axis_text_x=element_text(size=10, vjust=0.5, hjust=0.5, color=color_text),
    axis_text_y=element_text(size=12, color=color_text),
    strip_text=element_text(face="bold"),

    # Figure size
    figure_size=(18, 6),
)

# Colour per key "1.0" .. "10.0".
# NOTE(review): not referenced anywhere else in the visible notebook;
# presumably an alternative fill palette keyed by season number — confirm.
cmap_era = {"1.0":"#F70020",
            "2.0":"#1A7D00",
            "3.0":"#0C96F2",
            "4.0":"#FB9701",
            "5.0":"#636666",
            "6.0":"#87603E",
            "7.0":"#BFBEBB",
            "8.0":"#FB9701",
            "9.0":"#636666",
            "10.0":"#87603E"
           }
# Fill colour for each tone label stored in ranked['sentiment'];
# passed to scale_fill_manual() in base_fig().
cmap_tone = {"very positive":"#4CAF50","positive":"#8BC34A","neutral":"#B3B6B7","negative":"#FFC107","very negative":"#F4511E"}

In [327]:
def base_fig(data, fill_col):
    """Build the word-ranking plot for the seasons contained in *data*.

    Args:
        data: frame with the columns ``season_name``, ``rank`` and ``word``
            plus the column named by *fill_col*.
        fill_col: name of the column mapped to the label fill colour.

    Returns:
        The plot: one point and one filled word label per row, with words
        listed in ``propNouns`` drawn again as grey-on-white labels.
    """
    # Keep the seasons on the x axis in their order of appearance.
    ordered_names = data['season_name'].unique().tolist()
    data = data.assign(
        season_name_ord=pd.Categorical(data['season_name'], categories=ordered_names)
    )
    proper_noun_rows = data.loc[data['word'].isin(propNouns)]

    plot = ggplot(data=data, mapping=aes(x='season_name_ord', y='rank'))
    plot = plot + geom_point(color="black")
    # Every word, filled by fill_col
    plot = plot + geom_label(data=data,
                             mapping=aes(label='word', fill=fill_col),
                             color='white',
                             size=10,
                             label_padding=0.15)
    # Proper nouns are overdrawn without a fill colour
    plot = plot + geom_label(data=proper_noun_rows,
                             mapping=aes(label='word'),
                             fill='white',
                             color='grey',
                             size=10,
                             label_padding=0.15)
    plot = plot + scale_y_reverse(limits=[1,25], breaks=range(25,0,-1))
    plot = plot + labs(x="Season", y="Ranking",title="Most Characteristic Words Used to Describe a Season")
    plot = plot + scale_fill_manual(cmap_tone)
    return plot

In [328]:
# Seasons 1-10
in_first_10 = ranked['season'] < 11
first_10 = ranked[in_first_10]
fig1 = base_fig(first_10, 'sentiment') + my_theme
ggsave(plot=fig1, filename='seasons1to10.png')

  self.data = self.geom.handle_na(self.data)
  self.data = self.geom.handle_na(self.data)
  self.data = self.geom.handle_na(self.data)


In [329]:
# Seasons 11-20
in_second_10 = (ranked['season'] > 10) & (ranked['season'] < 21)
second_10 = ranked[in_second_10]
fig2 = base_fig(second_10, 'sentiment') + my_theme
ggsave(plot=fig2, filename='seasons11to20.png')

  self.data = self.geom.handle_na(self.data)
  self.data = self.geom.handle_na(self.data)
  self.data = self.geom.handle_na(self.data)


In [330]:
# Seasons 21-30
in_third_10 = (ranked['season'] > 20) & (ranked['season'] < 31)
third_10 = ranked[in_third_10]
fig3 = base_fig(third_10, 'sentiment') + my_theme
ggsave(plot=fig3, filename='seasons21to30.png')

  self.data = self.geom.handle_na(self.data)
  self.data = self.geom.handle_na(self.data)
  self.data = self.geom.handle_na(self.data)


In [331]:
# Seasons 31 and later
fourth_10 = ranked[(ranked['season']>30)]
fig4 = (base_fig(fourth_10, 'sentiment') + my_theme )
# Save fig4: previously fig1 (seasons 1-10) was written under this filename.
ggsave(plot=fig4, filename='seasons31to38.png')

  self.data = self.geom.handle_na(self.data)
  self.data = self.geom.handle_na(self.data)
  self.data = self.geom.handle_na(self.data)
