## 02 EDA- sia

In [33]:
# imports and libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from nltk.sentiment.vader import SentimentIntensityAnalyzer

pd.options.display.max_colwidth = 500

### pickle in all_witcher_5000 df and format

In [2]:
# pickle in all_witcher_5000 dataframe
with open('dataframes/all_witcher_5000.pkl', mode= 'rb') as pickle_in: 
    all_witcher_5000= pickle.load(pickle_in)

In [3]:
all_witcher_5000.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9781 entries, 0 to 4994
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   subreddit     9781 non-null   object 
 1   title         9781 non-null   object 
 2   selftext      9781 non-null   object 
 3   score         9781 non-null   int64  
 4   upvote_ratio  9781 non-null   float64
 5   num_comments  9781 non-null   int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 534.9+ KB


In [11]:
# drop index so range is reflective of current total rows
all_witcher_5000.reset_index(drop= True, inplace= True)

In [12]:
all_witcher_5000

Unnamed: 0,subreddit,title,selftext,score,upvote_ratio,num_comments
0,Witcher3,Why did CD never patch the Wolf set bug?,This bug has always frustrated me. Why couldn'...,1,1.0,0
1,Witcher3,The heart of the woods quest,"As you may remember/ know, the quest has two c...",1,1.0,0
2,Witcher3,Bought the complete edition on PS4,I had this in Xbox but ended up moving and lef...,1,1.0,0
3,Witcher3,This merchant will give you 10 crowns minimum ...,,1,1.0,0
4,Witcher3,When will my winter berry’s grow back,Ice tried everything saving and quitting skipp...,1,1.0,0
...,...,...,...,...,...,...
9776,netflixwitcher,Ciri waiting for her skype partner to show up ...,,1,1.0,0
9777,netflixwitcher,Freya Allan,Has anyone seen Freya Allan's Instagram? I'm a...,1,1.0,13
9778,netflixwitcher,Some will say the artistic direction of the sh...,,1,1.0,69
9779,netflixwitcher,Some will say the artistic direction of the sh...,&amp;#x200B;\n\nhttps://preview.redd.it/25979h...,1,1.0,2


### sentiment intensity analyzer

In [52]:
sia = SentimentIntensityAnalyzer()

# caculate sia scores for all titles in df
scores = [sia.polarity_scores(each) for each in all_witcher_5000['title']]
sia_scores_df = pd.DataFrame(scores)

# concatenate sia_scores and all_witcher_5000 df
all_witcher_sia_df = pd.concat([all_witcher_5000.drop(columns= ['selftext']), sia_scores_df], axis=1, ignore_index= False)

In [54]:
# inspired by https://www.geeksforgeeks.org/python-sentiment-analysis-using-vader/

# function to give qualitative score representing compound value
def sia_rating(compound_score):
    if compound_score >= -0.05 and compound_score <= 0.05:
        rating= 'neutral'
    elif compound_score >= 0.8:
        rating= 'very positive'
    elif compound_score <= -0.8:
        rating= 'very negative'
    elif compound_score < 0.8 and compound_score > 0.05:
        rating= 'postive'
    else:
        rating= 'negative'
    return rating

all_witcher_sia_df['sia_rating']= all_witcher_sia_df['compound'].map(sia_rating)

In [55]:
# make new columns showing neu, neg, and pos columns as # out of 100 instead of # out of 1

all_witcher_sia_df['neg_%']= 100* all_witcher_sia_df['neg']
all_witcher_sia_df['neu_%']= 100* all_witcher_sia_df['neu']
all_witcher_sia_df['pos_%']= 100* all_witcher_sia_df['pos']

all_witcher_sia_df.drop(columns= ['neu', 'neg', 'pos'])

In [57]:
# averages by subreddit

all_witcher_sia_df.drop(columns= ['neu', 'neg', 'pos']).groupby(by= ['subreddit']).mean()

Unnamed: 0_level_0,score,upvote_ratio,num_comments,compound,neg_%,neu_%,pos_%
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Witcher3,3.022755,0.997122,5.102094,0.082109,7.33528,79.82209,12.701228
netflixwitcher,10.510903,0.984816,10.591485,0.100108,4.282783,85.961412,9.693479


In [60]:
# averages by sia_rating

all_witcher_sia_df.drop(columns= ['neu', 'neg', 'pos']).groupby(by= ['sia_rating']).mean()

Unnamed: 0_level_0,score,upvote_ratio,num_comments,compound,neg_%,neu_%,pos_%
sia_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
negative,4.718281,0.988895,9.474761,-0.411616,33.027149,64.002319,2.969509
neutral,8.334513,0.992376,7.40684,5.5e-05,0.20169,99.384178,0.217531
postive,5.334121,0.990422,7.879177,0.461054,1.674452,65.884981,32.440634
very negative,1.169492,0.981186,3.101695,-0.851639,41.157627,56.179661,2.657627
very positive,2.068293,0.986146,6.0,0.863742,1.048293,59.877073,39.072683


In [88]:
# make new df of extreme and neutral ratings 

all_witcher_sia_neut= all_witcher_sia_df[all_witcher_sia_df['sia_rating'] == 'neutral'] # new df of all neutral posts
all_witcher_sia_vpos= all_witcher_sia_df[all_witcher_sia_df['sia_rating'] == 'very positive'] # new df of all very positive posts
all_witcher_sia_vneg= all_witcher_sia_df[all_witcher_sia_df['sia_rating'] == 'very negative'] # new df of all very positive posts

all_witcher_sia_vneg.sort_values(by= ['score'], ascending= False).head(50)

Unnamed: 0,subreddit,title,score,upvote_ratio,num_comments,ratioed,neg,neu,pos,compound,sia_rating,neg_%,neu_%,pos_%
9243,netflixwitcher,"Sorry, but this was the first thing came in my mind, when I heard that people criticized and even hated the season because of the confusing timeline.",12,0.7,10,0,0.363,0.637,0.0,-0.9109,very negative,36.3,63.7,0.0
10,Witcher3,"That look of someone who won't treat Geralt like a dog anymore, who won't manipulate him, who won't have someone else to take out their anger and frustration with, who won't disagree with an opinion just for the sake of disagreeing, who will never decide Corvo Bianco Profits Finance.",1,1.0,0,0,0.227,0.687,0.085,-0.8168,very negative,22.7,68.7,8.5
4880,Witcher3,"Im trying deathmarch but I changed the difficulty to broken bones. It was just in the settings menu, will it have an impact on the achvievement?",1,0.99,4,1,0.24,0.76,0.0,-0.8047,very negative,24.0,76.0,0.0
3520,Witcher3,Sometimes we must join forces with those who insult us and growl at us to fight the common enemy,1,1.0,2,1,0.353,0.564,0.083,-0.802,very negative,35.3,56.4,8.3
3603,Witcher3,Right after I finished lifting the curse from Uldaryk(forgot the spelling) where I had to trick the hym by doing something I regret. Gerald’s face started having some weird scars . Why’s that?,1,1.0,3,1,0.269,0.731,0.0,-0.802,very negative,26.9,73.1,0.0
3638,Witcher3,"First try, killed a lvl 64 Grave hag being lvl 41 with gear for lvl 34 on Death March with the rised enemies lvl :D",1,1.0,3,1,0.381,0.51,0.108,-0.8798,very negative,38.1,51.0,10.8
3691,Witcher3,I never thought that the toughest enemy on death march are gonna be fucking rats,1,1.0,7,1,0.442,0.558,0.0,-0.8271,very negative,44.2,55.8,0.0
3974,Witcher3,Man too angry to die spotted on Novigrad.,1,1.0,1,0,0.545,0.455,0.0,-0.802,very negative,54.5,45.5,0.0
4076,Witcher3,finished this game for the first time but I killed Ciri and Geralt now I’m dead inside lol,1,1.0,12,1,0.408,0.468,0.124,-0.8885,very negative,40.8,46.8,12.4
4210,Witcher3,"When you kill the Brewess and the Weavess says ""You will regret that!""",1,1.0,1,0,0.415,0.585,0.0,-0.8313,very negative,41.5,58.5,0.0


### ratioed and controversial posts

In [61]:
# make column to look at whethere a post is ratioed, 1 is controversial, 0 is ok
all_witcher_sia_df['ratioed']= np.where((all_witcher_sia_df['num_comments'] > all_witcher_sia_df['score']), 1, 0)

In [73]:
# averages by whether post is ratioed

all_witcher_sia_df.drop(columns= ['neu', 'neg', 'pos']).groupby(by= ['ratioed']).mean(['score', 'num_comments'])

Unnamed: 0_level_0,score,upvote_ratio,num_comments,compound,neg_%,neu_%,pos_%
ratioed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,9.711977,0.995024,0.832066,0.092258,6.096784,82.026489,11.718124
1,1.245027,0.98386,20.49092,0.088626,5.351888,84.332517,10.315249


In [63]:
# averages by subredit

all_witcher_sia_df.drop(columns= ['neu', 'neg', 'pos']).groupby(by= ['subreddit']).mean()

Unnamed: 0_level_0,score,upvote_ratio,num_comments,ratioed,compound,neg_%,neu_%,pos_%
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Witcher3,3.022755,0.997122,5.102094,0.265807,0.082109,7.33528,79.82209,12.701228
netflixwitcher,10.510903,0.984816,10.591485,0.446314,0.100108,4.282783,85.961412,9.693479


In [71]:
# make new df of posts marked as ratioed or controversial with corresponding sia info

all_witcher_sia_ratio= pd.concat([all_witcher_sia_df.drop(columns= ['neu', 'neg', 'pos'])], axis=1, ignore_index= False)
all_witcher_sia_ratio= all_witcher_sia_ratio[all_witcher_sia_ratio['ratioed'] == 1] # keep only controverial posts
all_witcher_sia_ratio= all_witcher_sia_ratio[all_witcher_sia_ratio['upvote_ratio'] <= 0.5] # keep only controverial posts

all_witcher_sia_ratio.shape # how many posts have been marked as ratioed or controversial

(52, 11)

In [89]:
all_witcher_sia_ratio.sort_values(by= 'subreddit')

Unnamed: 0,subreddit,title,score,upvote_ratio,num_comments,ratioed,compound,sia_rating,neg_%,neu_%,pos_%
3104,Witcher3,Can't find any silver swords,0,0.5,7,1,0.0,neutral,0.0,100.0,0.0
3113,Witcher3,How can they get the cast so right and so wrong at the same time? Lord of the Rings and GoT were amazing because the cast was perfect. Witcher is already at a disadvantage because of the awful casting decisions,0,0.5,219,1,-0.3163,negative,19.1,66.3,14.6
3132,Witcher3,Yennefer or Triss?,0,0.46,26,1,0.0,neutral,0.0,100.0,0.0
3134,Witcher3,Lost eye of Nehaleni!,0,0.5,9,1,-0.3802,negative,46.4,53.6,0.0
3137,Witcher3,Is this normally the amount of xp I'd get?,0,0.33,7,1,0.3818,postive,0.0,75.5,24.5
3144,Witcher3,Gwent hate post? Really? I started playing more so to play gwent then actually complete the story.. you accept Henry Cavill but not gwent?!?! The blasphemy.,0,0.42,3,1,0.543,postive,7.8,69.5,22.7
4911,Witcher3,Triss Merigold by breepng,0,0.5,1,1,0.0,neutral,0.0,100.0,0.0
8656,netflixwitcher,My husband drew this last night and I thought you all might like it as much as I do,0,0.5,24,1,0.3612,postive,0.0,86.5,13.5
8759,netflixwitcher,Jaskier being a bi icon for 7 minutes straight (with wii music),0,0.25,2,1,0.2263,postive,0.0,82.6,17.4
8762,netflixwitcher,Joey Batey looks too familiar,0,0.5,1,1,0.0,neutral,0.0,100.0,0.0
