# Initial Dataset Exploration (Republican Opinion)

Import necessary modules:

In [1]:
# Python STL
from pathlib import Path

# Data Analysis
import numpy as np
import pandas as pd

# Data Visualisation
import plotly.express as px

Create necessary `Path` objects:

In [2]:
root_dir = Path().cwd().parent
data_dir = root_dir / "data"

Discover datasets in `data/` directory:

In [3]:
print("CSV Files: ")
for csv in data_dir.glob("*.csv"):
    print(csv)

CSV Files: 
/Users/jakegodsall/Documents/dev/data-science/reddit-political-opinion/data/reddit_opinion_democrats.csv
/Users/jakegodsall/Documents/dev/data-science/reddit-political-opinion/data/reddit_opinion_republican.csv


## Republican Opinion Dataset Initial Exploration

Load the dataset into a `pandas.DataFrame`:

In [5]:
rep_df = pd.read_csv(data_dir / "reddit_opinion_republican.csv", delimiter=",")

rep_df.head()

Unnamed: 0,comment_id,score,self_text,subreddit,created_time,post_id,author_name,controversiality,ups,downs,...,user_link_karma,user_comment_karma,user_total_karma,post_score,post_self_text,post_title,post_upvote_ratio,post_thumbs_ups,post_total_awards_received,post_created_time
0,kc0w1oa,1,"You're right, how could I forget about such a ...",politics,2023-12-04 23:42:36,18avilb,MinimumApricot365,0,1,0,...,1.0,1605.0,1606.0,124,,House GOP touts Hunter Biden payments; Counsel...,0.87,124,0,2023-12-04 21:57:26
1,kc0w1d4,1,they need more bootstraps,politics,2023-12-04 23:42:32,18ax8d5,pkinetics,0,1,0,...,622.0,106124.0,112104.0,16,,Divided Michigan GOP fractures further amid bi...,0.94,16,0,2023-12-04 23:11:27
2,kc0w0s3,1,I think this is said about every President goi...,trump,2023-12-04 23:42:26,18atl5a,PacknPaddle,0,1,0,...,1384.0,2030.0,3444.0,42,,BREAKING: Former Republican Rep. Liz Cheney sa...,0.84,42,0,2023-12-04 20:34:01
3,kc0w07d,1,Jesus how I hate Sanderson and the concept of ...,changemyview,2023-12-04 23:42:19,18apnui,Mu-Relay,0,1,0,...,1.0,193664.0,194613.0,0,If you don't want to read everything please st...,CMV: Most complaints about diversity in fantas...,0.44,0,0,2023-12-04 17:47:57
4,kc0vzx4,1,"He’ll break the space time continuum, re-ensla...",trump,2023-12-04 23:42:16,18atl5a,Cambionr,0,1,0,...,6658.0,69965.0,76802.0,42,,BREAKING: Former Republican Rep. Liz Cheney sa...,0.84,42,0,2023-12-04 20:34:01


Determine size of the dataset:

In [6]:
print("Number of rows: ", rep_df.size)

Number of rows:  5839368


Check for empty values:

In [7]:
rep_df.isna().sum()

comment_id                         0
score                              0
self_text                          2
subreddit                          0
created_time                       0
post_id                            0
author_name                        0
controversiality                   0
ups                                0
downs                              0
user_is_verified                4851
user_account_created_time       4851
user_awardee_karma                 9
user_awarder_karma                 9
user_link_karma                    9
user_comment_karma                 9
user_total_karma                   9
post_score                         0
post_self_text                194710
post_title                         0
post_upvote_ratio                  0
post_thumbs_ups                    0
post_total_awards_received         0
post_created_time                  0
dtype: int64

Determining values of categorical variables:

In [8]:
rep_df.loc[:, "score"].value_counts()

score
 1       42995
 2       28780
 3       21891
 5       11559
 4       11273
         ...  
 743         1
 1127        1
 1100        1
 1400        1
-81          1
Name: count, Length: 1560, dtype: int64

In [9]:
rep_df.loc[:, "subreddit"].unique()

array(['politics', 'trump', 'changemyview', 'WhitePeopleTwitter',
       'AskReddit', 'VoteDEM', 'Conservative', 'neoliberal',
       'Political_Revolution', 'RepublicanValues', 'PoliticalHumor',
       'conspiracy', 'democrats', 'uspolitics', 'Republican',
       'Republican_misdeeds', 'AskThe_Donald', 'Libertarian',
       'ShitPoliticsSays', 'WayOfTheBern', 'conservatives',
       'ConservativesOnly', 'news', 'progressive', 'Republican_memes',
       'SandersForPresident', 'EnoughTrumpSpam', 'republicanmemes',
       'ChristianDemocrat', 'ConservativeDemocrat'], dtype=object)

In [10]:
rep_df.loc[:, "subreddit"].value_counts()

subreddit
politics                105201
VoteDEM                  26826
WhitePeopleTwitter       20088
PoliticalHumor           14259
democrats                12350
neoliberal                9653
Conservative              7140
ShitPoliticsSays          5187
changemyview              5146
Libertarian               4657
conspiracy                4004
Republican                3324
uspolitics                2857
Political_Revolution      2758
trump                     2726
EnoughTrumpSpam           2503
progressive               2272
AskThe_Donald             2182
news                      1887
RepublicanValues          1654
Republican_misdeeds       1511
SandersForPresident       1477
WayOfTheBern               985
conservatives              903
AskReddit                  651
ConservativesOnly          329
ChristianDemocrat          306
republicanmemes            246
Republican_memes           113
ConservativeDemocrat       112
Name: count, dtype: int64

In [11]:
rep_df.loc[:, "controversiality"].unique()

array([0, 1])

In [12]:
rep_df.describe()

Unnamed: 0,score,controversiality,ups,downs,user_awardee_karma,user_awarder_karma,user_link_karma,user_comment_karma,user_total_karma,post_score,post_upvote_ratio,post_thumbs_ups,post_total_awards_received
count,243307.0,243307.0,243307.0,243307.0,243298.0,243298.0,243298.0,243298.0,243298.0,243307.0,243307.0,243307.0,243307.0
mean,23.965225,0.027952,23.965225,0.0,1063.599487,446.42344,27726.57,99678.22,128914.8,2688.096906,0.89826,2688.096906,0.0
std,145.425713,0.164837,145.425713,0.0,4294.144843,3303.531337,229528.4,188540.5,324801.3,4611.441057,0.152656,4611.441057,0.0
min,-301.0,0.0,-301.0,0.0,0.0,0.0,0.0,-100.0,-99.0,0.0,0.05,0.0,0.0
25%,2.0,0.0,2.0,0.0,22.0,0.0,39.0,7955.0,9340.0,110.0,0.9,110.0,0.0
50%,5.0,0.0,5.0,0.0,220.0,0.0,744.0,32905.0,38433.0,756.0,0.95,756.0,0.0
75%,14.0,0.0,14.0,0.0,778.0,100.0,5702.0,110298.0,127610.0,3365.0,0.97,3365.0,0.0
max,11631.0,1.0,11631.0,0.0,634187.0,655609.0,13954100.0,3840682.0,14921860.0,36207.0,1.0,36207.0,0.0


In [13]:
rep_df.loc[:, "controversiality"].value_counts()

controversiality
0    236506
1      6801
Name: count, dtype: int64

In [14]:
fig = px.histogram(data_frame=rep_df, x="score", log_y=True)
fig.show()