# Data cleaning and exploratory data analysis

## Preparations

In [1]:
import pandas as pd
from skimpy import skim

In [2]:
history = pd.read_csv("../data/history.csv")
alternative_history = pd.read_csv("../data/alternative_history.csv")

## Common columns

Our databases do not have the same number of features--119 vs 123--and because some subreddits may moderate some features to be constant, we arbitrarily select only the columns we think are useful in this classification task.
<br>
Some examples of conflicting features in our datasets include the `removal_reason` and `author_flair_text` columns.

In [3]:
print(history["removal_reason"].unique())
print(alternative_history["removal_reason"].unique())

[nan]
[nan 'legal']


In [4]:
print(history["author_flair_text"].unique())
print(alternative_history["author_flair_text"].unique())

["I've been called many things, but never fun." nan
 'Waiting for the Roman Empire to reform' 'Four Time Hero of /r/History'
 'History of Witchcraft' 'Chief Technologist, Fleet Admiral '
 'Probably the handsomest person here' 'Quite the arrogant one.'
 'Supreme Allied Commander' 'What, were you expecting something witty?'
 'history ' 'Hic sunt dracones' 'archaeologist of new, week 28']
[nan]


Our selection of columns includes:

In [5]:
common_columns = [
    "selftext",
    "gilded",
    "title",
    "downs",
    "thumbnail_height",
    "upvote_ratio",
    "ups",
    "thumbnail_width",
    "score",
    "author_premium",
    "edited",
    "is_self",
    "created",
    "domain",
    "allow_live_comments",
    "no_follow",
    "over_18",
    "spoiler",
    "locked",
    "distinguished",
    "author",
    "num_comments",
    "send_replies",
    "stickied",
    "url",
    "num_crossposts"
]

And we check if we can use these columns in our analysis--in other words, we check if they have the same nominal and ordinal values and similar numeric values.

In [6]:
for column in common_columns:
    print(f"{column} -- {history[column].nunique()} -- {alternative_history[column].nunique()}")

selftext -- 511 -- 518
gilded -- 7 -- 3
title -- 1805 -- 1892
downs -- 1 -- 1
thumbnail_height -- 81 -- 86
upvote_ratio -- 56 -- 80
ups -- 1468 -- 588
thumbnail_width -- 2 -- 2
score -- 1468 -- 588
author_premium -- 2 -- 2
edited -- 221 -- 90
is_self -- 2 -- 2
created -- 1935 -- 1905
domain -- 443 -- 160
allow_live_comments -- 2 -- 2
no_follow -- 2 -- 2
over_18 -- 2 -- 2
spoiler -- 2 -- 2
locked -- 2 -- 2
distinguished -- 1 -- 1
author -- 921 -- 669
num_comments -- 763 -- 264
send_replies -- 2 -- 2
stickied -- 2 -- 2
url -- 1933 -- 1894
num_crossposts -- 23 -- 12


For now, for further analysis, we reduce our datasets to only those columns. Later, we will use transformers to properly modify the datasets.

In [7]:
history = history[common_columns]
alternative_history = alternative_history[common_columns]

Next, we conduct a correlation analysis.

## Correlation analysis

In [8]:
history_correlation_matrix = history.select_dtypes("number").corr()
alternative_history_correlation_matrix = alternative_history.select_dtypes("number").corr()

In [9]:
history_correlation_matrix

Unnamed: 0,gilded,downs,thumbnail_height,upvote_ratio,ups,thumbnail_width,score,created,num_comments,num_crossposts
gilded,1.0,,0.046968,0.051037,0.225373,0.007946,0.225373,-0.109017,0.14273,0.114415
downs,,,,,,,,,,
thumbnail_height,0.046968,,1.0,-0.064668,-0.102524,0.09407,-0.102524,0.092982,-0.113776,0.011264
upvote_ratio,0.051037,,-0.064668,1.0,0.150993,0.020325,0.150993,-0.150364,0.06752,0.128215
ups,0.225373,,-0.102524,0.150993,1.0,-0.044717,1.0,-0.641972,0.508919,0.143386
thumbnail_width,0.007946,,0.09407,0.020325,-0.044717,1.0,-0.044717,0.039182,-0.011508,0.019635
score,0.225373,,-0.102524,0.150993,1.0,-0.044717,1.0,-0.641972,0.508919,0.143386
created,-0.109017,,0.092982,-0.150364,-0.641972,0.039182,-0.641972,1.0,-0.596421,0.055959
num_comments,0.14273,,-0.113776,0.06752,0.508919,-0.011508,0.508919,-0.596421,1.0,-0.020672
num_crossposts,0.114415,,0.011264,0.128215,0.143386,0.019635,0.143386,0.055959,-0.020672,1.0


In [10]:
alternative_history_correlation_matrix

Unnamed: 0,gilded,downs,thumbnail_height,upvote_ratio,ups,thumbnail_width,score,created,num_comments,num_crossposts
gilded,1.0,,0.000531,0.060992,0.087049,0.001603,0.087049,-0.078723,0.041897,0.09621
downs,,,,,,,,,,
thumbnail_height,0.000531,,1.0,0.098515,0.24892,0.041744,0.24892,-0.119981,0.14805,0.049409
upvote_ratio,0.060992,,0.098515,1.0,0.440137,-0.005284,0.440137,-0.563886,0.206804,0.180516
ups,0.087049,,0.24892,0.440137,1.0,0.017049,1.0,-0.412276,0.538785,0.376692
thumbnail_width,0.001603,,0.041744,-0.005284,0.017049,1.0,0.017049,-0.018335,0.016421,0.011147
score,0.087049,,0.24892,0.440137,1.0,0.017049,1.0,-0.412276,0.538785,0.376692
created,-0.078723,,-0.119981,-0.563886,-0.412276,-0.018335,-0.412276,1.0,-0.132865,-0.214778
num_comments,0.041897,,0.14805,0.206804,0.538785,0.016421,0.538785,-0.132865,1.0,0.293982
num_crossposts,0.09621,,0.049409,0.180516,0.376692,0.011147,0.376692,-0.214778,0.293982,1.0


In [11]:
for i in range(len(history_correlation_matrix.columns)):
    for j in range(i+1, len(history_correlation_matrix.columns)):
        correlation = history_correlation_matrix.iloc[i, j]
        if correlation >= 0.7:
            print(f"Correlation between columns '{history_correlation_matrix.columns[i]}' and '{history_correlation_matrix.columns[j]}' is {correlation}")

Correlation between columns 'ups' and 'score' is 1.0


In [12]:
for i in range(len(alternative_history_correlation_matrix.columns)):
    for j in range(i+1, len(alternative_history_correlation_matrix.columns)):
        correlation = alternative_history_correlation_matrix.iloc[i, j]
        if correlation >= 0.7:
            print(f"Correlation between columns '{alternative_history_correlation_matrix.columns[i]}' and '{alternative_history_correlation_matrix.columns[j]}' is {correlation}")

Correlation between columns 'ups' and 'score' is 1.0


Next, we examine each column.

## Summary statistics

In [13]:
skim(history)

In [14]:
skim(alternative_history)

From our summary statistics, we learn that our distributions may need to be normalized.

## Column analysis

### `ups` and `score`

Since both columns are identical, we decide to only use `score`

In [15]:
column = "score"
print(history[column].unique())
print(alternative_history[column].unique())

[   54 22382  4591 ...  6764  2852 10358]
[ 178  198  195  176  317  187  511  566  148   62    0  823  174   32
  214  568   15    2   52  280   92  841  299  379  284   13  277  225
  384  202  179  304  345  167  467  294    1  308   34   26   16    4
  279  832   23  246   10  200    7   11  224    5  319   17   54   14
   42  229  492  278  201  565   77  172  296   48  838   81  173   12
    3  270  184   20   35  194  688   78  570  183  196  634  234  715
  322  180  596  268   47  562    9  346  276  249  654 2700  351  316
  182  522  435  723  111   56  264  395  245  282   30  231  641  514
   19  129   61   28  451  192   46  320  675  150 2641  438  748  454
    6  783    8  321  275  175   18   57  934   31  104   84  215  191
  177  273  353   43  269  413  220  226   25   66  589   33  329  238
  845  190  212  577  303  484  330  448  849  486  302  408  325  186
  430  406  263  441   21   36  288 2012  102  651  101  959  503  477
 1334  218  250  137  341  375  189

Given this is a numeric column and we can observe some variance in both datasets, we keep it.

### `selftext`

In [16]:
column = "selftext"
print(history[column].sample(5))
print(alternative_history[column].sample(5))

373                                                   NaN
160                                                   NaN
1179    Hi everybody,\n\nWelcome to our weekly book re...
1578                                                  NaN
194                                                   NaN
Name: selftext, dtype: object
1710                                                  NaN
135                                                   NaN
1711    Of all the advances made today, know that none...
968                                                   NaN
490                                                   NaN
Name: selftext, dtype: object


In [17]:
history[column].isnull().sum()
alternative_history[column].isnull().sum()

1386

`selftext` is a crucial column. We can use NLP techniques to help us distill this data into an accurate subreddit categorization. However, do note that many posts do not have `selftext`s. In those cases, other columns, including the mandatory `title` will help us determine a post's category.

### `gilded`

In [18]:
column = "gilded"
print(history[column].unique())
print(alternative_history[column].unique())

[0 1 3 2 4 5 6]
[0 2 1]


In [19]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

gilded
0    1842
1      77
2       6
5       4
3       3
4       2
6       1
Name: count, dtype: int64
gilded
0    1893
1      10
2       2
Name: count, dtype: int64


Given this is a numeric column and we can observe some variance in both datasets, we keep it.

### `title`

In [20]:
column = "title"
print(history[column].unique())
print(alternative_history[column].unique())

['A lecture of the Assyrian Empire'
 'With the surprising number of commenters in various subreddits that continue to confidently assert that the Holocaust is a hoax, it’s worth reviewing the prosecution and conviction of Adolf Eichmann.'
 '18 min Video Documentary: Life in Germany after WWII. Produced by the US government to explain to the American people the situation in Germany and how America was driving reconstruction.'
 ... 'Over 2,000 Mummified Sheep Heads Unearthed In Egypt Temple'
 'Excavations carried out in Iraqi Kurdistan have revealed an ancient city that stood at the heart of an unknown kingdom: that of the mountain people, who had until then remained in the shadow of their powerful Mesopotamian neighbours.'
 "Why Julius Caesar's Year of Confusion was the longest year in history"]
["Workers celebrating the establishment of the Australian People's Republic at Sydney Harbour, Oct 7th 1960."
 'Mysterious structure found at bottom of ancient lake in 2013. Possibly 12,000 year

Like `selftext`, `title` is also crucial. The words in `title` could greatly help our classificator. To simplify our preprocessing, we could feature engineer a column by appending `title` to `selftext`.

### `downs`

In [21]:
column = "downs"
print(history[column].unique())
print(alternative_history[column].unique())

[0]
[0]


We will not use the `downs` column because there it has a single value in both datasets.

### `thumbnail_height`

In [22]:
column = "thumbnail_height"
print(history[column].unique())
print(alternative_history[column].unique())

[105.  nan 107. 140.  73.  68.  84.  78.  93.  92.  87.  83. 128.  89.
 101.  72.  74.  71.  80.  95. 100.  86.  98.  62.  97.  70.  91. 131.
  94.  79. 104.  65. 110. 114. 111. 102.  90.  67.  88.  81. 113.  66.
 106. 122.  69.  77. 120.  26. 103.  99.  82. 123.  75. 109.  33.  28.
  64. 115. 112. 116.  53. 127.  51.  59.  40. 108.  46.  76. 119.  36.
  85.  56. 125.  43.  50.  49. 121.  96.  48.  44. 134. 139.]
[140.  78. 105.  56.  51.  75.  nan 108.  63. 100.  36.  64.  72. 131.
  73.  93. 123. 112. 139.  65.  71.  85.  88. 104.  89. 129.  87. 122.
  70. 125. 133. 107.  91.  67.  84.  79. 120.  58.  80.  97. 137.  99.
 124.  92. 117.  94.  90.  68.  98.  76. 128.  96. 106. 126. 118. 109.
  83.  81.  95. 102.  82. 113.  74. 134. 115. 114. 101.  61. 119. 135.
  59. 103.  77.  60.  66. 110. 138. 121.  86. 136.  52.  62.  44. 111.
 132.  57.  47.]


Given this is a numeric column and we can observe some variance in both datasets, we keep it. However, note the nan values. We must address them in the preprocessing stage.

### `thumbnail_width`

In [23]:
column = "thumbnail_width"
print(history[column].unique())
print(alternative_history[column].unique())

[140.  nan  70.]
[140.  nan  70.]


In [24]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

thumbnail_width
140.0    1230
70.0        3
Name: count, dtype: int64
thumbnail_width
140.0    1581
70.0        1
Name: count, dtype: int64


This is a nominal feature and all categories are present in both datasets. However, we drop it because almost 100% of posts with thumbnails have the same width.

### `upvote_ratio`

In [25]:
column = "upvote_ratio"
print(history[column].unique())
print(alternative_history[column].unique())

[0.92 0.86 0.95 0.93 0.87 0.91 0.88 0.9  0.79 0.89 0.83 0.97 0.84 0.96
 0.74 0.85 0.78 0.75 0.94 0.73 1.   0.77 0.98 0.82 0.81 0.8  0.7  0.38
 0.76 0.63 0.62 0.27 0.72 0.66 0.68 0.6  0.64 0.99 0.56 0.35 0.71 0.45
 0.34 0.37 0.65 0.5  0.57 0.31 0.55 0.69 0.52 0.41 0.43 0.58 0.59 0.51]
[0.95 0.94 0.98 0.97 0.99 0.86 0.74 0.71 0.33 0.84 0.9  0.96 0.76 0.62
 0.64 0.77 0.82 1.   0.41 0.32 0.91 0.53 0.56 0.6  0.48 0.69 0.83 0.5
 0.87 0.92 0.63 0.78 0.88 0.73 0.8  0.51 0.49 0.54 0.93 0.59 0.38 0.42
 0.67 0.89 0.72 0.4  0.85 0.81 0.75 0.26 0.36 0.61 0.68 0.45 0.43 0.31
 0.65 0.58 0.27 0.66 0.7  0.57 0.46 0.08 0.79 0.55 0.39 0.47 0.22 0.44
 0.29 0.35 0.52 0.14 0.3  0.37 0.19 0.24 0.13 0.34]


Given this is a numeric column and we can observe some variance in both datasets, we keep it.

### `author_premium`

In [26]:
column = "author_premium"
print(history[column].unique())
print(alternative_history[column].unique())

[False nan True]
[False nan True]


In [27]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

author_premium
False    1635
True      170
Name: count, dtype: int64
author_premium
False    1752
True       83
Name: count, dtype: int64


This is a nominal feature and all categories are present in both datasets. However, there is no documentation available that explains what the null values are, meaning we cannot accurately impute them. Hence, we do not use `author_premium`.

### `edited`

In [28]:
column = "edited"
print(history[column].unique())
print(alternative_history[column].unique())

['False' '1525287103.0' '1499089255.0' '1490490634.0' '1554923443.0'
 '1705976665.0' '1589006419.0' '1496032043.0' '1541718365.0'
 '1585516697.0' '1555075822.0' '1602628810.0' '1510237520.0'
 '1676498687.0' '1541365072.0' '1469134982.0' '1591194069.0'
 '1517337000.0' '1498694041.0' '1589873210.0' '1541633808.0'
 '1700672347.0' '1499103091.0' '1468162376.0' '1550097001.0'
 '1548878107.0' '1547654031.0' '1511285780.0' '1514763328.0'
 '1478535018.0' '1492021117.0' '1473533262.0' '1702511983.0'
 '1588682038.0' '1488099790.0' '1589414285.0' '1490554625.0'
 '1539606915.0' '1516579863.0' '1640368541.0' '1566336064.0'
 '1441505383.0' '1553270204.0' '1553025666.0' '1582488869.0'
 '1499447998.0' '1518723093.0' '1503360658.0' '1512881638.0'
 '1475666354.0' '1679735124.0' '1561741573.0' '1705984905.0'
 '1692737887.0' '1526755500.0' '1471948866.0' '1529352314.0'
 '1547744138.0' '1554236829.0' '1478869026.0' '1462033654.0'
 '1431904731.0' '1474555207.0' '1563894278.0' '1584621182.0'
 '1491653092.0' 

This is a hybrid column, which appears to log `False` is a post was not edited or an UNIX time metric if it was. We decided to keep and use this column for feature engineering.

### `is_self`

In [29]:
column = "is_self"
print(history[column].unique())
print(alternative_history[column].unique())

[False  True]
[False  True]


In [30]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

is_self
False    1311
True      624
Name: count, dtype: int64
is_self
False    1535
True      370
Name: count, dtype: int64


Given this is a nominal feature and all categories are present in both datasets, we keep it.

### `created`

In [31]:
column = "created"
print(history[column].unique())
print(alternative_history[column].unique())

[1.68519530e+09 1.52526470e+09 1.56897747e+09 ... 1.67984531e+09
 1.55343405e+09 1.70922532e+09]
[1.60492640e+09 1.68267237e+09 1.59562903e+09 ... 1.58781678e+09
 1.65505511e+09 1.70692313e+09]


Given this is a numeric column and we can observe some variance in both datasets, we keep it. Furthermore, we could use `created` to feature engineer the times in which the posts were created. We may find something interesting.

### `domain`

In [32]:
column = "domain"
print(history[column].unique())
print(alternative_history[column].unique())

['youtube.com' 'self.history' 'youtu.be' 'historytoday.com'
 'viking.ucla.edu' 'theverge.com' 'theconversation.com' 'nautil.us'
 'cardiff.ac.uk' 'frommers.com' 'msn.com' 'theguardian.com'
 '63percentscottish.com' 'telegraph.co.uk' '1stalabamacavalryusv.com'
 'bbc.com' 'arkeonews.net' 'canadiangeographic.ca' 'english.ahram.org.eg'
 'mentalfloss.com' 'oceanofwisdom.in' 'bbc.co.uk' 'nytimes.com' 'hrw.org'
 'cnn.com' 'zmescience.com' 'sciencemag.org' '27thtankbattalion.com'
 'en.mercopress.com' 'open.spotify.com' 'mymodernmet.com'
 'sciencenewsjournal.com' 'heritagedaily.com' 'mpg.de'
 'clickondetroit.com' 'time.com' 'newsinteractives.cbc.ca'
 'theophthalmologist.com' 'web.archive.org' 'academia.edu'
 'news-artnet-com.cdn.ampproject.org' 'purdue.edu' 'thediplomat.com'
 'phys.org' 'scientificamerican.com' 'magazine.uc.edu'
 'smithsonianmag.com' 'washingtonpost.com' 'livescience.com'
 'edition.cnn.com' 'vice.com' 'nbcnews.com' 'mused.org'
 'huffingtonpost.co.uk' 'aeon.co' 'thecollector.com' 

In [33]:
print(history[column].isna().sum())
print(alternative_history[column].isna().sum())

0
0


This information may help us make more accurate predictions. Hence, we keep it.

### `allow_live_comments`

In [34]:
column = "allow_live_comments"
print(history[column].unique())
print(alternative_history[column].unique())

[False  True]
[False  True]


In [35]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

allow_live_comments
False    1458
True      477
Name: count, dtype: int64
allow_live_comments
False    1243
True      662
Name: count, dtype: int64


Given this is a nominal feature and all categories are present in both datasets, we keep it.

### `no_follow`

In [36]:
column = "no_follow"
print(history[column].unique())
print(alternative_history[column].unique())

[False  True]
[False  True]


In [37]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

no_follow
False    1923
True       12
Name: count, dtype: int64
no_follow
False    1728
True      177
Name: count, dtype: int64


Given this is a nominal feature and all categories are present in both datasets, we keep it.

### `over_18`

In [38]:
column = "over_18"
print(history[column].unique())
print(alternative_history[column].unique())

[False  True]
[False  True]


In [39]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

over_18
False    1928
True        7
Name: count, dtype: int64
over_18
False    1900
True        5
Name: count, dtype: int64


This is a nominal feature and all categories are present in both datasets. However, we drop it because almost 100% of posts are not spoilers.

### `spoiler`

In [40]:
column = "spoiler"
print(history[column].unique())
print(alternative_history[column].unique())

[False  True]
[False  True]


In [41]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

spoiler
False    1934
True        1
Name: count, dtype: int64
spoiler
False    1903
True        2
Name: count, dtype: int64


This is a nominal feature and all categories are present in both datasets. However, we drop it because almost 100% of posts are not spoilers.

### `locked`

In [42]:
column = "locked"
print(history[column].unique())
print(alternative_history[column].unique())

[False  True]
[False  True]


In [43]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

locked
False    1746
True      189
Name: count, dtype: int64
locked
False    1892
True       13
Name: count, dtype: int64


Given this is a nominal feature and all categories are present in both datasets, we keep it.

### `distinguished`

In [44]:
column = "distinguished"
print(history[column].unique())
print(alternative_history[column].unique())

[nan 'moderator']
[nan 'moderator']


In [45]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

distinguished
moderator    15
Name: count, dtype: int64
distinguished
moderator    1
Name: count, dtype: int64


This is a nominal feature and all categories are present in both datasets. However, we drop it because almost 100% of posts are not from moderators.

### `author`

In [46]:
column = "author"
print(history[column].unique())
print(alternative_history[column].unique())

['ByzantineBasileus' nan 'windigo9' 'Nurgleschampion'
 'Extra_Mechanic_2750' 'jamboamericano' 'flobota' 'LifeOfTheUnparty'
 'AutoModerator' 'lalablahblahhaha' 'Justwonderingwhyitis'
 'TheBitcoinShill' 'hanburgundy' 'marketrent' 'Welshhoppo' 'dominiquec'
 'CardiffUni' 'Gurdy0714' 'shotgunsmitty' 'MusicStanMan' 'JDHoare'
 'kdawg_thetruth' 'empalmerro' 'wildeastmofo' 'drinkin_an_stinkin'
 'Prime-Factor' 'QUILTBAGs' 'Agmm-cr' 'AnCanadianHistorian' 'grepnork'
 'Jariiari7' 'Illustrious_Tutor441' 'Commander_Hannibal' 'PooTeeWeet5'
 'Hiversitize' 'cdnhistorystudent' 'BarKnight' 'Tartan_Samurai'
 'paxinfernum' 'AirSky_MC' 'Aryionas' 'FillsYourNiche' 'darrenjyc'
 'Canuhere' 'Gavertamer_' 'Georgy_K_Zhukov' 'eorld' 'InvisibleLemons'
 'ArtOak' 'disse_' 'frostsid' 'chubachus' 'MeatballDom' 'suntzu124'
 'silveryfeather208' 'bak3n3ko' 'vwarb' 'Telecom_VoIP_Fan' 'TainanBoy'
 'detroitbadboy2' 'Diazepam' 'Fun-Engineering761' 'hod_cement_edifices'
 'Demderdemden' 'dittybopper_05H' 'goodoneforyou' 'frantic

This information may help us make more accurate predictions. Hence, we keep it.

### `num_comments`

In [47]:
column = "num_comments"
print(history[column].unique())
print(alternative_history[column].unique())

[   6 1171  392  854   67  743  407  475  103   22  610  428  882   32
 1032   45  680  170    4   35  343  784  289   14  158 1128  733   28
  405  458   20 1286  173   13   47  142    8  145  224  122 6403   10
    2   15    5  119   34   23   31 1191  109  225  150  235 1314  233
  262   41   89  526  324  260   98   24  419  546   58  247  698    9
  549  298  518  390    0  374  308  345   21  474   25  745   76  640
  271  202  416   18   73  404   16  401  444 1405  556  508 1088   11
    7 1321   19  164  632 2008   94    3  988  685  577    1  423   43
  218  101   36   95  901  238  741  381  978  633  422  578  568  151
  141  502   53  138   78  599   77   26  754  134  297  200  239  450
   42 1165  582  766  534  172   39  349 2282  606 1052  264  818   54
 1261  429   91  136 1107  984  670  230  459  133  908  220 1064  836
 1871   37  601   59  307  530  507  167  999  476   30   97 1502  157
  424  683  148  585 1980   71  259 2904   66  140  831   72  187  182
 1530 

Given this is a numeric column and we can observe some variance in both datasets, we keep it.

### `send_replies`

In [48]:
column = "send_replies"
print(history[column].unique())
print(alternative_history[column].unique())

[ True False]
[ True False]


In [49]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

send_replies
True     1522
False     413
Name: count, dtype: int64
send_replies
True     1737
False     168
Name: count, dtype: int64


Given this is a nominal feature and all categories are present in both datasets, we keep it.

### `stickied`

In [50]:
column = "stickied"
print(history[column].unique())
print(alternative_history[column].unique())

[False  True]
[False  True]


In [51]:
print(history[column].value_counts())
print(alternative_history[column].value_counts())

stickied
False    1933
True        2
Name: count, dtype: int64
stickied
False    1904
True        1
Name: count, dtype: int64


This is a nominal feature and all categories are present in both datasets. However, we drop it because almost 100% of posts are not stickied.

### `url`

In [52]:
column = "url"
print(history[column].unique())
print(alternative_history[column].unique())

['https://www.youtube.com/watch?v=GnRNWyf1Rtw'
 'https://www.reddit.com/r/history/comments/8gh2ju/with_the_surprising_number_of_commenters_in/'
 'https://youtu.be/I2arAuvNZYg' ...
 'https://www.ndtv.com/world-news/over-2-000-mummified-sheep-heads-unearthed-in-egypt-temple-3894644/amp/1'
 'https://news.cnrs.fr/articles/a-historical-treasure-bordering-ancient-mesopotamia'
 'https://www.bbc.com/future/article/20240227-how-julius-caesar-made-the-longest-year-in-history-and-brought-us-leap-years']
['https://i.redd.it/wmdvbuqno7y51.jpg'
 'https://edition.cnn.com/2013/04/19/world/meast/israel-ancient-structure-mystery/'
 'https://www.youtube.com/watch?v=yUJGLf2OUT4' ...
 'https://i.redd.it/wm4h2ovsgyu41.jpg'
 'https://thechroniclesofhistory.com/2022/06/12/lost-cities-hidden-for-centuries-were-just-discovered-in-bolivia/'
 'https://www.reddit.com/r/AlternativeHistory/comments/1ahjylj/egyptian_eye_of_horus_hieroglyph_mirrors_cranial/']


This is an interesting column. Unlike `domain`, `url` offers a way to scrape more text for a post with a non-null `url`. This does add more complexity to the model--like expired links and different html templates--but may be worthy to examine in the future. For now, we decide not to use `url`.

### `num_crossposts`

In [53]:
column = "num_crossposts"
print(history[column].unique())
print(alternative_history[column].unique())

[ 3  7  1  0  4  2  5  6  8 10 31 14 21  9 12 11 28 15 42 13 49 25 17]
[ 0  1  2  3  4  5  6  8  7 15  9 11]


Given this is a numeric column and we can observe some variance in both datasets, we keep it.
<br>
Note, from this feature we learn that history posts have much higher outreach than alternative history posts.