In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random 
from pathlib import Path
import jellyfish

In [2]:
# Current working directory of the kernel; the posts directory is located relative to it.
cwd = os.getcwd()

In [3]:
# Directory containing the post files, kept as a plain string.
# Built with os.path.join instead of a "\p..." literal: "\p" is an invalid escape
# sequence in a non-raw string, and a hard-coded backslash only works on Windows.
post_path = os.path.join(cwd, "parler_2020-01-06_posts-partial", "posts")

Let's take three random samples of 1,000 filenames each from the list of files; we can then check each sample to see whether the files are all the same.

In [5]:
%%time
# Sample WITHOUT replacement: random.choices can return the same filename more than
# once, and duplicate names collapse into a single key once the sample is turned into
# a dict, silently shrinking it. k is clamped so a small directory does not raise.
post_files = os.listdir(post_path)
rand_samp_1 = random.sample(post_files, k=min(1000, len(post_files)))

CPU times: total: 2.73 s
Wall time: 2.78 s


In [6]:
%%time
# Sample WITHOUT replacement: random.choices can return the same filename more than
# once, and duplicate names collapse into a single key once the sample is turned into
# a dict, silently shrinking it. k is clamped so a small directory does not raise.
post_files = os.listdir(post_path)
rand_samp_2 = random.sample(post_files, k=min(1000, len(post_files)))

CPU times: total: 2.53 s
Wall time: 2.54 s


In [7]:
%%time
# Sample WITHOUT replacement: random.choices can return the same filename more than
# once, and duplicate names collapse into a single key once the sample is turned into
# a dict, silently shrinking it. k is clamped so a small directory does not raise.
post_files = os.listdir(post_path)
rand_samp_3 = random.sample(post_files, k=min(1000, len(post_files)))

CPU times: total: 2.55 s
Wall time: 2.55 s


Next, for each of the random samples, we read every sampled file's text into a dictionary keyed by filename, using a dict comprehension.

In [12]:
%%time 
# Map each sampled filename to the UTF-8 text of that file.
# The path is joined with pathlib rather than a literal "\\" so it works on any OS.
rand_files_1 = {
    file: (Path(post_path) / file).read_text(encoding="utf-8")
    for file in rand_samp_1
}

CPU times: total: 156 ms
Wall time: 153 ms


In [9]:
%%time 
# Map each sampled filename to the UTF-8 text of that file.
# The path is joined with pathlib rather than a literal "\\" so it works on any OS.
rand_files_2 = {
    file: (Path(post_path) / file).read_text(encoding="utf-8")
    for file in rand_samp_2
}

CPU times: total: 406 ms
Wall time: 4.95 s


In [10]:
%%time 
# Map each sampled filename to the UTF-8 text of that file.
# The path is joined with pathlib rather than a literal "\\" so it works on any OS.
rand_files_3 = {
    file: (Path(post_path) / file).read_text(encoding="utf-8")
    for file in rand_samp_3
}

CPU times: total: 453 ms
Wall time: 5.11 s


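Then let's build a DataFrame from each sample and clean it up before calculating distances using the Hamming distance. Hamming distance is far from the best general-purpose string metric: it only counts position-by-position mismatches (jellyfish's implementation additionally counts any extra characters in the longer string as mismatches). It is, however, very fast, and it is likely to be good enough here since we're just trying to figure out whether the files are identical: identical files have a distance of exactly 0.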

In [14]:
# One frame per sample: filenames become the index, file text lands in a single column (0).
rand_df_1, rand_df_2, rand_df_3 = (
    pd.DataFrame.from_dict(sample_texts, orient="index")
    for sample_texts in (rand_files_1, rand_files_2, rand_files_3)
)

In [16]:
# Give the single text column (default label 0) the name "html" in every sample frame.
rand_df_1, rand_df_2, rand_df_3 = (
    frame.rename(columns={0: "html"})
    for frame in (rand_df_1, rand_df_2, rand_df_3)
)

In [26]:
%%time

def _hamming_to_first(html: pd.Series) -> pd.Series:
    """Return the Hamming distance from every document in ``html`` to the first one.

    Parameters
    ----------
    html : pd.Series
        Document texts; the first element is used as the reference.

    Returns
    -------
    pd.Series
        One distance per row, sharing ``html``'s index. Empty if ``html`` is empty.
    """
    if html.empty:
        # No reference document exists, so there is nothing to compare.
        return pd.Series(index=html.index, dtype="int64")
    # Look the reference up once instead of re-extracting it for every row.
    reference = html.iloc[0]
    return html.apply(lambda text: jellyfish.hamming_distance(text, reference))


# each time comparing them to the first element in the list of files; if they're identical, the dist should be 0 across all
rand_df_1["comp_dist"] = _hamming_to_first(rand_df_1["html"])
rand_df_2["comp_dist"] = _hamming_to_first(rand_df_2["html"])
rand_df_3["comp_dist"] = _hamming_to_first(rand_df_3["html"])

CPU times: total: 172 ms
Wall time: 177 ms


Finally, let's look at those distance values. Because the distances aren't all 0, we can clearly see that the files aren't all identical. If we wanted to be more rigorous, there are some simple statistical checks we could run, but for our purposes this is sufficient to say that the files aren't all identical.

In [30]:
# How often each distance-to-first-file value occurs in sample 1
# (if every file were identical, the only value would be 0).
rand_df_1["comp_dist"].value_counts()

10258    59
9729      5
9558      4
9648      4
9453      4
         ..
9316      1
10926     1
11202     1
9297      1
8517      1
Name: comp_dist, Length: 756, dtype: int64

In [31]:
# How often each distance-to-first-file value occurs in sample 2
# (if every file were identical, the only value would be 0).
rand_df_2["comp_dist"].value_counts()

10730    70
10192     5
10210     5
10180     5
10202     5
         ..
9815      1
12070     1
10755     1
9717      1
9810      1
Name: comp_dist, Length: 714, dtype: int64

In [32]:
# How often each distance-to-first-file value occurs in sample 3
# (if every file were identical, the only value would be 0).
rand_df_3["comp_dist"].value_counts()

11617    43
10766     5
10950     5
10577     5
10792     5
         ..
10482     1
16387     1
11117     1
7056      1
10714     1
Name: comp_dist, Length: 664, dtype: int64