In [17]:
import pandas as pd
from bs4 import BeautifulSoup

In [18]:
df_original = pd.read_csv("hackernews.csv", parse_dates=["timestamp"])
df_original

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,The worst case for kickstarter is that they fa...,,AJ007,,1347981310,2012-09-18 15:15:10+00:00,comment,4538506,4537746.0,,,
1,,,I&#x27;d have to say the same for Facebook. Mo...,,jorgecurio,,1456001084,2016-02-20 20:44:44+00:00,comment,11141867,11141158.0,,,
2,,,&#62; You can't get San Francisco running effi...,,Cushman,,1359100263,2013-01-25 07:51:03+00:00,comment,5114503,5114256.0,,,
3,,,&gt;illegal to transmit encrypted data over th...,,aoeuasdf,,1633726031,2021-10-08 20:47:11+00:00,comment,28803994,28803811.0,,,
4,,,I actually think ARkit will be big precisely b...,,erikpukinskis,,1501892529,2017-08-05 00:22:09+00:00,comment,14933475,14932842.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4802,,,Or split into steam-the-platform-the-company a...,,jrkatz,,1564069670,2019-07-25 15:47:50+00:00,comment,20526386,20525938.0,,,
4803,,,This isn&#x27;t as serious of a proposal as so...,,yajoe,,1387069124,2013-12-15 00:58:44+00:00,comment,6907828,6907262.0,,,
4804,,,Ex coworkers. Last time I got a job tradition...,,VLM,,1378900686,2013-09-11 11:58:06+00:00,comment,6366593,6365783.0,,,
4805,,,I predict that 0.10.x will be with us for a lo...,,erichocean,,1389925140,2014-01-17 02:19:00+00:00,comment,7073895,7073044.0,,,


In [19]:
def extract_prediction(text):
    soup = BeautifulSoup(text)
    text = soup.text
    text = " ".join(text.split())
    pred_idx = text.find("I predict")
    dot_idx = text[pred_idx:].find(".")

    return text[pred_idx : pred_idx + dot_idx + 1]

In [20]:
df = df_original.copy()
df["prediction"] = df["text"].apply(extract_prediction)
df["year"] = df["timestamp"].dt.year
df = df[df["prediction"] != ""]

In [21]:
# Original extractor was too simple
# Patching dataframe using a new extractor, but without changing the order of the predictions to keep the JSON IDs
from collections import Counter
import re

def extract_prediction_regex(text):
    text = " ".join(text.split())
    text = text.replace("<p>", ". ").replace("i.e.", "ie").replace("e.g.", "eg")
    soup = BeautifulSoup(text)
    text = soup.text
    regex = re.compile(
        r"I predict .*?(?:[.!?](?=\s|\.|.\s)|$)"
    )  # https://stackoverflow.com/questions/73355650/how-to-extract-a-sentence-with-start-marker/73355899#73355899
    matches = regex.findall(text)

    # Edge cases
    if matches:
        prediction = matches[0]
        if prediction.endswith("?") or prediction.endswith(":.") or Counter(prediction)["\""] % 2 != 0:
            return ""
        if (prediction.endswith(").") or prediction.endswith(".)")) and "(" not in prediction:
            return prediction[:-2] + "."
        
        return prediction
    else:
        return ""

df2 = df_original.copy()
df2["prediction"] = df2["text"].apply(extract_prediction_regex)
df2["year"] = df2["timestamp"].dt.year
df2 = df2[df2["prediction"] != ""]
df = df.merge(df2, how="left", on="id", suffixes=(None, "_new"))
df.loc[~df["prediction_new"].isna(), "prediction"] = df.loc[~df["prediction_new"].isna()]["prediction_new"] 
df = pd.concat([df, df2[~df2["id"].isin(df["id"])]])

In [22]:
df.describe()

Unnamed: 0,score,time,id,parent,descendants,ranking,deleted,year,score_new,time_new,parent_new,descendants_new,ranking_new,deleted_new,year_new
count,26.0,4780.0,4780.0,4754.0,24.0,0.0,0.0,4780.0,24.0,4547.0,4523.0,22.0,0.0,0.0,4547.0
mean,4.115385,1485680000.0,15452130.0,15477440.0,5.166667,,,2016.602301,3.875,1485888000.0,15471030.0,5.0,,,2016.608313
std,3.614714,123736500.0,9532073.0,9527034.0,7.069018,,,3.931751,3.663124,123114400.0,9486637.0,7.138094,,,3.910498
min,1.0,1175584000.0,8462.0,8407.0,-1.0,,,2007.0,1.0,1175584000.0,8407.0,-1.0,,,2007.0
25%,1.25,1388594000.0,6995582.0,7015414.0,0.75,,,2014.0,1.0,1389014000.0,7048344.0,0.25,,,2014.0
50%,2.5,1504817000.0,15195150.0,15246390.0,3.0,,,2017.0,2.0,1504626000.0,15208470.0,3.0,,,2017.0
75%,6.25,1591880000.0,23488230.0,23489180.0,6.5,,,2020.0,4.0,1591172000.0,23415650.0,6.0,,,2020.0
max,13.0,1658561000.0,32201210.0,32201040.0,31.0,,,2022.0,13.0,1658561000.0,32201040.0,31.0,,,2022.0


In [23]:
rnd = df.sample(1)
print(
    f"\"{rnd['prediction'].values[0]}\" -{rnd['by'].values[0]} ({rnd['year'].values[0]})"
)

"I predict a decent chance this will be the dominant branch in under a year." -jerf (2014)


In [24]:
import json

generate = True

if generate:
    for i, d in enumerate(
        df[["id", "prediction", "by", "year"]].to_dict(orient="records")
    ):
        with open(f"../predictions/{i}", "w") as f:
            try:
                json.dump(d, f, ensure_ascii=False, indent=4)
            except UnicodeEncodeError as e:
                print(f"{i}\t{e}")


2617	'charmap' codec can't encode character '\ufff8' in position 90: character maps to <undefined>
