In [69]:
from pathlib import Path

import pandas as pd
import polars as pl

In [70]:
# Read the three raw LIAR CSV splits (test, train, validation) into polars
# DataFrames, in that order.
test_raw, train_raw, valid_raw = (
    pl.read_csv(csv_path)
    for csv_path in (
        '../LIAR/data/test_raw.csv',
        '../LIAR/data/train_raw.csv',
        '../LIAR/data/valid_raw.csv',
    )
)

In [71]:
# Columns holding the per-rating counts; each is forced to Int64 in every
# split.
COUNT_COLUMNS = [
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
]

# One cast expression covers all count columns, applied to each split in turn.
# The cast replaces the columns in place, so column order is unchanged.
# NOTE(review): presumably needed so the three frames share an identical
# schema before they are concatenated -- confirm against the CSV dtypes.
test_raw, train_raw, valid_raw = (
    split.with_columns(pl.col(COUNT_COLUMNS).cast(pl.Int64))
    for split in (test_raw, train_raw, valid_raw)
)

In [72]:
# Stack the three splits (test, train, valid -- in that order) into a single
# frame and collapse `label` to two classes: 'true' and 'mostly-true' map to
# 'true', every other value maps to 'false'.
#
# The mapping is written as a native polars expression rather than a per-row
# Python lambda, so it is evaluated by the query engine instead of calling
# back into Python for every row. Rows whose label is missing are passed
# through as null instead of being forced into the 'false' class.
data = pl.concat([test_raw, train_raw, valid_raw]).with_columns(
    pl.when(pl.col('label').is_null())
    .then(pl.col('label'))
    .when(pl.col('label').is_in(['true', 'mostly-true']))
    .then(pl.lit('true'))
    .otherwise(pl.lit('false'))
    .alias('label')
)

In [73]:
# Sanity check: list the distinct values left in `label` after binarisation.
data.select(
    [pl.col('label').unique()]
)

label
str
"""true"""
"""false"""


In [74]:
# Preview the first five rows of the combined frame.
data.head(5)

id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,str
"""11972.json""","""true""","""Building a wal...","""immigration""","""rick-perry""","""Governor""","""Texas""","""republican""",30,30,42,23,18,"""Radio intervie..."
"""11685.json""","""false""","""Wisconsin is o...","""jobs""","""katrina-shankl...","""State represen...","""Wisconsin""","""democrat""",2,1,0,0,0,"""a news confere..."
"""11096.json""","""false""","""Says John McCa...","""military,veter...","""donald-trump""","""President-Elec...","""New York""","""republican""",63,114,51,37,61,"""comments on AB..."
"""5209.json""","""false""","""Suzanne Bonami...","""medicare,messa...","""rob-cornilles""","""consultant""","""Oregon""","""republican""",1,1,3,1,1,"""a radio show"""
"""9524.json""","""false""","""When asked by ...","""campaign-finan...","""state-democrat...",,"""Wisconsin""","""democrat""",5,7,2,2,7,"""a web video"""


In [75]:
# Keep only the binary label and the raw statement text; the remaining
# metadata columns are dropped from this view.
selected_data = data.select(['label', 'statement'])
selected_data.head(5)

label,statement
str,str
"""true""","""Building a wal..."
"""false""","""Wisconsin is o..."
"""false""","""Says John McCa..."
"""false""","""Suzanne Bonami..."
"""false""","""When asked by ..."


In [76]:
from preprocess import text_preprocess

In [77]:
# Feed the `statement` column through `text_preprocess` (imported from the
# `preprocess` module) and keep its result for the next cell.
processed_text = text_preprocess(
    selected_data.get_column('statement')
)

100%|██████████| 12791/12791 [00:04<00:00, 2833.37it/s]


In [78]:
# Attach the preprocessed tokens as a new `processed_text` column and rename
# the raw `statement` column to `text`.
#
# The Series is named explicitly when it is built, rather than being added
# unnamed and then renamed from the empty-string default name, so the cell
# does not depend on how polars labels an anonymous Series.
processed_selected_data = (
    selected_data
    .with_columns(pl.Series('processed_text', processed_text))
    .rename({'statement': 'text'})
)

In [79]:
# Display the processed frame (label, raw text, token list) to eyeball the
# result of the preprocessing step.
processed_selected_data

label,text,processed_text
str,str,list[str]
"""true""","""Building a wal...","[""build"", ""wall"", ... ""year""]"
"""false""","""Wisconsin is o...","[""wisconsin"", ""pace"", ... ""year""]"
"""false""","""Says John McCa...","[""say"", ""john"", ... ""vet""]"
"""false""","""Suzanne Bonami...","[""suzanne"", ""bonamici"", ... ""senior""]"
"""false""","""When asked by ...","[""ask"", ""reporter"", ... ""yes""]"
"""true""","""Over the past ...","[""past"", ""year"", ... ""employee""]"
"""true""","""Says that Tenn...","[""say"", ""tennessee"", ... ""tax""]"
"""false""","""Says Vice Pres...","[""say"", ""vice"", ... ""package""]"
"""true""","""Donald Trump i...","[""donald"", ""trump"", ... ""want""]"
"""false""","""We know that m...","[""know"", ""half"", ... ""foundation""]"


In [80]:
# Export the processed dataset as CSV, gzip-compressed parquet and pickle
# under ./datasets/, printing each path as it is written.
name = "liar"
out_dir = "./datasets"

# Create the output directory up front so a fresh checkout does not fail on
# the first write.
Path(out_dir).mkdir(parents=True, exist_ok=True)

text_df = processed_selected_data.to_pandas()

# NOTE(review): `processed_text` holds a list per row; CSV can only store it
# as text, so consumers that need the token lists should presumably read the
# parquet or pickle file instead -- confirm how the CSV is used downstream.
csv_path = f"{out_dir}/{name}.csv"
text_df.to_csv(csv_path, index=False)
print("File created", csv_path)

# The file name advertises gzip, so request that codec explicitly; without the
# argument pandas falls back to its default codec (snappy).
parquet_path = f"{out_dir}/{name}.parquet.gzip"
text_df.to_parquet(parquet_path, index=False, compression="gzip")
print("File created", parquet_path)

pickle_path = f"{out_dir}/{name}.pkl"
text_df.to_pickle(pickle_path)
print("File created", pickle_path)

print({"shape": text_df.shape})
print("=================================")


File created ./datasets/liar.csv
File created ./datasets/liar.parquet.gzip
File created ./datasets/liar.pkl
{'shape': (12791, 3)}


In [81]:
# Summary statistics of the per-row list length of `processed_text`,
# i.e. how many tokens each statement has after preprocessing.
(
    processed_selected_data
    .select(pl.col('processed_text').arr.lengths())
    .describe()
)

describe,processed_text
str,f64
"""count""",12791.0
"""null_count""",0.0
"""mean""",9.710109
"""std""",5.841837
"""min""",1.0
"""max""",301.0
"""median""",9.0


In [82]:
# Display the processed frame once more.
# NOTE(review): this is the same expression as the In [79] cell and the frame
# is not reassigned in between, so this cell looks redundant -- confirm and
# consider removing it.
processed_selected_data

label,text,processed_text
str,str,list[str]
"""true""","""Building a wal...","[""build"", ""wall"", ... ""year""]"
"""false""","""Wisconsin is o...","[""wisconsin"", ""pace"", ... ""year""]"
"""false""","""Says John McCa...","[""say"", ""john"", ... ""vet""]"
"""false""","""Suzanne Bonami...","[""suzanne"", ""bonamici"", ... ""senior""]"
"""false""","""When asked by ...","[""ask"", ""reporter"", ... ""yes""]"
"""true""","""Over the past ...","[""past"", ""year"", ... ""employee""]"
"""true""","""Says that Tenn...","[""say"", ""tennessee"", ... ""tax""]"
"""false""","""Says Vice Pres...","[""say"", ""vice"", ... ""package""]"
"""true""","""Donald Trump i...","[""donald"", ""trump"", ... ""want""]"
"""false""","""We know that m...","[""know"", ""half"", ... ""foundation""]"
