In [79]:
import os
import pandas as pd

In [3]:
DATA_PATH = './data/'
DATA_PATH = os.path.abspath('./data/')

## Read Data

In [20]:
# transform label
def transform_dataframe_label(df: pd.DataFrame) -> pd.DataFrame:
    label_map = {
        "true": "true",
        "false": "false",
        "half-true": "true",
        "pants-fire": "false",
        "barely-true": "false",
        "mostly-true": "true",
    }

    df["label"] = df["label"].apply(lambda x: label_map[x])
    return df


In [24]:
files = list(filter(lambda x: x.endswith('.tsv') ,os.listdir(DATA_PATH)))

for file in files:
    # Read and rename fields
    df = pd.read_csv(os.path.join(DATA_PATH, file), sep='\t', header=None)
    df.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']

    # transform raw file and save into csv format
    df.to_csv(os.path.join(DATA_PATH, file.replace('.tsv', '_raw.csv')), index=False, header=True)
    
    df = transform_dataframe_label(df)
    
    # transform and save into csv format
    df.to_csv(os.path.join(DATA_PATH, file.replace('.tsv', '.csv')), index=False, header=True)
    

## Text Preprocessing

https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing

In [72]:
# Extract text data
df_text = df[['statement']].astype('str')
df_text.columns = ['text']

In [73]:
# Lower casing
df_text['text'] = df_text['text'].str.lower()
df_text.head()

Unnamed: 0,text
0,building a wall on the u.s.-mexico border will...
1,wisconsin is on pace to double the number of l...
2,says john mccain has done nothing to help the ...
3,suzanne bonamici supports a plan that will cut...
4,when asked by a reporter whether hes at the ce...


In [80]:
# remove word in list
def remove_words(text: str, removal_str: 'set[str]', is_char:bool = False) -> str:
    if is_char:
        return "".join([char for char in list(text) if char not in removal_str])
    return " ".join([word for word in text.split() if word not in removal_str])

from collections import Counter

cnt = Counter()
for text in df_text["text"].values:
    for word in text.split():
        cnt[word] += 1


In [86]:
import string
from nltk.corpus import stopwords

n_rare_words = 10

# Preprocess pipeline the text form dataframe
def text_preprocess(texts: 'pd.Series[str]') -> 'pd.Series[str]':
    # Remove punctuation
    PUNCT_TO_REMOVE = string.punctuation
    texts = texts.apply(
        lambda text: remove_words(text, set(PUNCT_TO_REMOVE), is_char=True)
    )

    STOPWORDS = set(stopwords.words("english"))
    FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
    RAREWORDS = set([w for (w, wc) in cnt.most_common()[: -n_rare_words - 1 : -1]])

    removal_pipeline = [STOPWORDS, FREQWORDS, RAREWORDS]

    for words in removal_pipeline:
        texts = texts.apply(lambda text: remove_words(text, words))

    return texts


In [88]:
df_text['text'] = text_preprocess(df_text['text'])
df_text

Unnamed: 0,text
0,building wall usmexico border take literally
1,wisconsin pace double number layoffs
2,john mccain done nothing help vets
3,suzanne bonamici supports plan cut choice medi...
4,asked reporter whether hes center criminal sch...
...,...
1262,budget provides highest funding level history ...
1263,ive almost every day
1264,early 1980s sen kennedy offered help leaders r...
1265,epa permit new epa director got done two days


In [90]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df_text["text_stemmed"] = df_text["text"].apply(lambda text: stem_words(text))
df_text.head()


Unnamed: 0,text,text_stemmed
0,building wall usmexico border take literally,build wall usmexico border take liter
1,wisconsin pace double number layoffs,wisconsin pace doubl number layoff
2,john mccain done nothing help vets,john mccain done noth help vet
3,suzanne bonamici supports plan cut choice medi...,suzann bonamici support plan cut choic medicar...
4,asked reporter whether hes center criminal sch...,ask report whether he center crimin scheme vio...


In [94]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df_text['text_lemmatized'] = df_text['text'].apply(lambda text: lemmatize_words(text))
df_text.head()

Unnamed: 0,text,text_stemmed,text_lemmatized
0,building wall usmexico border take literally,build wall usmexico border take liter,building wall usmexico border take literally
1,wisconsin pace double number layoffs,wisconsin pace doubl number layoff,wisconsin pace double number layoff
2,john mccain done nothing help vets,john mccain done noth help vet,john mccain done nothing help vet
3,suzanne bonamici supports plan cut choice medi...,suzann bonamici support plan cut choic medicar...,suzanne bonamici support plan cut choice medic...
4,asked reporter whether hes center criminal sch...,ask report whether he center crimin scheme vio...,asked reporter whether he center criminal sche...


## Spacy

In [111]:
# Extract text data
df_text = df[['statement']].astype('str')
df_text.columns = ['text']

In [114]:
import numpy as np
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')

In [124]:
text = df_text['text'][64]
text

'Obamacare insurance cooperative failures should be expected because theyre like any business, and when you start businesses in America, at the fifth year, half of the businesses have closed.'

In [127]:
doc = nlp(text)
olist = []
for token in doc:
    l = [token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_]
    olist.append(l)
    
odf = pd.DataFrame(olist)
odf.columns= ["Text", "StartIndex", "Lemma", "IsPunctuation", "IsSpace", "WordShape", "PartOfSpeech", "POSTag"]
odf


Unnamed: 0,Text,StartIndex,Lemma,IsPunctuation,IsSpace,WordShape,PartOfSpeech,POSTag
0,Obamacare,0,obamacare,False,False,Xxxxx,ADJ,JJ
1,insurance,10,insurance,False,False,xxxx,NOUN,NN
2,cooperative,20,cooperative,False,False,xxxx,ADJ,JJ
3,failures,32,failure,False,False,xxxx,NOUN,NNS
4,should,41,should,False,False,xxxx,AUX,MD
5,be,48,be,False,False,xx,AUX,VB
6,expected,51,expect,False,False,xxxx,VERB,VBN
7,because,60,because,False,False,xxxx,SCONJ,IN
8,they,68,they,False,False,xxxx,PRON,PRP
9,re,72,re,False,False,xx,VERB,VBP
