accessing processed data and loading into a Pandas DataFrame

In [2]:
import pandas as pd

In [3]:
# Read the processed articles from disk; the JSON file is parsed with the
# 'records' orientation (a list of {column: value} objects).
df = pd.read_json(
    'data/articles.json',
    orient='records',
)

In [4]:
# (rows, columns) of the freshly loaded frame
df.shape

(1262, 4)

In [5]:
# Preview the first rows to check which columns were loaded
df.head()

Unnamed: 0,title,link,text,article_type
0,Children and COVID-19 Vaccination Trends,https://www.aap.org/en/pages/2019-novel-corona...,Summary of data publicly reported by the Cente...,science
1,COVID-19 State-Level Data Reports,https://www.aap.org/en/pages/2019-novel-corona...,"On May 11, 2023, the United States ended the P...",science
2,Prevention Papillomavirus can cause 6 types of...,https://www.cancer.org/cancer/risk-prevention/...,Our highly trained specialists are available 2...,science
3,COVID-19,https://www.lung.org/lung-health-diseases/lung...,Can we help you find more info? Start by selec...,science
4,End Youth Vaping Let\'s join together to end t...,https://www.lung.org/quit-smoking/end-youth-va...,Research – Youth Vaping and Lung Health The Am...,science


In [6]:
# The title and link columns are not relevant to model training, so remove
# them and keep the remaining columns.
df = df.drop(["title", "link"], axis="columns")

In [7]:
# (rows, columns) after dropping the unused columns
df.shape

(1262, 2)

In [8]:
# Class balance: number of articles per article_type label
df['article_type'].value_counts()

article_type
conspiracy    689
science       573
Name: count, dtype: int64

Change the class labels in `article_type` from strings to integers: "science" → 1, "conspiracy" → 0

In [9]:
# Encode the string labels as integers: conspiracy -> 0, science -> 1.
# Series.map turns any label missing from the dict into NaN.
df['article_type'] = df['article_type'].map({"conspiracy": 0, "science": 1})

In [10]:
# Preview the frame after encoding the labels
df.head()

Unnamed: 0,text,article_type
0,Summary of data publicly reported by the Cente...,1
1,"On May 11, 2023, the United States ended the P...",1
2,Our highly trained specialists are available 2...,1
3,Can we help you find more info? Start by selec...,1
4,Research – Youth Vaping and Lung Health The Am...,1


In [11]:
# Class counts after encoding; compare with the counts shown before the mapping
df['article_type'].value_counts()

article_type
0    689
1    573
Name: count, dtype: int64

Create a `TextProcessor` instance to process the article text

In [12]:
# TextProcessor is a project-local class imported from src.model.text_processor.
# Its `process` method is applied to every article text further down and the
# result is stored as 'embedding_vector' — presumably it returns an embedding;
# confirm against that module.
from src.model.text_processor import TextProcessor
text_processor = TextProcessor()

In [13]:
# Drop articles whose text is the empty string.
# NOTE(review): the original comment says this filter was also added to
# pre_processing, so this cell may be redundant — confirm before removing it.
# .copy() makes the filtered result an independent DataFrame; without it, the
# later assignment of the 'embedding_vector' column targets a slice of the
# original frame and pandas emits SettingWithCopyWarning.
df = df[df['text'] != ""].copy()

In [14]:
# (rows, columns) after removing rows with empty text
df.shape

(1234, 2)

In [15]:
# Run every article text through the text processor and store the result
# for each row in a new 'embedding_vector' column.
df['embedding_vector'] = df['text'].apply(text_processor.process)

In [16]:
# Display the frame with the new embedding_vector column (the rendered output is truncated)
df

Unnamed: 0,text,article_type,embedding_vector
0,Summary of data publicly reported by the Cente...,1,"[0.005152532, 0.0041832207, 0.0013592248, 0.02..."
1,"On May 11, 2023, the United States ended the P...",1,"[-0.004177533, -0.009758532, -0.01115001, 0.03..."
2,Our highly trained specialists are available 2...,1,"[-0.011494388, -0.009370449, 0.008532594, 0.01..."
3,Can we help you find more info? Start by selec...,1,"[-0.0018576796, 0.028474232, -0.013181964, 0.0..."
4,Research – Youth Vaping and Lung Health The Am...,1,"[-0.01785805, 0.0006542458, 0.0037484693, 0.03..."
...,...,...,...
1257,We may earn commission from links on this page...,0,"[0.0018057713, 0.022749502, -0.00049236, 0.030..."
1258,"By Norma Erickson February 7, 2011 For Georgio...",0,"[-0.011514382, 0.0060215113, 0.016591966, 0.02..."
1259,Search by keyword Search by Category Search he...,0,"[-0.003980596, 0.012227758, -0.00442995, 0.026..."
1260,Search by keyword Search by Category Search he...,0,"[-0.003980596, 0.012227758, -0.00442995, 0.026..."


serializing the DataFrame for model training

In [17]:
# Persist the frame (text, label and embedding columns) as a pickle file.
# NOTE: pickle files should only be loaded from trusted sources, since
# unpickling can execute arbitrary code.
df.to_pickle(path='data/model/data.pkl')