# Process data and prepare it for Hugging Face

In [1]:
import os

import boto3
import numpy as np
import pandas as pd
import sagemaker

# Resolve the AWS region from the default boto3 session.
boto_session = boto3.Session()
region = boto_session.region_name
# Amazon Comprehend client created for that region.
comprehend = boto3.client('comprehend', region_name=region)
# SageMaker session and the execution role attached to this environment.
# NOTE(review): comprehend, sgmk_session and sgmk_role are not referenced by the
# cells visible in this section — confirm they are needed further down.
sgmk_session = sagemaker.Session()
sgmk_role = sagemaker.get_execution_role()

In [2]:
# S3 location of the labelled stories used below for training and validation.
ctstories = "s3://funnybones/rural/topics/CTstories.csv"

In [3]:
# Load the labelled stories directly from S3.
# NOTE(review): pandas reads s3:// URLs through the optional s3fs package —
# confirm it is installed in this kernel.
df1 = pd.read_csv(ctstories)

In [4]:
# Class balance of the raw labels, before any categories are merged or dropped.
df1["category"].value_counts()

sport          176
politics       124
arts            98
health          96
lifestyle       90
crime           73
society         69
ignore          46
business        35
realestate      30
human           29
accident        28
environment     26
education       23
science         15
labour          13
military        12
weather          5
transport        4
religion         1
Name: category, dtype: int64

# Low observation categories

Some of the original categories do not have sufficient observations for modelling: [religion, weather, military, labour, science, transport]

For the purposes of the POC, I am collapsing these categories using the following logic:

* religion -> society
* weather -> lifestyle
* military -> politics
* labour -> politics
* science -> education
* transport -> politics

In the process of labelling I identified 46 articles that could not be classified and should be ignored. These were predominantly Letters sections that contained mixed topic content.



In [5]:
# Fold each low-observation category into a broader one (same pairs as the
# list above). One dict-based replace does the work of six separate
# full-column passes; values that are not keys, including missing categories,
# pass through unchanged.
df1["category"] = df1["category"].replace({
    "religion": "society",
    "weather": "lifestyle",
    "military": "politics",
    "labour": "politics",
    "transport": "politics",
    "science": "education",
})

In [6]:
# Leave out the articles tagged "ignore"; everything else stays a candidate for training.
df_trainer = df1.loc[df1["category"].ne("ignore")]

In [7]:
# Rows remaining after removing the "ignore" articles.
len(df_trainer)

963

In [8]:
# Drop articles with no category at all. The explicit .copy() makes df_trainer
# an independent frame, so adding columns to it later does not write into a
# filtered slice of df1 (the test-set cell further down does the same).
df_trainer = df_trainer[df_trainer["category"].notnull()].copy()

In [9]:
# Rows remaining after also dropping articles without a category.
len(df_trainer)

947

In [10]:
# Class balance after merging the sparse categories and removing unusable rows.
df_trainer["category"].value_counts()

sport          176
politics       153
arts            98
health          96
lifestyle       95
crime           73
society         70
education       38
business        35
realestate      30
human           29
accident        28
environment     26
Name: category, dtype: int64

In [64]:
# One row per category; value_counts orders them from most to least frequent.
labels = df_trainer["category"].value_counts().reset_index()

In [65]:
# Rename positionally: the first column holds the topic name, the second
# (currently the counts) becomes 'label'.
labels.columns = ['topic', 'label']

In [66]:
# Overwrite the counts with integer ids taken from the row index
# (0 = most frequent topic).
labels['label']=labels.index

In [67]:
# Persist the topic/label lookup table. to_csv does not create missing
# folders, so make sure the output directory exists first.
os.makedirs("data", exist_ok=True)
labels.to_csv("data/topic_labels_HF.csv", index=False, header=True)

In [68]:
# Display the finished topic/label lookup table.
labels

Unnamed: 0,topic,label
0,sport,0
1,politics,1
2,arts,2
3,health,3
4,lifestyle,4
5,crime,5
6,society,6
7,education,7
8,business,8
9,realestate,9


In [54]:
# Peek at the training frame before the integer label ids are attached.
df_trainer.head()

Unnamed: 0,id,category,summary,tags,text,title
0,7300876,human,Multicultural Hub Canberra has supported the s...,"['bf-label-advertising-feature', 'story-busine...",Model Akiima was born in the small village of...,Model of success - from refugee to the runways
1,7300648,sport,Justis Huni and Paul Gallen finally went head ...,"['domestic-sports', 'top-sport']",This was poetic Justis at its finest. For all...,Poetic Justis: Huni demolishes Gallen
2,7300577,environment,"Mr Bowen declared ""this is a solar panel, don'...","['news', 'subscriber-only', 'federal-politics'...","It has taken four years, but Labor's Chris Bo...","'This is a solar panel, don't be afraid': Labo..."
3,7300512,arts,"Now, most celebrities fly by private jet with ...","['books', 'signpost-review']",Remember when flying was glamorous and ocean ...,Glamorous travel of yesteryear
4,7300496,sport,Tahlia Tupaea will remain in Canberra next sea...,"['capitals', 'basketball', 'signpost-subscribe...",The second youngest debutant in WNBL history ...,Tupaea locked in for her return to the capital


In [73]:
# Topic name -> integer id lookup, built from the labels table.
mapping = dict(labels[['topic', 'label']].values)
# df_trainer comes from boolean filtering of df1. assign() returns a new frame
# with the extra column instead of writing into that filtered slice, which is
# what triggers pandas' SettingWithCopyWarning.
df_trainer = df_trainer.assign(label=df_trainer["category"].map(mapping))
df_trainer.head()

Unnamed: 0,id,category,summary,tags,text,title,label
0,7300876,human,Multicultural Hub Canberra has supported the s...,"['bf-label-advertising-feature', 'story-busine...",Model Akiima was born in the small village of...,Model of success - from refugee to the runways,10
1,7300648,sport,Justis Huni and Paul Gallen finally went head ...,"['domestic-sports', 'top-sport']",This was poetic Justis at its finest. For all...,Poetic Justis: Huni demolishes Gallen,0
2,7300577,environment,"Mr Bowen declared ""this is a solar panel, don'...","['news', 'subscriber-only', 'federal-politics'...","It has taken four years, but Labor's Chris Bo...","'This is a solar panel, don't be afraid': Labo...",12
3,7300512,arts,"Now, most celebrities fly by private jet with ...","['books', 'signpost-review']",Remember when flying was glamorous and ocean ...,Glamorous travel of yesteryear,2
4,7300496,sport,Tahlia Tupaea will remain in Canberra next sea...,"['capitals', 'basketball', 'signpost-subscribe...",The second youngest debutant in WNBL history ...,Tupaea locked in for her return to the capital,0


In [98]:
# Show the complete topic -> id dictionary.
mapping

{'sport': 0,
 'politics': 1,
 'arts': 2,
 'health': 3,
 'lifestyle': 4,
 'crime': 5,
 'society': 6,
 'education': 7,
 'business': 8,
 'realestate': 9,
 'human': 10,
 'accident': 11,
 'environment': 12}

In [74]:
# Keep only the article id, label id and body text, exposed as idx / label / sentence.
trainset = (
    df_trainer[['id', 'label', 'text']]
    .rename(columns={'id': 'idx', 'text': 'sentence'})
)
trainset.head()

Unnamed: 0,idx,label,sentence
0,7300876,10,Model Akiima was born in the small village of...
1,7300648,0,This was poetic Justis at its finest. For all...
2,7300577,12,"It has taken four years, but Labor's Chris Bo..."
3,7300512,2,Remember when flying was glamorous and ocean ...
4,7300496,0,The second youngest debutant in WNBL history ...


In [75]:
# Hold out the rows whose index label lies between 0 and 60 (inclusive) for validation.
# NOTE(review): .loc slices by index label, not by position. trainset keeps the
# index it had before the "ignore"/missing rows were filtered out, so this may
# select fewer than 61 rows. The rows are also neither shuffled nor stratified —
# confirm every class is represented in the validation split.
validation = trainset.loc[0:60]

In [76]:
# Every row from index label 61 onward is used for training.
training = trainset.loc[61:]

In [77]:
# Sanity check: the row counts of the two splits must add up to trainset.
# Asserting it means a broken split stops the notebook instead of relying on
# someone comparing two printed numbers.
assert len(validation) + len(training) == len(trainset), \
    "validation and training row counts do not add up to trainset"
len(validation) + len(training)

947

In [78]:
# Total rows in trainset, for comparison with the sum of the two splits.
len(trainset)

947

In [79]:
# Write both splits as CSV with a header row and without the pandas index.
# to_csv does not create missing folders, so make sure data/ exists first.
os.makedirs("data", exist_ok=True)
training.to_csv("data/training_for_HF.csv", index=False, header=True)
validation.to_csv("data/validation_for_HF.csv", index=False, header=True)

# Separate Test Data

We use the original data set (not from the Canberra Times) as an independent test set. It goes through the same category collapsing and label mapping as the training data.


In [80]:
# S3 location of the independent test stories.
stories = "s3://funnybones/rural/topics/stories.csv"
# Load them the same way as the training data.
df2 = pd.read_csv(stories)

In [81]:
# Apply the same category merges that were applied to the training data, so
# both sets share one label vocabulary. One dict-based replace does the work of
# six separate full-column passes; values that are not keys, including missing
# categories, pass through unchanged.
df2["category"] = df2["category"].replace({
    "religion": "society",
    "weather": "lifestyle",
    "military": "politics",
    "labour": "politics",
    "transport": "politics",
    "science": "education",
})

In [87]:
# Keep only test articles that have a category; .copy() gives an independent
# frame so a column can be added to it later.
# NOTE(review): unlike the training data, rows tagged "ignore" are not removed
# here — confirm the test file contains none.
test_data = df2[ df2["category"].notnull() ].copy()


In [88]:
# Number of test articles that have a category.
len(test_data)

83

In [89]:
# Class balance of the test set.
test_data['category'].value_counts()

sport          32
health          9
politics        9
environment     9
business        8
crime           6
lifestyle       3
arts            2
accident        2
human           2
society         1
Name: category, dtype: int64

In [94]:
# Preview the test frame.
# NOTE(review): the saved output already shows a `label` column, which is only
# created in a later cell — the cells appear to have been run out of order.
# Confirm the notebook still works with Restart & Run All.
test_data.head()

Unnamed: 0,id,category,summary,tags,text,title,label
0,7172663,business,"Star of the South has reached an ""important mi...",['news'],"Star of the South has reached an ""important m...",A SHINING STAR - Company reveals route from Ba...,8
1,7172704,politics,The Port of Sale precinct could be transformed...,"['news', 'community']",THE Port of Sale precinct could be transforme...,Grand plan for port,1
2,7172716,health,Wellington Shire residents aged over 70 will b...,['news'],WELLINGTON Shire residents aged over 70 will ...,The over 70s cohort can soon book to get their...,3
3,7172725,lifestyle,Anyone thinking about holidaying at some of th...,"['news', 'community']",ANYONE thinking about holidaying at some of t...,Local visitor boom is expected during looming ...,4
4,7172737,environment,"Star of the South, Australia's proposed first ...",['news'],"STAR of the South, Australia's proposed first...",Route is announced for offshore wind project,12


In [91]:
# Reuse the topic -> id mapping built from the training data so both sets
# share one label encoding.
test_data['label'] = test_data['category'].map(mapping)
# Series.map yields NaN for any category that is not a key of `mapping`.
# Fail here rather than write unlabeled rows into the test file.
unmapped_topics = test_data.loc[test_data['label'].isna(), 'category'].unique()
assert len(unmapped_topics) == 0, \
    f"test categories without a label id: {list(unmapped_topics)}"

In [92]:
# Keep only the article id, label id and body text, exposed as idx / label / sentence.
testset = (
    test_data[['id', 'label', 'text']]
    .rename(columns={'id': 'idx', 'text': 'sentence'})
)

In [93]:
# Preview the final test set layout (idx, label, sentence).
testset.head()

Unnamed: 0,idx,label,sentence
0,7172663,8,"Star of the South has reached an ""important m..."
1,7172704,1,THE Port of Sale precinct could be transforme...
2,7172716,3,WELLINGTON Shire residents aged over 70 will ...
3,7172725,4,ANYONE thinking about holidaying at some of t...
4,7172737,12,"STAR of the South, Australia's proposed first..."


In [95]:
# Write the test set as CSV with a header row and without the pandas index.
# to_csv does not create missing folders, so make sure data/ exists first.
os.makedirs("data", exist_ok=True)
testset.to_csv("data/test_for_HF.csv", index=False, header=True)
