In [9]:
import random
import re

import pandas as pd
import spacy

In [3]:
# Load the articles from the local JSON file into a DataFrame
# (the path is relative to the notebook's working directory).
df = pd.read_json('gq_output.json')

In [4]:
# Summarise the frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7495 entries, 0 to 7494
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      7491 non-null   datetime64[ns]
 1   topic     7491 non-null   object        
 2   author    7441 non-null   object        
 3   title     7476 non-null   object        
 4   bodytext  7495 non-null   object        
 5   h2text    7495 non-null   object        
dtypes: datetime64[ns](1), object(5)
memory usage: 351.5+ KB


In [None]:
# Preview the first 20 rows.
df.head(20)

In [6]:
# Count articles per topic label (rows with a missing topic are not counted).
df['topic'].value_counts()

topic
Style                    3776
Grooming                 1207
Wellness                  924
Fitness                   649
Shopping                  417
Culture                   162
Sex and Relationships     147
GQ Style                   49
Sponsor Content            48
Lifestyle                  42
GQ Sports                  34
Women                      15
Travel & Eats              10
Watches                     9
shopping                    1
Coronavirus                 1
Name: count, dtype: int64

In [None]:
# Inspect up to 20 'Sponsor Content' articles before deciding what to do with them.
is_sponsored = df['topic'] == 'Sponsor Content'
df.loc[is_sponsored].head(20)

In [32]:
# Drop sponsored articles. Rows with a missing topic are retained by this
# comparison. .copy() makes the result an independent frame, so assigning to
# its columns afterwards does not raise SettingWithCopyWarning.
df = df[df['topic'] != 'Sponsor Content'].copy()

In [33]:
# Re-count topics to confirm 'Sponsor Content' no longer appears.
df['topic'].value_counts()

topic
Style                    3776
Grooming                 1207
Wellness                  924
Fitness                   649
Shopping                  417
Culture                   162
Sex and Relationships     147
GQ Style                   49
Lifestyle                  42
GQ Sports                  34
Women                      15
Travel & Eats              10
Watches                     9
shopping                    1
Coronavirus                 1
Name: count, dtype: int64

In [40]:
# Boilerplate disclaimer text to strip out of the article bodies.
GQ_DISCLAIMER = "All products featured on GQ are independently selected by our editors. However, we may receive compensation from retailers and/or from purchases of products through these links."


def _strip_disclaimer(paragraphs):
    """Return ``paragraphs`` with the disclaimer removed from every string item.

    Values that are not lists are returned unchanged, and non-string items
    inside a list are passed through untouched.
    """
    if not isinstance(paragraphs, list):
        return paragraphs
    cleaned = []
    for item in paragraphs:
        if isinstance(item, str):
            item = item.replace(GQ_DISCLAIMER, "")
        cleaned.append(item)
    return cleaned


df['bodytext'] = df['bodytext'].apply(_strip_disclaimer)

In [56]:
def _join_paragraphs(value):
    """Collapse a list of text fragments into one space-separated string.

    Anything that is not a list is converted with ``str``.
    """
    if isinstance(value, list):
        return " ".join(value)
    return str(value)


# Build the training dataset: keep only the 'bodytext' column (copied, so the
# source frame is left untouched) and flatten each article to a single string.
# NOTE(review): the subheading column 'h2text' is not included here — confirm
# that is intended.
df_textonly = df.loc[:, ['bodytext']].copy()
df_textonly['bodytext'] = df_textonly['bodytext'].map(_join_paragraphs)

In [57]:
# Drop rows whose 'bodytext' is empty or whitespace-only.
# .copy() makes the filtered result an independent frame, so assigning to its
# columns afterwards does not raise SettingWithCopyWarning.
has_text = df_textonly['bodytext'].str.strip().astype(bool)
df_textonly = df_textonly[has_text].copy()

In [58]:
# Preview the first rows of the joined article text.
df_textonly.head()

Unnamed: 0,bodytext
0,Few lifestyle choices come with as much cultur...
1,for the summer takes serious commitment. But ...
2,Every morning for the past three weeks I have...
3,"It is 2024, and we're talking about macronutr..."
4,"Rich, 33, has always admired butts—on everyon..."


In [None]:
# Confirm that the empty rows were removed by inspecting the remaining entry count.
df_textonly.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6674 entries, 0 to 7494
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   bodytext  6674 non-null   object
dtypes: object(1)
memory usage: 104.3+ KB


In [62]:
# Strip leading whitespace from every article's text.
body_lstripped = df_textonly['bodytext'].str.lstrip()
df_textonly['bodytext'] = body_lstripped

In [None]:
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space.
# Uses the vectorised .str accessor rather than a per-row Python lambda.
df_textonly['bodytext'] = df_textonly['bodytext'].str.replace(r'\s+', ' ', regex=True)

In [67]:
# Delete any whitespace that sits directly before a comma or a period.
_SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.])')
df_textonly['bodytext'] = df_textonly['bodytext'].map(
    lambda text: _SPACE_BEFORE_PUNCT.sub(r'\1', text)
)

In [None]:
# Save the cleaned article text (no index, no header row) for training in Colab.
# NOTE(review): to_csv applies CSV quoting by default, so rows containing commas
# or quotes are presumably written wrapped in double quotes — confirm the
# training script expects that format.
df_textonly.to_csv('df_textonly.txt', index=False, header=False)

### NLP with spaCy

In [70]:
# Turn the article bodies into a plain Python list (one entry per article).
texts = df['bodytext'].to_list()

# Peek at a random entry. random.randint includes both endpoints, so the upper
# bound must be len(texts) - 1 to stay inside the list.
texts[random.randint(0, len(texts) - 1)]

['',
 'There’s just nobody doing it like New Balance in 2025. The ',
 ' and their ',
 ' have cooked up ',
 '. One such reliable partner? ',
 '. The legendary Italian sportswear brand has been a perfect match for New Balance since the pair first joined forces back in 2021. Their work has spanned a bevy of bodacious silhouettes, but this latest installment is arguably their best yet.',
 'Next week, Stone Island’s take on the New Balance 272 hits stores. The 272 is a skate shoe that’s been in New Balance’s lineup for a minute—a pared-down no-frills sneaker with a vulcanized rubber sole and simple color paneling. It’s pure functionality that happens to look good without trying too hard.',
 'New Balance',
 'Stone Island',
 'Stone Island’s gorpy, functional aesthetic injects the 272 with rocket fuel. The silhouette’s paneling splits the upper in half, using subtle two-tone contrast coloring in three renditions: off-white, blue, and yellow, each made of a combination of canvas and suede. The 

In [71]:
# Load spaCy's "en_core_web_sm" English pipeline.
nlp = spacy.load("en_core_web_sm")

# Pick one article at random, glue its fragments together with spaces and run
# the pipeline on it. (Assign a fixed value to random_index, e.g. 100, to
# analyse a specific article instead.)
random_index = random.randint(0, len(texts) - 1)
article_text = " ".join(texts[random_index])
docs = nlp(article_text)

In [73]:
# List each named entity found in the article: its text, character offsets
# (start, end) and entity label.
for ent in docs.ents:
    fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*fields)

96 percent 168 178 PERCENT
Fortune 500 182 193 PRODUCT
Wolverine 291 300 ORG
one 648 651 CARDINAL
100,000 percent 726 741 PERCENT
every minute 754 766 TIME
18 1003 1005 CARDINAL
intel 1433 1438 ORG
first 1941 1946 ORDINAL
first 3368 3373 ORDINAL
more than three seconds 3396 3419 TIME
tonight 3652 3659 TIME
SEXY 5415 5419 ORG
Reese 5644 5649 NORP
un 5710 5712 ORG


In [80]:
# Sanity check: display the type of df_textonly.
type(df_textonly)

pandas.core.frame.DataFrame