In [7]:
import pandas as pd
import spacy
import random

In [9]:
df = pd.read_json('gq_output.json')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7495 entries, 0 to 7494
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      7491 non-null   datetime64[ns]
 1   topic     7491 non-null   object        
 2   author    7441 non-null   object        
 3   title     7476 non-null   object        
 4   bodytext  7495 non-null   object        
 5   h2text    7495 non-null   object        
dtypes: datetime64[ns](1), object(5)
memory usage: 351.5+ KB


In [None]:
df.head(20)

In [6]:
df['topic'].value_counts()

topic
Style                    3776
Grooming                 1207
Wellness                  924
Fitness                   649
Shopping                  417
Culture                   162
Sex and Relationships     147
GQ Style                   49
Sponsor Content            48
Lifestyle                  42
GQ Sports                  34
Women                      15
Travel & Eats              10
Watches                     9
shopping                    1
Coronavirus                 1
Name: count, dtype: int64

In [None]:
# taking a look at Sponsor Content
df[df['topic'] == 'Sponsor Content'].head(20)

In [None]:
# remove Sponsor Content
df = df[df['topic'] != 'Sponsor Content']

In [33]:
df['topic'].value_counts()

topic
Style                    3776
Grooming                 1207
Wellness                  924
Fitness                   649
Shopping                  417
Culture                   162
Sex and Relationships     147
GQ Style                   49
Lifestyle                  42
GQ Sports                  34
Women                      15
Travel & Eats              10
Watches                     9
shopping                    1
Coronavirus                 1
Name: count, dtype: int64

In [None]:
# copilot: remove this string from 'bodytext'
df['bodytext'] = df['bodytext'].apply(
    lambda x: [s.replace(
        "All products featured on GQ are independently selected by our editors. However, we may receive compensation from retailers and/or from purchases of products through these links.", 
        ""
    ) if isinstance(s, str) else s for s in x] if isinstance(x, list) else x
)

In [12]:
# create dataset for training
# isolate strings from body and subheadings
df_textonly = df[['bodytext']].copy()

# copilot: join the list of strings in 'bodytext' into a single string per row
df_textonly['bodytext'] = df_textonly['bodytext'].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))

In [13]:
# copilot: remove rows where 'bodytext' is empty or only whitespace
df_textonly = df_textonly[df_textonly['bodytext'].str.strip().astype(bool)]

In [58]:
df_textonly.head()

Unnamed: 0,bodytext
0,Few lifestyle choices come with as much cultur...
1,for the summer takes serious commitment. But ...
2,Every morning for the past three weeks I have...
3,"It is 2024, and we're talking about macronutr..."
4,"Rich, 33, has always admired butts—on everyon..."


In [None]:
# checking that empty rows were removed
df_textonly.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6674 entries, 0 to 7494
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   bodytext  6674 non-null   object
dtypes: object(1)
memory usage: 104.3+ KB


In [14]:
# copilot: remove leading whitespace from each row in 'bodytext'
df_textonly['bodytext'] = df_textonly['bodytext'].str.lstrip()

In [15]:
# copilot: replace multiple consecutive whitespace characters with a single space in 'bodytext'

import re

df_textonly['bodytext'] = df_textonly['bodytext'].apply(lambda x: re.sub(r'\s+', ' ', x))

In [16]:
# copilot: remove whitespace before each comma and period in 'bodytext'
df_textonly['bodytext'] = df_textonly['bodytext'].apply(lambda x: re.sub(r'\s+([,.])', r'\1', x))

In [17]:
# save as text file for training in colab
df_textonly.to_csv('df_textonly.txt', index=False, header=False)

### NLP with spacy

In [19]:
# turning the article text into a list

texts = df['bodytext'].to_list()
texts[random.randint(0, len(texts))] # check a random list entry

['Remember when you could just blog your way to worldwide fame?',
 'I’m exaggerating, but only a little: between 2008 and 2014, there was a glimmering promise that anyone with a dot-blogspot URL and a friend to take decent photos of them could land in the front row of Paris Fashion Week’s most coveted shows, in major advertising campaigns, and in roles advising brands on what to sell and how to sell it. The carefully calibrated machinery of fashion world approval was suddenly threatened, and with it the years-long process of masthead ascension. ',
 ' Now, that kind of brazen dedication to extreme personal style isn’t something to be feared—it’s the industry’s defining narrative.',
 'Kenzo, the French brand run since 2010 by Opening Ceremony honchos Humberto Leon and Carol Lim, was perhaps the most emblematic fashion brand of that moment, beloved for a tiger-logo sweater. Today, ',
 ', it’s worth looking back on what, exactly, that sweater meant for the fashion world.',
 'One way to ens

In [24]:
nlp = spacy.load("en_core_web_sm")

# concatenate a specific article into one string
# docs = nlp(" ".join(texts[100]))

# concatenate a random article into one string
random_index = random.randint(0, len(texts) - 1)
print(random_index)
docs = nlp(" ".join(texts[random_index]))

7361


In [25]:
for ent in docs.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

two 253 256 CARDINAL
John Galliano’s 636 651 PERSON
Margiela 656 664 PERSON
Paris 696 701 GPE
Wednesday 705 714 DATE
Gender 731 737 PERSON
Galliano 791 799 ORG
first 820 825 ORDINAL
Margiela’s 832 842 PERSON
Lemaire 975 982 PERSON
Lanvin 987 993 PERSON
Paris 1014 1019 GPE
Wednesday 1023 1032 DATE
Margiela 1034 1042 PERSON
Galliano 1147 1155 PERSON
London 1279 1285 GPE
Wednesday 1582 1591 DATE
Galliano 1668 1676 ORG
late summer 1996 2007 DATE
Galliano 2139 2147 ORG
Galliano 2399 2407 ORG
Raf and Demna 2751 2764 ORG
Virgil 2769 2775 PERSON
Galliano 2913 2921 ORG
Galliano 2927 2935 ORG
last summer 3235 3246 DATE
Galliano 3248 3256 ORG
Paris 3300 3305 GPE
January 3309 3316 DATE
a week later 3367 3379 DATE
Wednesday 3398 3407 DATE
Galliano 3808 3816 ORG
Maison Margiela 3902 3917 PERSON
Galliano 3943 3951 ORG
this season 4055 4066 DATE
Margiela 4128 4136 PERSON
H&M 4150 4153 ORG
Zara 4158 4162 PERSON
Margiela 4266 4274 PERSON
Galliano 4326 4334 ORG


In [80]:
type(df_textonly)

pandas.core.frame.DataFrame

In [26]:
with open('df_textonly.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

In [27]:
type(text_data)

str

In [29]:
len(text_data)

32001243

In [32]:
max_length = 1000000  # spaCy's max length (text_data is too long)
chunks = [text_data[i:i+max_length] for i in range(0, len(text_data), max_length)]

docs = []
for chunk in chunks:
    docs.append(nlp(chunk))

In [None]:
# iterate through the smaller chunks
for doc in docs: 
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
# pull out full sentences only, there were too many fragments
sentences = []
for doc in docs: 
    for i in doc.sents:
        sentences.append(str(i))

In [None]:
# export for training
with open('sentences.txt', 'w', encoding='utf-8') as f:
    for sentence in sentences:
        f.write(sentence.strip() + '\n')

In [41]:
import chardet
with open('cleaned.csv', 'rb') as f:
    result = chardet.detect(f.read(1000000))
    print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [56]:
dfcleaned = pd.read_csv('cleaned.csv', encoding='utf-8')

In [57]:
dfcleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1143 non-null   object
 1   length  1143 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 18.0+ KB


In [58]:
dfcleaned.head()

Unnamed: 0,text,length
0,"Masculinity is often seen as a monolithic, hom...",179
1,Men need a good pair of shoes for a variety of...,172
2,"Nice guys are going to wear the same pants, an...",170
3,"A real man, I really can't tell you much about...",169
4,Nice guys are so focused on their looks and gr...,168


In [59]:
dfcleaned.drop_duplicates(subset=['text'], keep='first', inplace=True)

In [60]:
dfcleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1082 entries, 0 to 1142
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1082 non-null   object
 1   length  1082 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 25.4+ KB


In [62]:
dfcleaned.head()

Unnamed: 0,text,length
0,"Masculinity is often seen as a monolithic, hom...",179
1,Men need a good pair of shoes for a variety of...,172
2,"Nice guys are going to wear the same pants, an...",170
3,"A real man, I really can't tell you much about...",169
4,Nice guys are so focused on their looks and gr...,168


In [63]:
dfcleaned.to_csv('cleaned_unique.csv', index=False, encoding='utf-8', header=True)