<a href="https://colab.research.google.com/github/kleczekr/dtc/blob/main/dtc_count.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## A part-of-speech (POS) counter for the database listing the content of the Days to Come internet magazine

The code below shows an easy way of obtaining detailed part-of-speech (POS) data for articles from the Days to Come internet magazine, which is hosted by the tour marketplace TourRadar.

In [None]:
import ast
from collections import Counter

import pandas as pd
import spacy

In [None]:
# Load the spaCy pipeline once; spacify() uses it to parse every paragraph.
nlp = spacy.load("en_core_web_sm")

In [None]:
# The 'categories' and 'tags' cells hold Python literals stored as text
# (presumably lists -- confirm against the CSV). They are parsed with
# ast.literal_eval instead of eval: literal_eval accepts only literals, so a
# crafted CSV cell cannot run arbitrary code when the file is loaded.
df = pd.read_csv('path_to_file',
                 converters={'categories': ast.literal_eval,
                             'tags': ast.literal_eval})

### Creating a separate column holding the spaCy object for each paragraph

In [None]:
# Placeholder column; spacify() overwrites it row by row with the parsed text.
df["spacy"] = 0

In [None]:
def spacify(df_):
  """Return a copy of one DataFrame row with its text parsed by spaCy.

  Intended for ``df.apply(spacify, axis=1)``: despite its name, ``df_`` is a
  single row (a pandas Series), not a whole DataFrame.

  Args:
    df_: row with a 'paragraph_text' entry holding the text to parse.

  Returns:
    A new Series equal to ``df_`` except that its 'spacy' entry holds the
    result of ``nlp(df_['paragraph_text'])``.
  """
  # pandas does not support functions that mutate the object apply() passes
  # in, so the result is written into a copy rather than the caller's row.
  row = df_.copy()
  row['spacy'] = nlp(df_['paragraph_text'])
  return row

In [None]:
# Parse every paragraph: spacify is called once per row of the frame.
df = df.apply(spacify, axis="columns")

### Creating a list populated by POS counts for each of the paragraphs from the DataFrame

In [None]:
# Accumulator: receives one Counter of POS statistics per paragraph.
pos_list = list()

In [None]:
# Rebuild the list from scratch so that re-running this cell cannot append a
# second set of rows, which would misalign pos_df with df in the later concat.
pos_list = []
for paragraph in df.spacy:
  poscount = Counter()
  for sentence in paragraph.sents:
    poscount['sentence_count'] += 1
    for token in sentence:
      # count parts of speech
      poscount[token.pos_] += 1
      poscount['token_count'] += 1
  # the three most frequent words of the paragraph, ignoring stop words,
  # punctuation and whitespace tokens; stored as (word, count) pairs
  words = [token.text for token in paragraph
           if not (token.is_stop or token.is_punct or token.is_space)]
  poscount['repeat_words'] = Counter(words).most_common(3)
  pos_list.append(poscount)

### Converting list to a DataFrame and joining it with the preexisting DataFrame

In [None]:
# One row per paragraph; a key missing from a paragraph's Counter becomes 0 instead of NaN.
pos_df = pd.DataFrame(data=pos_list).fillna(value=0)

In [None]:
# Both indexes are reset first so the two frames are joined by row position.
df_pos = pd.concat(
    [df.reset_index(drop=True), pos_df.reset_index(drop=True)],
    axis="columns",
)

In [None]:
# Many paragraphs are still essentially empty; keep only the rows whose
# text is longer than one character.
df_pos = df_pos.loc[df_pos["paragraph_text"].map(len).gt(1)]