In [1]:
import pandas as pd
import numpy as np

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob # sentiment analysis

from  nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# !pip install spacymoji
from spacymoji import Emoji # deal with emojis

# !pip install emot --upgrade
# https://github.com/NeelShah18/emot
import emot

## Import Data

In [3]:
# Load the pre-NLP scrape output; the path is relative to the notebook's working directory.
scrape = pd.read_csv('../data/pre_nlp_data.csv')

In [4]:
# Work on an explicit copy of the title column so the feature columns added
# later are written to `titles` only and never alias (or warn about) `scrape`.
titles = scrape.loc[:, ['title']].copy()

## NLP

### Add word length, character length attributes

In [5]:
def word_count(instr):
    """Return the number of whitespace-separated words in ``instr``.

    Parameters
    ----------
    instr : str
        Text to count words in.

    Returns
    -------
    int
        Number of words; 0 for an empty or whitespace-only string.
    """
    # str.split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so no manual double-space collapsing is
    # needed and '' / ' hi ' are no longer over-counted.
    return len(instr.split())

In [6]:
# Character length of each title.
titles['title_length'] = titles['title'].map(len)

In [7]:
# Word count of each title, via the word_count helper defined above.
titles['title_word_count'] = titles['title'].map(word_count)

### Spacy

In [8]:
# Load spaCy's small English model, then register two extra components:
# 'spacytextblob' supplies the doc._.polarity / doc._.subjectivity values and
# 'emoji' supplies the doc._.has_emoji / doc._.emoji values read in later cells.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')
# Last expression of the cell, so the returned component is echoed as output.
nlp.add_pipe("emoji")

<spacymoji.Emoji at 0x1e5f08e13d0>

In [9]:
# Run every original (pre-cleaning) title through the spaCy pipeline and keep the parsed result.
titles['docs'] = titles['title'].map(lambda text: nlp(text))

#### Add Sentiment Scores

In [10]:
# Sentiment polarity exposed on each parsed doc by the spacytextblob component.
titles['polarity'] = titles['docs'].map(lambda doc: doc._.polarity)

In [11]:
# Sentiment subjectivity exposed on each parsed doc by the spacytextblob component.
titles['subjectivity'] = titles['docs'].map(lambda doc: doc._.subjectivity)

#### Add emojis

In [12]:
# Flag exposed by the emoji component for each parsed title.
titles['has_emoji'] = titles['docs'].map(lambda doc: doc._.has_emoji)

In [13]:
# Number of entries in each doc's emoji list.
titles['number_of_emoji'] = titles['docs'].map(lambda doc: len(doc._.emoji))

In [14]:
# Keep the raw emoji list found in each title (dropped again before export).
titles['emojis'] = titles['docs'].map(lambda doc: doc._.emoji)

#### Remove Emojis

In [15]:
# !pip install clean-text
# https://www.educative.io/edpresso/how-to-remove-emoji-from-the-text-in-python
from cleantext import clean

In [16]:
# Overwrite each title with its emoji-free version.
# NOTE(review): only no_emoji is set here, so clean()'s other default
# normalizations also apply to the text used by every later cell
# (presumably including lowercasing) - confirm this is intended.
titles['title'] = titles['title'].apply(clean, no_emoji=True)

#### Text Smileys

In [17]:
# API reference: see the GitHub link next to `import emot`.
# Detector object used for the emoticon lookups in the next cell.
emot_obj = emot.core.emot()

In [18]:
# Run the emoticon detector once per title and derive all three columns from
# that single result, instead of calling emot_obj.emoticons() three times.
emoticon_info = titles['title'].map(emot_obj.emoticons)

titles['has_emoticon'] = emoticon_info.map(lambda found: found['flag'])
titles['number_of_emoticons'] = emoticon_info.map(lambda found: len(found['value']))
# Full detector output per title (dropped again before export).
titles['emoticons'] = emoticon_info

#### Parts of Speech counts

In [19]:
# Coarse POS tag of every token in each cleaned title.
# - Read the column by name rather than by position, so this keeps working if
#   the column order of `titles` changes.
# - nlp.pipe() streams the texts through the pipeline in batches, which avoids
#   the per-call overhead of invoking nlp() once per title.
pos_lst = [[token.pos_ for token in doc] for doc in nlp.pipe(titles['title'])]

In [20]:
# https://stackoverflow.com/questions/722697/best-way-to-turn-word-list-into-frequency-dict
# POS tags to count; tags that never occur in a title get a count of 0.
pos_parts = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
    'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE',
]

# One {tag: occurrences} dict per title, in the same order as pos_lst.
pos_counts = [{tag: tags.count(tag) for tag in pos_parts} for tags in pos_lst]


In [21]:
# One row per title, one column per POS tag, plus the row total appended as the last column.
pos_df = pd.DataFrame(pos_counts)
pos_df['sum_pos'] = pos_df.sum(axis=1)

In [22]:
# drop conj, space (sum_pos was computed before this, so it still includes them)
pos_df = pos_df.drop(columns=['SPACE', 'CONJ'])


In [23]:
# Share of each POS tag among all tagged tokens of a title.
# Vectorized replacement for one row-wise apply() per column.
count_cols = pos_df.columns[:-1]  # every column except the trailing sum_pos

# Titles with sum_pos == 0 must yield 0 rather than a division error: mask the
# zero totals so the division gives NaN there, then fill those with 0.
row_totals = pos_df['sum_pos'].replace(0, np.nan)
pos_perc_df = pos_df[count_cols].div(row_totals, axis=0).fillna(0)

pos_perc_df.columns = [col + '_percent' for col in count_cols]

In [24]:
# append the *_percent columns to the raw counts (index-aligned left join)
pos_df = pos_df.join(pos_perc_df, how='left')

### No Stem, No Lemma

#### TF-IDF Vectorize

In [25]:
# TF-IDF features for the cleaned titles (no stemming, no lemmatization).
# Keeps at most 200 terms, ignores English stop words and any term that
# appears in more than 60% of the titles.
tvec = TfidfVectorizer(
                       stop_words = 'english',
                       strip_accents = 'ascii',
                       max_features = 200,
                       max_df = .60,
                       norm = 'l2'
        )

# learn the vocabulary and weight the titles in a single pass
tfidf_matrix = tvec.fit_transform(titles['title'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix)
tvec_naut = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)

In [26]:
# Peek at the first ten TF-IDF feature names.
tvec_naut.columns[:10]

Index(['10', '100', '2022', 'ago', 'american', 'anon', 'art', 'away', 'baby',
       'bad'],
      dtype='object')

### Porter Stem

In [27]:
# Porter stemmer instance reused by the stemming cells below.
ps = PorterStemmer()

In [28]:
# Tokenize each cleaned title with spaCy, Porter-stem every token and
# re-join the stems with single spaces.
title_stemmed = []
for title_text in titles['title']:
    stems = [ps.stem(token.text) for token in nlp(title_text)]
    title_stemmed.append(' '.join(stems))

#### TF-IDF Vectorize

In [29]:
# TF-IDF features for the Porter-stemmed titles.
# Keeps at most 200 terms, ignores English stop words and any term that
# appears in more than 60% of the titles.
tvec = TfidfVectorizer(
                       stop_words = 'english',
                       strip_accents = 'ascii',
                       max_features = 200,
                       max_df = .60,
                       norm = 'l2'
        )

# learn the vocabulary and weight the titles in a single pass
tfidf_matrix = tvec.fit_transform(title_stemmed)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix)
tvec_stem = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)

In [30]:
# Peek at the first ten stemmed TF-IDF feature names.
tvec_stem.columns[:10]

Index(['10', '2022', 'actual', 'age', 'ago', 'alway', 'american', 'ani',
       'anim', 'anoth'],
      dtype='object')

In [31]:
# remove '10' because it could be any number due to stemming
# Reassign instead of dropping in place and tolerate a missing column: the
# vocabulary is data-dependent, and an in-place drop raises KeyError when this
# cell is re-run or when '10' did not make it into the top features.
tvec_stem = tvec_stem.drop(columns='10', errors='ignore')

### Lemmatize

In [32]:
# Replace every token of each cleaned title with its spaCy lemma and
# re-join the lemmas with single spaces.
title_lemma = []
for title_text in titles['title']:
    lemmas = [token.lemma_ for token in nlp(title_text)]
    title_lemma.append(' '.join(lemmas))

#### TF-IDF Vectorize

In [33]:
# TF-IDF features for the lemmatized titles.
# Keeps at most 200 terms, ignores English stop words and any term that
# appears in more than 60% of the titles.
tvec = TfidfVectorizer(
                       stop_words = 'english',
                       strip_accents = 'ascii',
                       max_features = 200,
                       max_df = .60,
                       norm = 'l2'
        )

# learn the vocabulary and weight the titles in a single pass
tfidf_matrix = tvec.fit_transform(title_lemma)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix)
tvec_lemma = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)

In [34]:
# Peek at the first ten lemmatized TF-IDF feature names.
tvec_lemma.columns[0:10]

Index(['10', '100', '2022', 'age', 'ago', 'american', 'anon', 'art', 'ask',
       'attack'],
      dtype='object')

**Observation:** Keep `10` and `100` here: unlike the stemmed features, they are the original values and may carry useful signal.

### Lemmatize and Stem

In [35]:
# Re-tokenize each already-lemmatized title, Porter-stem every token and
# re-join the stems with single spaces.
title_lem_stem = []
for lemma_text in title_lemma:
    stems = [ps.stem(token.text) for token in nlp(lemma_text)]
    title_lem_stem.append(' '.join(stems))

#### TF-IDF Vectorize

In [36]:
# TF-IDF features for the lemmatized-then-stemmed titles.
# Keeps at most 200 terms, ignores English stop words and any term that
# appears in more than 60% of the titles.
tvec = TfidfVectorizer(
                       stop_words = 'english',
                       strip_accents = 'ascii',
                       max_features = 200,
                       max_df = .60,
                       norm = 'l2'
        )

# learn the vocabulary and weight the titles in a single pass
tfidf_matrix = tvec.fit_transform(title_lem_stem)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix)
tvec_lem_stem = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)

In [37]:
# Peek at the first ten lemmatized-and-stemmed TF-IDF feature names.
tvec_lem_stem.columns[0:10]

Index(['10', '2022', 'actual', 'age', 'ago', 'alway', 'american', 'ani',
       'anim', 'anoth'],
      dtype='object')

In [38]:
# remove '10' because it could be any number due to stemming
# Reassign instead of dropping in place and tolerate a missing column: the
# vocabulary is data-dependent, and an in-place drop raises KeyError when this
# cell is re-run or when '10' did not make it into the top features.
tvec_lem_stem = tvec_lem_stem.drop(columns='10', errors='ignore')

### Produce output files

In [39]:
# Remove the raw text and the object-valued helper columns; only the derived features are exported.
titles = titles.drop(columns=['title', 'docs', 'emojis', 'emoticons'])

In [40]:
# Attach the POS counts and shares; the sum_pos helper total is left out.
pos_features = pos_df.drop(columns='sum_pos')
titles = titles.join(pos_features)

In [41]:
# Pair the shared title features with each of the four TF-IDF variants
# (as-is, stemmed, lemmatized, lemmatized + stemmed), in that order.
nlp_titles_naut, nlp_titles_stem, nlp_titles_lemma, nlp_titles_lem_stem = (
    titles.join(tfidf_frame)
    for tfidf_frame in (tvec_naut, tvec_stem, tvec_lemma, tvec_lem_stem)
)

#### Add back remaining attributes from scrapes

In [42]:
# Every output replaces the raw title column of the scrape with one NLP feature set.
# NOTE(review): join() raises if a feature name collides with a remaining
# scrape column - the scrape schema is not visible here, so confirm.
scrape_no_title = scrape.drop(columns='title')

scrape_nlp_naut = scrape_no_title.join(nlp_titles_naut)
scrape_nlp_stem = scrape_no_title.join(nlp_titles_stem)
scrape_nlp_lemma = scrape_no_title.join(nlp_titles_lemma)
scrape_nlp_lem_stem = scrape_no_title.join(nlp_titles_lem_stem)

### Save and Export

In [43]:
# Write each feature-set variant to its own CSV, without the index column.
for frame, out_path in (
    (scrape_nlp_naut, '../data/nlp_naut_data.csv'),
    (scrape_nlp_stem, '../data/nlp_stem_data.csv'),
    (scrape_nlp_lemma, '../data/nlp_lemma_data.csv'),
    (scrape_nlp_lem_stem, '../data/nlp_lem_stem_data.csv'),
):
    frame.to_csv(out_path, index = False)