# Exploring natural language processing on Reddit comments
The most important part of this notebook is the extraction of product names and types from the appropriate Reddit comment. I looked into various ways of topic modeling, but because people often don't write out product names and sometimes the names are misspelled, I really had to hand-hold the process.

In [2]:
import pandas as pd
import numpy as np
import re

## Drop duplicate rows and posts with no comments

In [3]:
# Load the posts together with the comment text that goes with each of them.
curly_df = pd.read_csv('matched_posts.csv')

# Drop duplicates and deleted comments.
# keep=False discards *every* row whose sub_id occurs more than once, not just the extras.
curly_df = curly_df.drop_duplicates(subset='sub_id', keep=False)
curly_df = curly_df.dropna(subset=['comm_text'])
curly_df = curly_df[curly_df['comm_text'] != '[deleted]']
# The placeholder is '[removed]' -- the previous literal had a stray closing bracket,
# so these rows were never filtered out.
curly_df = curly_df[curly_df['comm_text'] != '[removed]']

# Renumber the rows 0..n-1 so labels and positions agree after the filtering above.
curly_df.index = range(len(curly_df))

In [4]:
def _strip_noise(comment):
    """Return one comment lowercased, with links, digits, the listed symbols and apostrophes removed."""
    without_links = re.sub(r'http\S+', '', comment)
    without_digits = re.sub(r'([0-9]+?)', '', without_links).lower()
    without_symbols = re.sub(
        r'(!|#|\$|%|\(|\)|\*|\+|,|-|\.|/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|`|{|\||}|~)+',
        "",
        without_digits,
    )
    return re.sub(r"(')+", "", without_symbols)


text = [_strip_noise(raw_comment) for raw_comment in curly_df['comm_text']]

# Collapse each run of two or more whitespace characters into a single space and trim the ends
curly_df['comm_text'] = [re.sub(r'\s+\s', ' ', stripped).strip() for stripped in text]

In [6]:
# Load the product catalogue. The first two columns arrive as 'Unnamed: 0' / 'Unnamed: 1';
# give them meaningful names and discard the three remaining unnamed columns.
products = pd.read_csv('./curly_products.csv', encoding='utf8')
products = products.rename(columns={"Unnamed: 0": "product", "Unnamed: 1": "type"})
products = products.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])


def _normalize_name(name):
    """Lowercase a product name and strip every digit (0-9) and apostrophe from it.

    The digit class used to be [0-1], which left the digits 2-9 in place. The Reddit
    comment text has all digits removed, so names keeping those digits could never match.
    """
    return re.sub(r"(')+", "", re.sub(r'[0-9]+', '', name).lower())


products['product'] = [_normalize_name(name) for name in products['product']]

# The brand is the first whitespace-separated token of the product name; underscores
# inside that token are turned into spaces (e.g. 'shea_moisture' -> 'shea moisture').
# The token is already lowercase and free of digits and apostrophes at this point.
products['brand'] = [tokens[0] for tokens in products['product'].str.split()]
products['brand'] = products['brand'].str.replace('_', ' ')

In [11]:
# Distinct product types listed in the catalogue (missing values excluded)
catalogue_types = products['type'].dropna().unique()

# Additional spellings and type keywords to look for besides the catalogue's own types
extra_type_keywords = [
    'nopoo', 'no poo', 'no-poo',
    'lopoo', 'lo poo', 'lo-poo',
    'leavein', 'leave in',
    'cowash', 'co wash', 'co-wash',
    'creme',
    'lowpoo', 'low-poo', 'low poo',
    'condition', 'deep conditioner',
    'styler', 'styling', 'milk', 'mask',
]

types_of_products = np.append(catalogue_types, extra_type_keywords)
print(types_of_products)

['shampoo' 'conditioner' 'leave-in' 'deep treatment' 'protein' 'cream'
 'gel' 'mousse' 'serum' 'clarifying shampoo' 'nopoo' 'no poo' 'no-poo'
 'lopoo' 'lo poo' 'lo-poo' 'leavein' 'leave in' 'cowash' 'co wash'
 'co-wash' 'creme' 'lowpoo' 'low-poo' 'low poo' 'condition'
 'deep conditioner' 'styler' 'styling' 'milk' 'mask']


In [None]:
# Build one whole-word pattern per catalogue row up front instead of once per comment.
# re.escape keeps brand text containing regex metacharacters from being read as a pattern.
brand_patterns = [
    re.compile(r'\b' + re.escape(str(brand)) + r'\b') for brand in products['brand']
]
# (product name, product type) pair for each catalogue row, aligned with brand_patterns
catalogue_entries = list(zip((str(name) for name in products['product']), products['type']))

products_used = []
for comment in curly_df['comm_text']:
    comment = str(comment)
    matched = [
        entry
        for pattern, entry in zip(brand_patterns, catalogue_entries)
        if pattern.search(comment)
    ]
    # dict.fromkeys removes repeated (product, type) pairs while keeping first-seen order
    products_used.append(list(dict.fromkeys(matched)))

# Assign the whole column in one step. Writing through curly_df['products_used'].iloc[i]
# is chained assignment, which pandas may apply to a temporary copy instead of the frame.
curly_df['products_used'] = pd.Series(products_used, index=curly_df.index, dtype=object)
    

In [9]:
# Spot-check: products matched for the second-to-last row
second_to_last_products = curly_df['products_used'].iloc[-2]
second_to_last_products

[('shea_moisture curl and shine shampoo', 'shampoo'),
 ('shea_moisture_coconut and hibiscus curl & shine conditioner',
  'conditioner'),
 ('shea_moisture_coconut and hibiscus curl and style milk', 'cream'),
 ('shea_moisture curl enhancing smoothie', 'cream'),
 ('cantu_moisturizing curl activator cream', 'cream'),
 ('shea_moisture_coconut & hibiscus curl & shine shampoo', 'shampoo')]

In [10]:
# Save the products extracted from each comment describing the poster's routine to a file
curly_df.to_csv('dataframe_with_products.csv')

# Preview the first ten rows. A notebook only renders the last expression of a cell,
# so the preview has to come after the save to be displayed at all.
curly_df.head(10)

## Topic modeling via LDA using spaCy and gensim

In [None]:
# Remove English stop words from every comment before topic modeling

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
stop_lookup = set(stop)  # set membership is O(1); `stop` itself stays a list

# Filter word by word and join back into a single string. Iterating over the string
# directly walks its characters, which deletes single-letter stop words such as
# 'a', 'i', 's', 't' from inside words and leaves a list of characters behind.
curly_df['comm_text'] = curly_df['comm_text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)

In [None]:
# The English model has to be installed once per environment before it can be loaded:
#! python -m spacy download en_core_web_sm
import spacy  # not imported anywhere earlier in the notebook

nlp = spacy.load('en_core_web_sm')

In [None]:
# Run every comment through the spaCy pipeline; nlp.pipe takes the whole list at once
# and list() materialises the results. %time reports how long the parsing takes.
texts = curly_df['comm_text'].tolist()
%time spacy_docs = list(nlp.pipe(texts))

In [None]:
# One list of lowercased lemmas per parsed comment, skipping spaCy stop words and tokens
# of one or two characters. Iterate over the parsed documents (spacy_docs), not over
# the `nlp` pipeline object, which is not a collection of documents.
docs = [
    [t.lemma_.lower() for t in doc if len(t.orth_) > 2 and not t.is_stop]
    for doc in spacy_docs
]

In [None]:
from gensim.models import Phrases

# Fit the phrase model on the tokenised comments (min_count=10)
bigram = Phrases(docs, min_count=10)
tokens = []

for doc_tokens in docs:
    # bigrams can be recognized by the "_" that joins the individual words
    detected = [candidate for candidate in bigram[doc_tokens] if '_' in candidate]
    # Add the bigrams to the document alongside its original single-word tokens
    doc_tokens.extend(detected)
    tokens.extend(detected)

print(list(set(tokens)))

In [None]:
from gensim.corpora import Dictionary

# Build the token <-> integer id mapping from the tokenised documents
dictionary = Dictionary(docs)
vocab_size_raw = len(dictionary)
print('Number of unique words in original documents:', vocab_size_raw)

# Prune the vocabulary in place using the no_below=3 / no_above=0.25 thresholds
dictionary.filter_extremes(no_below=3, no_above=0.25)
vocab_size_filtered = len(dictionary)
print('Number of unique words after removing rare and common words:', vocab_size_filtered)

# Bag-of-words form of the third document (index 2)
third_doc_bow = dictionary.doc2bow(docs[2])
print("Example representation of document 3:", third_doc_bow)

In [None]:
# Convert every document to its bag-of-words representation
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
from gensim.models import LdaModel

# Fit a 10-topic LDA model on the bag-of-words corpus (3 passes, chunks of 500 documents).
# random_state=1 fixes the seed; %time reports how long training takes.
%time model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, chunksize=500, passes=3, random_state=1)

In [None]:
# Print every topic returned by print_topics(), numbered from 1 instead of 0
for topic_id, top_words in model.print_topics():
    print(topic_id + 1, ":", top_words, '\n')

In [None]:
import warnings

import pyLDAvis
try:
    # Newer pyLDAvis releases ship the gensim adapter as `pyLDAvis.gensim_models`
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older releases only provide it under the original module name
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
# Silence DeprecationWarnings raised while the visualisation is prepared
warnings.filterwarnings("ignore", category=DeprecationWarning)

# sort_topics=False keeps pyLDAvis from reordering the topics
# NOTE(review): presumably wanted so the numbering matches the printed topic list -- confirm
gensimvis.prepare(model, corpus, dictionary, sort_topics=False)