In [20]:
import os 
import pandas as pd 
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/joe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/joe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/joe/nltk_data...


True

In [9]:
os.chdir('/home/joe/Documents/schoolwork/soda502/class2_20220830')

comments = pd.read_csv('black names.csv')
gender = pd.read_csv('Gendered Names Data.csv')
data = pd.merge(left=comments, right =gender, on = 'name')
data = data.dropna(subset=['comments'])
data = data.reset_index()

In [18]:
def comment_to_word_tokens(comment):
    comment_text = BeautifulSoup(comment,'lxml').get_text()
    letters_only = re.sub("[^a-zA-Z_]", " ", comment_text)
    words = letters_only.split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]
    lmtzr = WordNetLemmatizer()
    lemmatized_words = [lmtzr.lemmatize(w) for w in meaningful_words]
    return(" ".join(lemmatized_words))

In [21]:
num_names = data['comments'].size
clean_comments = []

for i in range(0, num_names): 
    if((i + 1) % 500 == 0):
       print('done with another 500 comments') 
    clean_comments.append(comment_to_word_tokens((data["comments"][i])))



done with another 500 comments
done with another 500 comments
done with another 500 comments


In [22]:
name_list = list(data.name)
tfidf_vectorizer = TfidfVectorizer(analyzer = "word", stop_words= name_list, 
                                   max_features=1000, ngram_range=(1,1))

In [23]:
tfidf = tfidf_vectorizer.fit_transform(clean_comments)

lda = LatentDirichletAllocation(n_components = 10, max_iter=5, 
                                learning_offset= 50., random_state=(0))
lda.fit(tfidf) 

tf_feature_names = tfidf_vectorizer.get_feature_names()



In [26]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        print(", ".join([feature_names[i] 
                         for i in topic.argsort()[:-n_top_words -1: -1]]))
        
print_top_words(lda, tf_feature_names, 50)

Topic 0:
supposed, spelled, rugged, ghetto, powerful, respect, happen, handsome, awesome, sorry, nice, male, anyone, feel, sound, name, find, masculine, boy, charlie, kat, jo, much, know, enough, set, prefer, bree, changing, also, combo, someday, complete, france, ugh, torus, spelling, reminds, way, terrible, taste, natural, idea, hideous, suggest, pretty, look, put, better, variation
Topic 1:
spelling, prefer, like, name, pretty, pronounced, really, original, le, joke, love, common, gender, confused, way, uh, natural, come, realize, since, lay, reason, see, la, black, look, either, better, supposed, hair, mine, pronunciation, female, fact, though, tell, pronounce, nice, make, much, seems, think, actually, little, soft, prettier, ah, strange, beautiful, boring
Topic 2:
obvious, boring, fat, combo, disagree, syllable, incredibly, combination, sense, two, someone, prefer, sound, first, great, much, pretty, like, name, charlie, kat, jo, rugged, enough, ghetto, set, bree, changing, someday

In [27]:
nmf = NMF(n_components=10, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
nmf = NMF(n_components=2, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)

print_top_words(nmf, tf_feature_names, 50)



Topic 0:
name, like, think, sound, love, girl, people, really, nickname, boy, would, one, named, pretty, beautiful, also, know, nice, middle, always, daughter, make, first, little, though, old, never, cute, much, child, son, well, good, great, used, say, called, strong, call, use, even, get, meaning, popular, hate, common, masculine, mean, friend, lot
Topic 1:
spelling, like, look, spell, name, spelled, prefer, way, better, pronounced, much, pretty, people, pronounce, make, wrong, spelt, different, love, original, variant, variation, instead, prettier, best, want, think, though, ee, really, feminine, always, end, use, tacky, misspelled, trashy, parent, pronunciation, misspelling, right, form, seems, ugly, actually, dislike, personally, correct, nicer, least




In [28]:
topics = nmf.transform(tfidf)
topic_df = pd.DataFrame(topics)
data = data.join(topic_df)

corr = data.corr()

