In [194]:
# import basic libraries 
import pandas as pd
from bs4 import BeautifulSoup # Text Cleaning
import re, string # Regular Expressions, String
from nltk.corpus import stopwords # stopwords
from nltk.stem.porter import PorterStemmer # for word stemming
from nltk.stem import WordNetLemmatizer # for word lemmatization
import unicodedata
import html
import re

# read csv
df = pd.read_csv('../csv/data_clean.csv')
df_prep = df.copy()

In [195]:
# columns we want in the df
df_prep = df_prep[['channel_name', 'user', 'real_name', 'participant', 'text',
       'text_length']]

In [196]:
df_prep.head()

Unnamed: 0,channel_name,user,real_name,participant,text,text_length
0,general,U01S79YDELR,Karina Condeixa,student,Hang told me to add it in education,35
1,general,U01S79YDELR,Karina Condeixa,student,What improved my score was adding metrics of a...,98
2,general,U01RRV4JX6Z,Francisco Ebeling,student,I feel like a slave to this dumb Resume Worded...,261
3,general,U01S79YDELR,Karina Condeixa,student,"Francisco, we have to remove the fancy/beautif...",76
4,general,U01RRV4JX6Z,Francisco Ebeling,student,"Ah, ok!",7


### Preprocessing

In [197]:
# change column dtype into string
df_prep['text'] = df_prep['text'].astype(str)

# change column to list
text_list = df_prep['text'].tolist()

# # set of stopwords to be removed from text
# stop = set(stopwords.words('english'))

# # update stopwords to have punctuation too
# stop.update(list(string.punctuation))


# utility function for preprocessing the texts
def clean_text(text_list):
    
    # Remove unwanted html characters
    re1 = re.compile(r'  +')
    x1 = text_list.lower().replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
    'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
    '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
    ' @-@ ', '-').replace('\\', ' \\ ')
    text = re1.sub(' ', html.unescape(x1))
    
    # remove non-ascii characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # strip html
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    
    # remove between square brackets
    text = re.sub('\[[^]]*\]', '', text)
    
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # remove twitter tags
    text = text.replace("<@>", "")
    
    # remove hashtags
    text = text.replace("#", "")
    
    # remove all non-alphabetic characters
    text = re.sub(r'[^a-zA-Z ]', '', text)
    
#     # remove stopwords from text
#     final_text = []
#     for word in text.split():
#         if word.strip().lower() not in stop:
#             final_text.append(word.strip().lower())
    
#     text = " ".join(final_text)
    
    # lemmatize words
#     lemmatizer = WordNetLemmatizer()    
#     text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
#     text = " ".join([lemmatizer.lemmatize(word, pos = 'v') for word in text.split()])
    
    # replace all numbers with "num"
    text = re.sub("\d", "num", text)
    
    return text.lower()

In [198]:
# apply cleaning function
df_prep['prep_text'] = df_prep['text'].apply(clean_text)
df_prep.head(100)

Unnamed: 0,channel_name,user,real_name,participant,text,text_length,prep_text
0,general,U01S79YDELR,Karina Condeixa,student,Hang told me to add it in education,35,hang told me to add it in education
1,general,U01S79YDELR,Karina Condeixa,student,What improved my score was adding metrics of a...,98,what improved my score was adding metrics of a...
2,general,U01RRV4JX6Z,Francisco Ebeling,student,I feel like a slave to this dumb Resume Worded...,261,i feel like a slave to this dumb resume worded...
3,general,U01S79YDELR,Karina Condeixa,student,"Francisco, we have to remove the fancy/beautif...",76,francisco we have to remove the fancybeautiful...
4,general,U01RRV4JX6Z,Francisco Ebeling,student,"Ah, ok!",7,ah ok
...,...,...,...,...,...,...,...
95,general,U01RN7BVD1C,Josephine Biedermann,student,I think today is normal retro at 9\nThe whole ...,90,i think today is normal retro at the whole sta...
96,general,U01S7BM4N81,Thamo,student,Thank you phine :blush:,23,thank you phine blush
97,general,U01RN7BVD1C,Josephine Biedermann,student,:slightly_smiling_face: I hope I'm correct tho...,109,slightlysmilingface i hope im correct though b...
98,general,U01SJKB2MG8,Florian Titze,teacher,"You are correct, Phine!",23,you are correct phine


In [199]:
# Function to remove emoji.
def emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

df_prep['prep_text'] = df_prep['prep_text'].apply(remove_emoji)

In [200]:
# save to excel
df_sent.to_excel(r'../csv/data_clean_topic_modelling.xlsx', index = False)
# save to csv
df_sent.to_csv(r'../csv/data_clean_topic_modelling.csv', index = False)

## Topic Modelling
Use TF-IDF Vectorization to create a vectorized document term matrix

### Preprocessing

In [201]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [202]:
tfidf = TfidfVectorizer(max_df=0.75, min_df=2, stop_words='english')


In [203]:
dtm = tfidf.fit_transform(df_prep['prep_text'].apply(lambda x: np.str_(x)))

In [204]:
df_prep['prep_text']

0                     hang told me to add it in education
1       what improved my score was adding metrics of a...
2       i feel like a slave to this dumb resume worded...
3       francisco we have to remove the fancybeautiful...
4                                                   ah ok
                              ...                        
4935                                 just saw this thanks
4936    ill have a think need to develop a strategy to...
4937                        heeeellpppanyone ursrend eyes
4938          i will have a look after the presentation v
4939    i quite like the kaggle explanations been read...
Name: prep_text, Length: 4940, dtype: object

### Non-negative Matrix Factorization

Using Scikit-Learn create an instance of NMF with 10 expected components.

In [205]:
from sklearn.decomposition import NMF

In [206]:
nmf_model = NMF(n_components=10,random_state=42)

In [207]:
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=10, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

### 15 most common words for each of the 10 topics

In [208]:
len(tfidf.get_feature_names())

3112

In [209]:
import random

In [210]:
for i in range(20):
    random_word_id = random.randint(0,3301)
    print(tfidf.get_feature_names()[random_word_id])

options
aim
staying
rename
standard
passionate
enabled
dependent
reach


IndexError: list index out of range

In [211]:
single_topic = nmf_model.components_[0]

In [212]:
# Top 15 words for this topic:
single_topic.argsort()[-15:]
top_word_indices = single_topic.argsort()[-15:]

In [213]:
for index in top_word_indices:
    print(tfidf.get_feature_names()[index])

jup
ones
difference
apparently
lucky
run
zero
interpolate
turn
staying
dont
beginning
null
values
nan


In [214]:
# let's view all the 10 topics found.
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['jup', 'ones', 'difference', 'apparently', 'lucky', 'run', 'zero', 'interpolate', 'turn', 'staying', 'dont', 'beginning', 'null', 'values', 'nan']


THE TOP 15 WORDS FOR TOPIC #1
['learn', 'meet', 'specific', 'room', 'sense', 'file', 'later', 'python', 'using', 'did', 'uskcldf', 'disappointed', 'ah', 'thats', 'yes']


THE TOP 15 WORDS FOR TOPIC #2
['worked', 'francisco', 'trick', 'urnbvdc', 'kosta', 'clarification', 'works', 'ill', 'cool', 'sian', 'flo', 'heart', 'alex', 'thamo', 'thanks']


THE TOP 15 WORDS FOR TOPIC #3
['figure', 'collab', 'zoom', 'worked', 'slot', 'awesome', 'looks', 'works', 'idea', 'sam', 'feedback', 'great', 'got', 'cool', 'slightlysmilingface']


THE TOP 15 WORDS FOR TOPIC #4
['sure', 'right', 'want', 'work', 'lab', 'use', 'time', 'know', 'dont', 'need', 'did', 'data', 'think', 'im', 'just']


THE TOP 15 WORDS FOR TOPIC #5
['easier', 'hope', 'thats', 'problem', 'github', 'want', 'today', 'look', 'thamo', 'haha', 'thx', 'got', 'ahhh

### Attaching Discovered Topic Labels to Original Articles


In [215]:
dtm

<4940x3112 sparse matrix of type '<class 'numpy.float64'>'
	with 26890 stored elements in Compressed Sparse Row format>

In [216]:
dtm.shape

(4940, 3112)

In [217]:
len(df_prep)

4940

In [218]:
topic_results = nmf_model.transform(dtm)

In [219]:
topic_results.shape

(4940, 10)

In [220]:
df_prep['text_topic'] = topic_results.argmax(axis=1)

In [221]:
df_prep.head(10)

Unnamed: 0,channel_name,user,real_name,participant,text,text_length,prep_text,text_topic
0,general,U01S79YDELR,Karina Condeixa,student,Hang told me to add it in education,35,hang told me to add it in education,4
1,general,U01S79YDELR,Karina Condeixa,student,What improved my score was adding metrics of a...,98,what improved my score was adding metrics of a...,4
2,general,U01RRV4JX6Z,Francisco Ebeling,student,I feel like a slave to this dumb Resume Worded...,261,i feel like a slave to this dumb resume worded...,9
3,general,U01S79YDELR,Karina Condeixa,student,"Francisco, we have to remove the fancy/beautif...",76,francisco we have to remove the fancybeautiful...,4
4,general,U01RRV4JX6Z,Francisco Ebeling,student,"Ah, ok!",7,ah ok,5
5,general,U01S79YDELR,Karina Condeixa,student,"I just copied each session. without any table,...",99,i just copied each session without any table j...,9
6,general,U01RRV4JX6Z,Francisco Ebeling,student,"Yeah, I did this. It has improved, but not qui...",70,yeah i did this it has improved but not quiet ...,4
7,general,U01RRV4JX6Z,Francisco Ebeling,student,Did you put career mid level?,29,did you put career mid level,4
8,general,U01S79YDELR,Karina Condeixa,student,take a look in each feature you need most to i...,101,take a look in each feature you need most to i...,4
9,general,U01S79YDELR,Karina Condeixa,student,"I put first as junior, later as mid. I had bet...",67,i put first as junior later as mid i had bette...,4
