In [1]:
import pandas as pd

df = pd.read_csv('data/youtube_metadata.csv',delimiter='\t',encoding='utf-8')
df=df.dropna(subset='subtitle')
subtitle=df['subtitle'].tolist()


In [2]:
from umap import UMAP
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words="english")
umap_model = UMAP(n_neighbors=15, 
                  n_components=3, 
                  min_dist=0.0, 
                  metric='cosine', 
                  random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples = 10, metric='euclidean', prediction_data=True)
kmeans_model = KMeans(n_clusters=15)
model = BERTopic(vectorizer_model=vectorizer_model,language='english',umap_model=umap_model,hdbscan_model=kmeans_model)
topics = model.fit_transform(subtitle)

In [5]:
freq = model.get_topic_info()

In [6]:
freq

Unnamed: 0,Topic,Count,Name
0,0,31,0_insulin_glucose_cells_glp1
1,1,27,1_metformin_medication_diabetes_medications
2,2,26,2_like_youre_thats_just
3,3,25,3_grams_sugar_blood_diet
4,4,25,4_diabetes_really_just_im
5,5,24,5_diabetes_people_type_prediabetes
6,6,22,6_insulin_cells_diabetes_type
7,7,21,7_like_know_just_im
8,8,21,8_diabetes_uh_cell_pressure
9,9,20,9_glucose_hemoglobin_a1c_blood


In [7]:
model.get_document_info(subtitle)

Unnamed: 0,Document,Topic,Name,Top_n_words,Representative_document
0,history hey guys so i might be whispering but ...,7,7_like_know_just_im,like - know - just - im - dont - right - reall...,False
1,"When you have diabetes, all fruits are the “fo...",3,3_grams_sugar_blood_diet,grams - sugar - blood - diet - carb - carbohyd...,False
2,in our practice we have an opportunity to trea...,5,5_diabetes_people_type_prediabetes,diabetes - people - type - prediabetes - exerc...,False
3,now we're gonna talk about carb counting and m...,3,3_grams_sugar_blood_diet,grams - sugar - blood - diet - carb - carbohyd...,True
4,common symptoms of diabetes include being very...,8,8_diabetes_uh_cell_pressure,diabetes - uh - cell - pressure - blood - wate...,False
...,...,...,...,...,...
305,polydipsia is excessive thirst or excessive fl...,8,8_diabetes_uh_cell_pressure,diabetes - uh - cell - pressure - blood - wate...,False
306,[Music] people living with type 1 diabetes mus...,3,3_grams_sugar_blood_diet,grams - sugar - blood - diet - carb - carbohyd...,False
307,hello I'm Pam Blackmore dietitian at Joslin Di...,3,3_grams_sugar_blood_diet,grams - sugar - blood - diet - carb - carbohyd...,False
308,">> READING FOODLABELS CAN BE CONFUSING, BUT YO...",3,3_grams_sugar_blood_diet,grams - sugar - blood - diet - carb - carbohyd...,False


In [8]:
model.generate_topic_labels()

['0_insulin_glucose_cells',
 '1_metformin_medication_diabetes',
 '2_like_youre_thats',
 '3_grams_sugar_blood',
 '4_diabetes_really_just',
 '5_diabetes_people_type',
 '6_insulin_cells_diabetes',
 '7_like_know_just',
 '8_diabetes_uh_cell',
 '9_glucose_hemoglobin_a1c',
 '10_pen_needle_insulin',
 '11_pain_neuropathy_nerve',
 '12_blood_sugar_diarrhea',
 '13_skin_like_really',
 '14_acting_insulin_hours']

In [9]:
words=model.get_topic(2) 

for word in words:
    print(word[0],end=' ')

like youre thats just know okay im going dont type 

In [10]:
model.visualize_hierarchy()

In [11]:
model.visualize_barchart(top_n_topics=19)

In [20]:
model.visualize_heatmap( )

In [15]:
med_info=[]
i=1
for topic in model.topics_:
    if topic==2 or topic==3 or topic==7 or topic==13 or topic==14:
        med_info.append("Low Medical Info")
    else:
        med_info.append("High Medical Info")
    i=i+1

In [16]:
df['Medical_info']=med_info

In [17]:
df.to_csv("data/raw_data.csv", sep='\t', encoding='utf-8',index=False)