In [1]:
import csv
import pandas as pd
import re
import string
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer
from sklearn.decomposition import LatentDirichletAllocation, NMF

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
df = pd.read_csv('genshintwittermaintenanceone.csv')
df = df.drop(columns = ['Unnamed: 0', 'outlinks', 'tcooutlinks'])

alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

def cleanurlhashtags(input):

    text = re.sub(r"http\S+", "", input)
    text = re.sub(r"#\S+", "", text)
    return text

def tostring(input):
    return str(input)

df['content'] = df['content'].apply(tostring)
df['content'] = df['content'].apply(cleanurlhashtags)
df['content'] = df['content'].map(alphanumeric).map(punc_lower)
df['content'] = df['content'].fillna(value='')

content = df['content'].to_list()

tfidf = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)

tweet_word_matrix = tfidf.fit_transform(content)

vocab = tfidf.get_feature_names()

nmf = NMF(n_components=10)

nmf.fit(tweet_word_matrix)

NMF(n_components=10)

In [12]:
tweet_topic_matrix = nmf.transform(tweet_word_matrix)
tweet_topic_matrix_df = pd.DataFrame(tweet_topic_matrix).add_prefix('topic_')
tweet_topic_matrix_df['content'] = df['content']

word_topic_matrix_df = pd.DataFrame(nmf.components_, columns=vocab).T.add_prefix('topic_')

def top_words(word_topic_matrix_df, topic, n_words):
    return (word_topic_matrix_df
            .sort_values(by=topic, ascending=False)
            .head(n_words))[topic]

top_words(word_topic_matrix_df, 'topic_9', 10)

finally      2.507410
got          0.907942
star         0.207944
mona         0.151787
finished     0.128381
good         0.102744
ar           0.093589
home         0.091338
character    0.085605
took         0.069324
Name: topic_9, dtype: float64

Number : Topic  
0 : Youtube Streaming impact-genshin-youtube  
1 : Twitch Streaming live-going-twitch  
2 : General Appreciation just-like-game  
3 : Featured Rare Characters childe-zhongli-tartaglia  
4 : Reddit Links genshinimpact-genshinmemepact-genshin   
5 : Character Appreciation love-kaeya-man  
6 : General Streaming come-say-hi  
7 : Successful Gacha got-cute-klee  
8 : Running Gag paimon-emergency-food  
9 : Successful Featured Gacha finally-got-mona