### About the dataset
https://www.kaggle.com/datasets/kanchana1990/global-news-engagement-on-social-media?resource=download

### Follow the steps at the URL below to import the Kaggle dataset into Jupyter:
https://saturncloud.io/blog/how-to-import-kaggle-datasets-into-jupyter-notebook/#3

In [None]:
# Shell escape: download the dataset archive with the Kaggle CLI.
# NOTE(review): presumably needs Kaggle API credentials configured first - see the guide linked above.
!kaggle datasets download -d kanchana1990/global-news-engagement-on-social-media

In [None]:
# Shell escape: extract the downloaded archive into the current working directory.
!unzip global-news-engagement-on-social-media.zip

### Read the datasets

In [None]:
import pandas as pd

In [None]:
# Load each outlet's CSV export and tag every row with the outlet it came
# from, so the source stays identifiable after the frames are combined.
df_aljazeera = pd.read_csv('al_jazeera.csv').assign(channel='aljazeera')
df_bbc = pd.read_csv('bbc.csv').assign(channel='bbc')
df_cnn = pd.read_csv('cnn.csv').assign(channel='cnn')
df_reuters = pd.read_csv('reuters.csv').assign(channel='reuters')

In [None]:
# Collect the four per-channel frames so they can be concatenated in one call.
df_list = [df_aljazeera, df_bbc, df_cnn, df_reuters]

In [None]:
# Stack the per-channel frames into a single DataFrame. Every CSV is read with
# its own default 0..n-1 index, so without ignore_index=True the combined frame
# would carry duplicate index labels (df.loc[0] would match one row per channel).
df = pd.concat(df_list, ignore_index=True)

In [None]:
# Total number of rows across all channels.
len(df)

In [None]:
# Preview the first rows of the combined frame.
df.head()

### Creating the word-frequency dataset

### Extract topics from data
https://towardsdatascience.com/let-us-extract-some-topics-from-text-data-part-i-latent-dirichlet-allocation-lda-e335ee3e5fa4
https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

In [None]:
'''
Cleaning and Preprocessing of the data
'''

# importing required libraries 
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string


# compile documents
# Missing values are replaced with empty strings: the tokenising step below
# passes every entry through str(), which would turn NaN into the literal
# token "nan" and count it as a real word.
doc_complete = df['text'].fillna('').tolist()

# set of stopwords
stop = set(stopwords.words('english'))
# punctuation characters stripped from every document
exclude = set(string.punctuation)
# lemmatizer instance shared by every clean() call
lemma = WordNetLemmatizer()

def clean(doc):
    """Normalise one document for word counting and topic modelling.

    The text is lower-cased, stripped of stopwords and punctuation
    characters, and every remaining token is lemmatized.

    Relies on the module-level ``stop`` (stopword set), ``exclude``
    (punctuation characters) and ``lemma`` (lemmatizer).

    Args:
        doc: Raw document text.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # First pass: drop stopwords while punctuation is still present, so
    # stopword entries that themselves contain punctuation (contractions)
    # still match.
    stop_free = " ".join(tok for tok in doc.lower().split() if tok not in stop)
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    # Second pass: a stopword glued to punctuation ("the," or "(and") only
    # becomes recognisable after stripping, so filter again before lemmatizing.
    return " ".join(
        lemma.lemmatize(word) for word in punc_free.split() if word not in stop
    )

# Run every document through clean() and split the result into its token list.
# str() guards against non-string entries in the text column.
doc_clean = list(map(lambda raw_doc: clean(str(raw_doc)).split(), doc_complete))

# Dump the tokenised corpus for a visual sanity check.
print('\n\nCleaned Data\n\n')
print(doc_clean)

In [None]:
def flatten_extend(matrix):
    """Flatten one level of nesting.

    Args:
        matrix: Iterable of iterables (e.g. a list of token lists).

    Returns:
        A new list holding the items of every row, in their original order.
    """
    return [item for row in matrix for item in row]

In [None]:
# Merge the per-document token lists into one corpus-wide list of tokens.
flat_doc_clean = flatten_extend(doc_clean)

In [None]:
# Count how often each token occurs in the whole corpus. Counter makes a single
# pass over the tokens; calling list.count() once per token rescans the full
# list every time and is quadratic in corpus size.
from collections import Counter

# Converted back to a plain dict; keys stay in first-occurrence order.
res = dict(Counter(flat_doc_clean))

In [None]:
# Display the token -> occurrence-count mapping.
res

### LDA

In [None]:
# Importing gensim
import gensim
from gensim import corpora

In [None]:
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

In [None]:
# Build the document-term matrix: each tokenised document is converted to its
# bag-of-words form with the dictionary prepared above.
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

### Running LDA Model

In [None]:
# Alias the gensim LDA model class under a short name; no model is instantiated here.
Lda = gensim.models.ldamodel.LdaModel

In [None]:
# Train the LDA model on the document-term matrix: 100 topics, 50 passes,
# with the dictionary supplying the id -> word mapping.
ldamodel = Lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=100,
    passes=50,
)

In [None]:
# Print up to 100 topics, showing 3 words per topic.
print(ldamodel.print_topics(num_topics=100, num_words=3))