# Keyword Extraction Using LDA Algorithm

In [53]:

#*****************************************************************#
#Cleaning the Data ----Preprocessing
#*****************************************************************#

import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
#LDA Module which can Resolve the Error 
#import gensim 
#from gensim import corpora
#import pyLDAvis
#import pyLDAvis.gensim
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter

import nltk
nltk.download('wordnet')

# Load the Excel file into a pandas dataframe
df = pd.read_excel('dataset.xlsx')
#print(df)
# Remove irrelevant columns
df = df[['category', 'username', 'captions', 'hashtags']]

# Handle missing values
df.dropna(inplace=True)

# Remove punctuation and special characters
df['captions'] = df['captions'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
df['hashtags'] = df['hashtags'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# Define the pattern to match numeric values
pattern = r'\d+'

# Remove numeric values from captions column
df['captions'] = df['captions'].apply(lambda x: re.sub(pattern, '', x))

# Remove numeric values from hashtags column
df['hashtags'] = df['hashtags'].apply(lambda x: re.sub(pattern, '', x))

# Convert text to lowercase
df['captions'] = df['captions'].apply(lambda x: x.lower())
df['hashtags'] = df['hashtags'].apply(lambda x: x.lower())

# Remove stop words
stop_words = set(stopwords.words('english'))
df['captions'] = df['captions'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['hashtags'] = df['hashtags'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
#print(df)
# Lemmatize the text
lemmatizer = WordNetLemmatizer()
df['captions'] = df['captions'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
df['hashtags'] = df['hashtags'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Create a corpus for each page separately
corpora = []
pages = df['username'].unique()
for page in pages:
    page_df = df[df['username'] == page]
    page_corpus = page_df['captions'] + ' ' + page_df['hashtags']
    corpora.append(page_corpus)

#WordtoVec vector representation
#Clustering technique 

#*****************************************************************#
#Performing Topic Modelling ---LDA to get the keywords per page 
#*****************************************************************#
    
# Apply LDA per page
num_topics =  8 # Specify the number of topics to extract for each page
keyword_results = []
for corpus in corpora:
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()

    lda_model = LatentDirichletAllocation(n_components=num_topics)
    lda_matrix = lda_model.fit_transform(tfidf_matrix)
    # print(lda_matrix)
    top_keywords = []
    for topic_idx, topic in enumerate(lda_model.components_):
        # print(lda_model.components_)
        # print("--------------------Topicidx")
        # print(topic_idx)
        topic_keywords = [feature_names[i] for i in topic.argsort()[:-6:-1]]  # Get top 5 keywords
        top_keywords.append(topic_keywords)
        # top_keywords.append((topic_keywords, lda_model.transform(tfidf_matrix)))

    keyword_results.append(top_keywords)
# print(lda_model.components_)
# print (keyword_results)

#*****************************************************************#
#Performing Topic Modelling ---to get the keywords per Category 
#*****************************************************************#
    
category_keywords = {}
for i, page in enumerate(pages):
    category = df.loc[df['username'] == page, 'category'].iloc[0]
    if category not in category_keywords:
        category_keywords[category] = []
    category_keywords[category].extend(keyword_results[i])

# Flatten the keywords list
category_keywords_flat = {category: [keyword for sublist in keywords for keyword in sublist]
                          for category, keywords in category_keywords.items()}

# Perform second iteration of topic modeling for each category
category_topics = {}
for category, keywords in category_keywords_flat.items():
    keyword_counts = Counter(keywords)
    top_keywords = keyword_counts.most_common(num_topics)
    category_topics[category] = [keyword for keyword, count in top_keywords]

# Print the overall topics for each category
for category, topics in category_topics.items():
    print(f"\nOverall topics for category \033[1m '{category}'\033[0m:")
    for i, topic in enumerate(topics):
        #print(f"Topic {i+1}: {', '.join(topic)}")
        print(f"Topic {i+1}:",topic)
        #print(topic)
    #print()


# Calculate the score of each keyword
category_keyword_scores = {}
keyword_frequency_dict={}
for category, keywords in category_topics.items():
    keyword_scores = {}
    sum_keyword_scores = 0
    for keyword in keywords:
        keyword_score = 0
        for i, page_corpus in enumerate(corpora):
            # Calculate the frequency of the keyword in the page corpus
            keyword_frequency = page_corpus.str.count(keyword).sum()
            #print(pages)
            #print(keyword_frequency)

             # Add the keyword frequency to the keyword frequency dictionary
            page_name = pages[i]
            if page_name not in keyword_frequency_dict:
                keyword_frequency_dict[page_name] = {}
            keyword_frequency_dict[page_name][keyword] = keyword_frequency
            #print(keyword_frequency_dict[page_name])
            # Calculate the total number of words in the page corpus
            total_words = len(' '.join(page_corpus.tolist()).split())
            # Calculate the frequency of the keyword normalized by the total number of words
            #normalized_frequency = keyword_frequency / total_words
            #print(lda_matrix.shape[0])
            # Add the normalized frequency of the keyword to the keyword score for the page
            #keyword_score += normalized_frequency * lda_matrix[i][np.argmax(lda_matrix[i])]
        # Calculate the average keyword score across all pages
        #keyword_score /= len(corpora)
        #keyword_scores[keyword] = keyword_score
        #print()
        #sum_keyword_scores += keyword_score
    # Normalize the score of each keyword
    #for keyword in keyword_scores:
        #keyword_scores[keyword] /= sum_keyword_scores
    #category_keyword_scores[category] = keyword_scores

# Print the scores of the top keywords for each category
#for category, keyword_scores in category_keyword_scores.items():
    #print(f"Scores of the top keywords for category '{category}':")
    #for i, (keyword, score) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)[:num_topics]):
        #print(f"Keyword {i+1}: '{keyword}', Score: {score:.3f}")
    #print()
    
    
#print("---------------------------")
#print(category_topics.items())    
#print("---------------------------")
#print("---------------------------")

#for i in pages:
    #print(keyword_frequency_dict[i])
    
#print("---------------------------Category Topics-------------------------")
   
#print(category_topics)    

#print("---------------------------Pages/keywords/Frequency-------------------------")

#print(keyword_frequency_dict.items())    


#*****************************************************************#
#Creating Excel FIle for passing the data to our Learning Model
#*****************************************************************#

# create an empty list to store the data
data_list = []

# loop through the pages and keywords to extract the data
for page_name, keyword_frequency_dict in keyword_frequency_dict.items():
    # find the category for the current page
    category = None
    for category_name, pages in category_topics.items():
        #print("***********************")
        #print(category_name)
        #print(pages)
        #if page_name in pages:
            category = df.loc[df['username'] == page_name, 'category'].iloc[0]   
            #break
    
    # extract the keywords and frequencies for the page
    keywords = [keyword for keyword, _ in keyword_frequency_dict.items()]
    frequencies = [frequency for _, frequency in keyword_frequency_dict.items()]
    
    # append the data to the list
    data_list.append((page_name, *frequencies,category))

# create a dataframe from the list
df = pd.DataFrame(data_list, columns=["username", *keywords,"category"])

# write the dataframe to an excel file
df.to_excel("trainingset.xlsx", index=False)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Overall topics for category [1m 'food'[0m:
Topic 1: chicken
Topic 2: islamabad
Topic 3: burger
Topic 4: order
Topic 5: islamabadfoodblog
Topic 6: foodie
Topic 7: biryani
Topic 8: cake

Overall topics for category [1m 'GiftShop'[0m:
Topic 1: gift
Topic 2: pakistan
Topic 3: one
Topic 4: day
Topic 5: onlineshopping
Topic 6: world
Topic 7: eid
Topic 8: customized

Overall topics for category [1m 'Clothing'[0m:
Topic 1: winter
Topic 2: online
Topic 3: instores
Topic 4: day
Topic 5: style
Topic 6: fashion
Topic 7: elegance
Topic 8: adorned

Overall topics for category [1m 'Beauty'[0m:
Topic 1: makeup
Topic 2: makeupartist
Topic 3: skin
Topic 4: beauty
Topic 5: look
Topic 6: stunning
Topic 7: lash
Topic 8: winteroutfit

Overall topics for category [1m 'Fitness'[0m:
Topic 1: fitness
Topic 2: workout
Topic 3: exercise
Topic 4: sadiaariffitness
Topic 5: winteroutfit
Topic 6: outfitideas
Topic 7: classy
Topic 8: competition


# Cleaning of Testing File and Creating Testing File for Model

In [54]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter


# Load the Excel file into a pandas dataframe
df = pd.read_excel('testingset.xlsx')

# Remove irrelevant columns
df = df[['username', 'captions', 'hashtags']]

# Handle missing values by replacing NaN with an empty string
df['captions'] = df['captions'].fillna('').astype(str)
df['hashtags'] = df['hashtags'].fillna('').astype(str)

# Remove punctuation and special characters
df['captions'] = df['captions'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
df['hashtags'] = df['hashtags'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# Convert text to lowercase
df['captions'] = df['captions'].apply(lambda x: x.lower())
df['hashtags'] = df['hashtags'].apply(lambda x: x.lower())

# # Load the Excel file into a pandas dataframe
# df = pd.read_excel('testingset.xlsx')
# #print(df)
# # Remove irrelevant columns
# df = df[['username', 'captions', 'hashtags']]

# # Handle missing values
# #df.dropna(inplace=True)

# # Remove punctuation and special characters
# df['captions'] = df['captions'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
# df['hashtags'] = df['hashtags'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# # Convert text to lowercase
# df['captions'] = df['captions'].apply(lambda x: x.lower())
# df['hashtags'] = df['hashtags'].apply(lambda x: x.lower())

# Remove stop words
stop_words = set(stopwords.words('english'))
df['captions'] = df['captions'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['hashtags'] = df['hashtags'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Lemmatize the text
lemmatizer = WordNetLemmatizer()
df['captions'] = df['captions'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
df['hashtags'] = df['hashtags'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Create a corpus for each page separately
corpora = []
pages = df['username'].unique()
#print(pages)
for page in pages:
    page_df = df[df['username'] == page]
    page_corpus = page_df['captions'] + ' ' + page_df['hashtags']
    corpora.append(page_corpus)
#print(corpora)



#**********************************************#
# Get the list of column names from the subset
#**********************************************#
df = pd.read_excel('trainingset.xlsx')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
subset_df = df.iloc[:,1 :-1]
#print(subset_df)
column_names = subset_df.columns.tolist()

#print(column_names)
column_dict = {"column_names": column_names}
# print(column_dict)
# Print the column names for each username
# for username, columns in column_dict.items():
#     print(f"Columns for username '{username}': {columns}")
#**********************************************#
#Frequency of the Words Features present in Testing FIle #
#**********************************************#

#print(column_dict.items())

column_frequency_dict={}

for user,keywords in column_dict.items():
    keyword_scores = {}
    sum_keyword_scores = 0
    for keyword in keywords:
        keyword_score = 0
        for i, page_corpus in enumerate(corpora):
            # Calculate the frequency of the keyword in the page corpus
            keyword_frequency = page_corpus.str.count(keyword).sum()
            #print(pages)
            #print(keyword_frequency)

             # Add the keyword frequency to the keyword frequency dictionary
            page_name = pages[i]
            if page_name not in column_frequency_dict:
                column_frequency_dict[page_name] = {}
            column_frequency_dict[page_name][keyword] = keyword_frequency
            #print(column_frequency_dict[page_name])


#**********************************************#
#Forming Testing File to Test the Model #
#**********************************************#
           
# Get the list of unique usernames
usernames = df['username'].unique()
    

# Convert the frequency dictionary into a DataFrame
frequency_df = pd.DataFrame.from_dict(column_frequency_dict)

# Transpose the DataFrame so that the usernames are in the rows and the keywords are in the columns
frequency_df = frequency_df.transpose()

# Add a column for the usernames
frequency_df.insert(0, 'username', frequency_df.index)

# Reset the index
frequency_df = frequency_df.reset_index(drop=True)

# Write the DataFrame to an Excel file
frequency_df.to_excel('modeltesting.xlsx', index=False)
print("\033[1m Excel File Created for Testing the Model to pass in the Model to Test\033[0m")

[1m Excel File Created for Testing the Model to pass in the Model to Test[0m


# Training the Model on Random Forest

In [55]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load the data from the CSV file
data = pd.read_excel('trainingset.xlsx')

# Split the data into input features and labels
X = data.iloc[:, 1:-1]
y = data.iloc[:, -1]

# Create a random forest classifier object
rf = RandomForestClassifier(n_estimators=30, random_state=42)

# Train the random forest model on the data
rf.fit(X, y)

# Save the trained model
joblib.dump(rf, 'random_forest_model.joblib')


['random_forest_model.joblib']

# Testing the Model on Random Forest

In [56]:
import pandas as pd
import joblib
# Load the model
model = joblib.load('random_forest_model.joblib')

# Load the testing data
testing_df = pd.read_excel("modeltesting.xlsx")

# Use all columns except the first as input features
features_df = testing_df.iloc[:, 1:]

#print(features_df)
# Predict the category for each row using the model
predictions = model.predict(features_df)

# Add the predicted category to the original testing dataframe
testing_df["predicted_category"] = predictions

# Save the results to a new excel file
testing_df.to_excel("randomforest_modeltesting_results.xlsx", index=False)


#  Appending the Actual Category of the Pages of Testing File in the Random Forest Model Results File to get Accuracy 

In [57]:
import pandas as pd

# Read the original DataFrame from the file
df = pd.read_excel('testingset.xlsx')

# Get the last column (actual category) and the username column
last_column = df.iloc[:, -1]
username_column = df['username']

# Create a dictionary to store unique username and corresponding actual category
username_category_dict = {}

# Iterate over the username and last column
for username, category in zip(username_column, last_column):
    if username not in username_category_dict:
        # If the username is not already in the dictionary, add it with the corresponding category
        username_category_dict[username] = category

# Read the SVMresults.xlsx file
result_df = pd.read_excel('randomforest_modeltesting_results.xlsx')


# Create a new column to store the actual category based on unique username
result_df['actualcategory'] = result_df['username'].map(username_category_dict)

# Write the updated DataFrame back to the SVMresults.xlsx file
result_df.to_excel('randomforest_modeltesting_results.xlsx', index=False)


# Calculating Performance Metric of Model Random Forest

In [59]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib

bold_start = "\033[1m"
bold_end = "\033[0m"

# Load the Excel sheet into a DataFrame
df = pd.read_excel('randomforest_modeltesting_results.xlsx')

#confusion matrix
actual_categories = result_df['actualcategory']
predicted_categories = result_df['predicted_category']
conf_matrix = confusion_matrix(actual_categories, predicted_categories)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

# Calculate overall accuracy
overall_accuracy = accuracy_score(actual_categories, predicted_categories)
print("\nOverall Accuracy:", overall_accuracy)
print("Accuracy Percentage: {:.2%}".format(overall_accuracy))

# Calculate metrics for each category
categories = df['actualcategory'].unique()
for category in categories:
    # Filter the DataFrame for the current category
    category_df = df[df['actualcategory'] == category]

    # For Multi-Class Classification to get Precision, Recall, F1_score we have to set average='weighted'
    # Calculate precision, recall, and F1 score
    precision = precision_score(y_true=category_df['actualcategory'], y_pred=category_df['predicted_category'], average='weighted')
    recall = recall_score(category_df['actualcategory'], category_df['predicted_category'], average='weighted')
    f1 = f1_score(category_df['actualcategory'], category_df['predicted_category'], average='weighted')


    # Print the metrics for the current category
    text = "Category"
    print(bold_start + text + bold_end + ":", bold_start + str(category) + bold_end)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print()


Confusion Matrix:
[[3 0 0 0 0]
 [0 1 0 0 0]
 [0 0 2 0 0]
 [0 0 0 1 0]
 [0 1 0 0 3]]

Overall Accuracy: 0.9090909090909091
Accuracy Percentage: 90.91%
[1mCategory[0m: [1mfood[0m
Precision: 1.0
Recall: 0.75
F1 Score: 0.8571428571428571

[1mCategory[0m: [1mBeauty[0m
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

[1mCategory[0m: [1mGiftShop[0m
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

[1mCategory[0m: [1mFitness[0m
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

[1mCategory[0m: [1mClothing[0m
Precision: 1.0
Recall: 1.0
F1 Score: 1.0



  _warn_prf(average, modifier, msg_start, len(result))
