In [1]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd

In [2]:
df = pd.read_csv(r"C:\Users\johna\anaconda3\envs\twitter-analytics-env\twitter_issues_dashboard\twitter_issues_dashboard\data\01_raw\tweets_details2023-03-15_20-43-36.csv")

In [16]:
def clean_text(text):
    # convert input to str
    text = str(text)
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove ASCII characters
    text = re.sub(r'[^\x00-\x7f]', '', text)
    
    # Remove @ symbols and # symbols
    text = re.sub(r'[@#]\w+', '', text)
    
    # Remove variations of "royal mail"
    text = re.sub(r'royal\s*mail', '', text, flags=re.IGNORECASE)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join the cleaned tokens back into a string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text


# Apply the clean_text function to the 'text' column in your DataFrame
df['cleaned_text'] = df['text'].apply(clean_text)


In [13]:
df.head(5)

Unnamed: 0,tweet_id,creation_date,text,media_url,video_url,language,favorite_count,retweet_count,reply_count,quote_count,...,user.profile_banner_url,user.description,user.external_url,user.number_of_tweets,user.bot,user.timestamp,user.has_nft_avatar,user,detail,cleaned_text
0,7.485231e+17,Thu Jun 30 14:26:32 +0000 2016,#royalmail offers Compulsory new “Chargeable” ...,,,en,0.0,0.0,0.0,0.0,...,https://pbs.twimg.com/profile_banners/29701699...,Selro provides all the tools to start and grow...,http://www.selro.com,1036.0,False,1239193000.0,False,,,offer Compulsory new Chargeable IntegratorCont...
1,7.484731e+17,Thu Jun 30 11:07:57 +0000 2016,Great #friends realize your #love of all thing...,,,en,1.0,0.0,0.0,0.0,...,https://pbs.twimg.com/profile_banners/14168888...,,http://our-labour-of-love.blogspot.com/,1973.0,False,1368150000.0,False,,,Great friend realize love thing written Love g...
2,7.485466e+17,Thu Jun 30 15:59:48 +0000 2016,My #dilevery of #easyskinz with #royalmail #ip...,,,en,1.0,0.0,0.0,0.0,...,https://pbs.twimg.com/profile_banners/20078001...,instagram: thimothy,,1117.0,False,1286693000.0,False,,,dilevery easyskinz iphone6 mehappy blackmamba ...
3,7.485004e+17,Thu Jun 30 12:56:24 +0000 2016,#royalmail ridiculous rule not being able to s...,,,en,0.0,0.0,0.0,0.0,...,https://pbs.twimg.com/profile_banners/24632482...,Consumer rights activist— holding companies to...,,16688.0,False,1398434000.0,False,,,ridiculous rule able send perfume etc post eve...
4,7.485809e+17,Thu Jun 30 18:16:08 +0000 2016,📮 #RoyalMail MarketReach Report Dispels #Direc...,['https://pbs.twimg.com/media/CmL0Z4qWIAAJDDe....,,en,1.0,0.0,0.0,0.0,...,https://pbs.twimg.com/profile_banners/30078697...,#UK based #printing company. https://t...,http://www.printmr.co.uk,9769.0,False,1305716000.0,False,,,MarketReach Report Dispels DirectMail Engageme...


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Create a CountVectorizer object to convert the cleaned text into a bag-of-words representation
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])

# Train the LDA model
lda_model = LatentDirichletAllocation(n_components=200, random_state=42)
lda_model.fit(X)

# Get the feature names from the vectorizer
feature_names = vectorizer.get_feature_names_out()

topic_keywords = []

# Print the top 10 words for each topic
for topic_idx, topic in enumerate(lda_model.components_):
    print("Topic %d:" % (topic_idx))
    top_features = [feature_names[i] for i in topic.argsort()[:-11:-1]]
    print(", ".join(top_features))
    top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
    topic_keywords.append(', '.join(top_words))

# add topic keywords to dataframe
df['topic_keywords'] = [topic_keywords[i] for i in lda_model.transform(vectorizer.transform(df['cleaned_text'])).argmax(axis=1)]



Topic 0:
postcard, snailmail, postcrossing, post, pillarbox, mail, ii, england, postbox, via
Topic 1:
woman, visit, low, premium, amp, share, rmgl, livelihood, shipment, basic
Topic 2:
eu, law, charged, refund, breaking, vat, record, service, serious, seller
Topic 3:
buy, usps, sell, stock, rmg, share, ftse, trading, investing, amp
Topic 4:
safe, place, secure, post, charles, keep, see, delivery, uk, everyone
Topic 5:
edinburgh, force, self, daily, parcel, postoffice, post, employed, onto, tweet
Topic 6:
detail, account, explain, asking, help, caught, please, parcel, genuine, ebay
Topic 7:
trip, wondering, bus, post, parcel, today, round, mail, im, mile
Topic 8:
state, post, scams, private, money, yep, wondered, finished, year, paid
Topic 9:
Topic 10:
water, energy, amp, barcoded, labour, nhs, rail, privatisation, nationalise, uklabour
Topic 11:
covid19, coronavirus, staff, crisis, test, uk, nhs, people, delivery, post
Topic 12:
ripped, york, get, brother, today, open, like, amp, someo

In [23]:
df

Unnamed: 0,tweet_id,creation_date,text,media_url,video_url,language,favorite_count,retweet_count,reply_count,quote_count,...,user.description,user.external_url,user.number_of_tweets,user.bot,user.timestamp,user.has_nft_avatar,user,detail,cleaned_text,topic_keywords
0,7.485231e+17,Thu Jun 30 14:26:32 +0000 2016,#royalmail offers Compulsory new “Chargeable” ...,,,en,0.0,0.0,0.0,0.0,...,Selro provides all the tools to start and grow...,http://www.selro.com,1036.0,False,1.239193e+09,False,,,offer Compulsory new Chargeable IntegratorCont...,"stamp, philately, postage, postagestamps, stam..."
1,7.484731e+17,Thu Jun 30 11:07:57 +0000 2016,Great #friends realize your #love of all thing...,,,en,1.0,0.0,0.0,0.0,...,,http://our-labour-of-love.blogspot.com/,1973.0,False,1.368150e+09,False,,,Great friend realize love thing written Love g...,"postbox, letter, post, box, letterbox, santa, ..."
2,7.485466e+17,Thu Jun 30 15:59:48 +0000 2016,My #dilevery of #easyskinz with #royalmail #ip...,,,en,1.0,0.0,0.0,0.0,...,instagram: thimothy,,1117.0,False,1.286693e+09,False,,,dilevery easyskinz iphone6 mehappy blackmamba ...,"post, extra, strikes, postalstrike, shopping, ..."
3,7.485004e+17,Thu Jun 30 12:56:24 +0000 2016,#royalmail ridiculous rule not being able to s...,,,en,0.0,0.0,0.0,0.0,...,Consumer rights activist— holding companies to...,,16688.0,False,1.398434e+09,False,,,ridiculous rule able send perfume etc post eve...,"parcel, help, item, delivery, lost, day, deliv..."
4,7.485809e+17,Thu Jun 30 18:16:08 +0000 2016,📮 #RoyalMail MarketReach Report Dispels #Direc...,['https://pbs.twimg.com/media/CmL0Z4qWIAAJDDe....,,en,1.0,0.0,0.0,0.0,...,#UK based #printing company. https://t...,http://www.printmr.co.uk,9769.0,False,1.305716e+09,False,,,MarketReach Report Dispels DirectMail Engageme...,"share, price, low, profit, company, executive,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31541,1.635522e+18,Tue Mar 14 06:02:30 +0000 2023,Good morning! We’ll be open from 7am for all o...,['https://pbs.twimg.com/media/FrKKYKHX0AA16KB....,,en,9.0,3.0,1.0,0.0,...,"Convenience Store & Post Office Selling 🗞, 🍞,🍏...",http://www.facebook.com/boscombeeastpo,10678.0,False,1.349427e+09,False,,,Good morning Well open 7am Post Office need Am...,"office, post, postoffice, local, sorting, queu..."
31542,1.635658e+18,Tue Mar 14 15:02:41 +0000 2023,Royaume-Uni : Royal Mail Émet Pour La Première...,,,fr,0.0,0.0,0.0,0.0,...,Le Matinal est la principale plate-forme média...,https://lematinal.media/,12238.0,False,1.622906e+09,False,,,RoyaumeUni met Pour La Premire Fois Des Timbre...,"worker, postal, pay, condition, band, posties,..."
31543,1.635549e+18,Tue Mar 14 07:51:07 +0000 2023,"Yes, also think trying to find dentist accepti...",,,en,0.0,0.0,0.0,0.0,...,MrStevePassmoor01/08/09 MumOf4&GrandmaTo1 From...,,1818.0,False,1.625620e+09,False,,,Yes also think trying find dentist accepting N...,"fee, amp, charge, custom, pay, refund, lie, co..."
31544,1.635681e+18,Tue Mar 14 16:36:46 +0000 2023,#Postiepics #postie #postman #royalmail #myroy...,['https://pbs.twimg.com/media/FrMbjLHWYAAPNSP....,,qme,0.0,0.0,0.0,0.0,...,Pretty good. Not bad. I can't complain.,https://lagniapperecords.bandcamp.com/,5744.0,False,1.300571e+09,False,,,Postiepics postie postman myround Cornwall Kernow,"postman, postie, cornwall, postiepics, kernow,..."
