In [1]:
import pandas as pd
import re
from collections import Counter

# assuming df is your DataFrame
def most_common_words(df, category_col, text_col, top_n=25):
    """Return the ``top_n`` most frequent words for every category.

    All texts sharing a value in ``category_col`` are concatenated,
    lower-cased and tokenised with the regex ``\\b\\w+\\b``; the tokens are
    then counted per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``category_col`` and ``text_col``.
    category_col : str
        Name of the column to group by.
    text_col : str
        Name of the column holding the text to count words in.
    top_n : int, default 25
        Number of most common words to keep per category.

    Returns
    -------
    list[tuple[object, list[tuple[str, int]]]]
        One ``(category, [(word, count), ...])`` pair per group, in the
        order produced by ``groupby`` (sorted by category key).
    """
    # Missing texts (NaN/None) would make str.join raise TypeError, so they
    # are dropped; astype(str) guards against stray non-string cells.
    joined = (
        df.groupby(category_col)[text_col]
        .apply(lambda texts: ' '.join(texts.dropna().astype(str)).lower())
        .reset_index()
    )

    # Walk the two columns directly instead of materialising a Series per
    # row with iterrows().
    top_words = []
    for category, text in zip(joined[category_col], joined[text_col]):
        words = re.findall(r'\b\w+\b', text)
        top_words.append((category, Counter(words).most_common(top_n)))

    return top_words

# Demo run: read the dataset, then print a word-frequency report per category
df = pd.read_csv("data_4.csv")
result = most_common_words(df, 'category', 'clean_text')

for category, words in result:
    # Header line followed by one "word: count" line for each ranked word
    report = [f"Category: {category}"]
    report.extend(f"{word}: {count}" for word, count in words)
    print("\n".join(report))
    print()

Category: arts
said: 537
new: 400
one: 390
also: 342
time: 328
like: 314
music: 280
year: 278
people: 278
june: 235
first: 224
day: 213
show: 206
series: 205
park: 183
many: 172
two: 166
best: 165
million: 161
would: 158
event: 158
back: 157
work: 154
state: 151
way: 150

Category: crime
said: 947
police: 626
court: 393
june: 347
also: 302
year: 300
time: 292
people: 276
law: 254
new: 243
one: 241
state: 232
two: 218
case: 197
would: 190
last: 188
county: 175
day: 174
old: 170
found: 170
may: 165
hearing: 160
death: 159
federal: 151
three: 150

Category: disaster
said: 903
fire: 584
water: 376
people: 300
year: 260
new: 240
two: 240
also: 235
one: 232
june: 225
time: 223
police: 223
air: 217
state: 214
day: 192
rescue: 177
would: 174
three: 172
city: 167
may: 166
area: 161
county: 161
first: 154
home: 144
near: 141

Category: economy
said: 497
year: 340
new: 325
company: 306
may: 264
also: 259
june: 259
per: 241
would: 221
market: 220
million: 219
time: 217
one: 211
people: 184
price: 

In [2]:
# Words that rank high in every category in the first report and therefore
# say little about any single one; they are stripped before re-counting.
removeWords = ['said', 'june', 'year', 'new', 'also', 'one', 'day', 'would', 'first', 'time', 'like', 'people', 'two', 'last', 'three', 'many', 'need']

# The membership test below runs once per token of every document, so use a
# set for constant-time lookups instead of scanning the list each time.
_remove_lookup = set(removeWords)
df['clean_text'] = df['clean_text'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in _remove_lookup)
)

# Recompute and print the per-category ranking on the filtered text
result = most_common_words(df, 'category', 'clean_text')
for category, words in result:
    print(f"Category: {category}")
    for word, count in words:
        print(f"{word}: {count}")
    print()

Category: arts
music: 280
show: 206
series: 205
park: 183
best: 165
million: 161
event: 158
back: 157
work: 154
state: 151
way: 150
may: 149
get: 144
group: 140
take: 138
world: 135
community: 132
season: 128
good: 126
school: 124
made: 124
life: 123
road: 122
rock: 122
even: 119

Category: crime
police: 626
court: 393
law: 254
state: 232
case: 197
county: 175
old: 170
found: 170
may: 165
hearing: 160
death: 159
federal: 151
prison: 147
told: 142
justice: 141
man: 141
right: 136
officer: 135
according: 133
president: 133
city: 133
could: 123
crime: 122
government: 120
home: 119

Category: disaster
fire: 584
water: 376
police: 223
air: 217
state: 214
rescue: 177
city: 167
may: 166
area: 161
county: 161
home: 144
near: 141
according: 132
news: 129
around: 127
could: 125
back: 123
national: 123
help: 122
road: 122
get: 122
old: 121
department: 121
flight: 117
service: 117

Category: economy
company: 306
may: 264
per: 241
market: 220
million: 219
price: 170
tax: 146
billion: 144
business: 

In [31]:
# Print each category's ranked words as `'category':[ 'w1','w2',... ]` text
# so it can be pasted into a dict literal.
# NOTE(review): the saved output lists 20 words per category while the
# visible cells build `result` with the default top_n=25 -- `result` was
# presumably rebuilt in a cell that is not shown; confirm before re-running.
for category, words in result:
    print(f"'{category}':[")
    # All words go on one line, each followed by a comma, then the bracket
    print(''.join(f"'{word}'," for word, count in words), end='')
    print(']')

'arts':[
'music','show','series','park','best','million','event','back','work','state','way','may','get','group','take','world','community','season','good','school',]
'crime':[
'police','court','law','state','case','county','old','found','may','hearing','death','federal','prison','told','justice','man','right','officer','according','president',]
'disaster':[
'fire','water','police','air','state','rescue','city','may','area','county','home','near','according','news','around','could','back','national','help','road',]
'economy':[
'company','may','per','market','million','price','tax','billion','business','government','state','since','industry','group','percent','use','get','made','work','according',]
'education':[
'school','university','education','high','learning','college','state','student','class','community','work','program','support','help','board','government','president','covid','public','national',]
'environmental':[
'energy','park','waste','water','power','market','state','compan

In [32]:
# Keyword lists per category, consulted by check_word_match to decide whether
# a row's text fits its labelled category. Keys are category names; each value
# is a list of lower-case words.
# NOTE(review): the entries look like they were pasted from the printed output
# of the preceding cell (top words per category) -- regenerate them if the
# word ranking changes.
checkWords = {
    'arts':[
'music','show','series','park','best','million','event','back','work','state','way','may','get','group','take','world','community','season','good','school',],
'crime':[
'police','court','law','state','case','county','old','found','may','hearing','death','federal','prison','told','justice','man','right','officer','according','president',],
'disaster':[
'fire','water','police','air','state','rescue','city','may','area','county','home','near','according','news','around','could','back','national','help','road',],
'economy':[
'company','may','per','market','million','price','tax','billion','business','government','state','since','industry','group','percent','use','get','made','work','according',],
'education':[
'school','university','education','high','learning','college','state','student','class','community','work','program','support','help','board','government','president','covid','public','national',],
'environmental':[
'energy','park','waste','water','power','market','state','company','north','management','solar','group','renewable','county','world','wildlife','city','wind','river','industry',],
'health':[
'health','covid','care','market','medical','pandemic','hospital','state','may','vaccine','public','food','disease','get','help','company','global','well','virus','work',],
'humaninterest':[
'award','size','get','act','home','back','winning','best','statement','texas','team','may','good','world','well','ownership','beneficial','plant','second','top',],
'labour':[
'work','health','security','working','pandemic','retirement','may','covid','could','get','state','make','job','business','home','help','social','market','city','week',],
'lifestyle':[
'home','travel','get','beauty','bridge','make','summer','may','way','life','work','back','million','come','food','covid','want','pandemic','best','business',],
'other':[
'state','market','game','covid','government','may','could','rub','wednesday','home','high','city','get','school','pandemic','health','make','season','back','team',],
'politics':[
'government','state','president','minister','data','security','public','city','international','support','tuesday','news','united','may','covid','national','pandemic','help','house','health',],
'religion':[
'church','life','school','family','may','community','world','way','come','work','home','covid','old','get','back','spiritual','god','state','well','city',],
'science':[
'market','research','space','report','global','data','work','company','study','industry','may','science','medical','could','high','design','engineering','covid','well','use',],
'social':[
'health','care','social','work','government','covid','may','baby','state','abortion','support','home','get','life','pandemic','according','community','make','city','help',],
'sport':[
'team','game','season','back','world','get','match','second','sport','four','league','win','best','final','play','tournament','coach','right','open','championship',],
'unrest':[
'israel','police','war','government','city','violence','shooting','israeli','military','may','wednesday','state','gun','country','president','old','come','security','world','home',],
'weather':[
'climate','change','heat','weather','water','high','carbon','could','global','national','state','may','risk','area','service','according','world','friday','rain','report',],
}

def check_word_match(row, keywords=None, min_matches=3):
    """Tell whether a row's text contains enough keywords of its category.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['category']`` and ``row['text']`` (for example a
        DataFrame row passed by ``df.apply(..., axis=1)``).
    keywords : dict[str, list[str]], optional
        Mapping of category name to keyword list. Defaults to the
        module-level ``checkWords``.
    min_matches : int, default 3
        Minimum number of distinct keywords that must occur in the text.

    Returns
    -------
    bool
        ``True`` when at least ``min_matches`` keywords of the row's category
        appear in the text; ``False`` when fewer match, when the category
        has no keyword list, or when the text is not a string (e.g. NaN).
    """
    if keywords is None:
        keywords = checkWords

    category = row['category']
    text = row['text']

    # Unknown category or missing text: nothing to match against.
    if category not in keywords or not isinstance(text, str):
        return False

    # Tokenise with the same word regex most_common_words uses; a plain
    # whitespace split keeps punctuation glued on ("court," != "court").
    tokens = set(re.findall(r'\b\w+\b', text.lower()))

    match_count = sum(1 for word in keywords[category] if word in tokens)
    return match_count >= min_matches

# Evaluate check_word_match on every row, keep the per-row result in a new
# 'matches' column, then write the whole frame out as CSV.
row_flags = df.apply(check_word_match, axis=1)
df['matches'] = row_flags
df.to_csv('ethanCleaned.csv')