### Import Libraries

In [20]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from summa import summarizer

### Import News Dataset

In [2]:
news = pd.read_json("News.json", lines = True)
news.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


### Text Cleaning

In [5]:
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove whitespace
    text = text.strip()
    
    return text

# Read the JSON file into a DataFrame
df = pd.read_json('News.json', lines = True)

# Exclude timestamp and link columns from text cleaning
exclude_columns = ['date', 'link']

# Clean text in all columns except the excluded columns
for column in df.columns:
    if column not in exclude_columns:
        df[column] = df[column].apply(clean_text)

df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,over 4 million americans roll up sleeves for o...,us news,health experts said it is too early to predict...,carla k johnson ap,2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,american airlines flyer charged banned for lif...,us news,he was subdued by passengers and crew when he ...,mary papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 of the funniest tweets about cats and dogs ...,comedy,until you have a dog you dont understand what ...,elyse wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,the funniest tweets from parents this week sep...,parenting,accidentally put grownup toothpaste on my todd...,caroline bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,woman who called cops on black birdwatcher los...,us news,amy cooper accused investment firm franklin te...,nina golgowski,2022-09-22


**1. Topic Modeling**

In [6]:
import pandas as pd

def categorize_intent(text, keyword_dict):
    for intent, keywords in keyword_dict.items():
        for keyword in keywords:
            if keyword in text:
                return intent
    return "General"

def categorize_texts(df, text_column, keyword_dict):
    intents = []
    for index, row in df.iterrows():
        text = row[text_column]
        intent = categorize_intent(text, keyword_dict)
        intents.append(intent)
    df['intent'] = intents
    return df

In [7]:
keyword_dict = {
    'Informational': ['directions', 'recipe', 'weather', 'information', 'facts', 'knowledge', 'news', 'research', 'history', 'definition', 'tutorial', 'guide', 'data', 'statistics', 'FAQ'],
    'Transactional': ['order', 'book', 'purchase', 'buy', 'reserve', 'subscribe', 'checkout', 'payment', 'delivery', 'refund', 'membership', 'upgrade', 'offer', 'coupon', 'voucher'],
    'Social': ['greeting', 'conversation', 'gratitude', 'sympathy', 'socialize', 'meet', 'chat', 'compliment', 'apologize', 'celebrate', 'small talk', 'networking', 'reconnect', 'introduce', 'interact'],
    'Navigational': ['find', 'locate', 'nearest', 'navigate', 'map', 'address', 'directions', 'GPS', 'route', 'wayfinding', 'landmark', 'traffic', 'distance', 'transportation', 'public transport'],
    'Problem-Solving': ['troubleshoot', 'solve', 'fix', 'issue', 'help', 'advice', 'error', 'bug', 'debug', 'trick', 'resolve', 'assistance', 'support', 'workaround', 'diagnose'],
    'Entertainment': ['game', 'joke', 'entertainment', 'movie', 'music', 'book', 'fun', 'play', 'puzzle', 'riddle', 'comedy', 'concert', 'novel', 'party', 'recreation'],
    'Persuasive': ['persuade', 'convince', 'support', 'promote', 'opinion', 'argument', 'debate', 'campaign', 'propose', 'endorse', 'advocate', 'conviction', 'influence', 'persuasion', 'testimonial'],
    'Emotional': ['emotion', 'feelings', 'joy', 'sadness', 'anger', 'comfort', 'support', 'happiness', 'love', 'fear', 'anxiety', 'empathy', 'encouragement', 'motivation', 'inspiration'],
    'Instructional': ['instruction', 'guide', 'assemble', 'follow', 'operate', 'tutorial', 'step-by-step', 'demonstration', 'procedure', 'training', 'walkthrough', 'demo', 'manual', 'illustration', 'technique'],
    'Collaborative': ['plan', 'coordinate', 'collaborate', 'teamwork', 'event', 'brainstorm', 'meeting', 'project', 'task', 'cooperation', 'group', 'partnership', 'contribution', 'network', 'synergy'],
    'Health': ['health', 'fitness', 'nutrition', 'exercise', 'wellness', 'diet', 'healthy habits', 'well-being', 'lifestyle', 'mental health', 'weight loss', 'medical', 'self-care', 'stress management', 'prevention'],
    'Technology': ['technology', 'device', 'software', 'hardware', 'internet', 'network', 'programming', 'gadget', 'innovation', 'digital', 'app', 'smartphone', 'cybersecurity', 'AI', 'data privacy'],
    'Finance': ['finance', 'money', 'investment', 'savings', 'budget', 'credit', 'banking', 'stocks', 'taxes', 'insurance', 'retirement', 'loan', 'debt', 'financial planning', 'wealth'],
    'Travel': ['passengers', 'travel', 'vacation', 'destination', 'hotel', 'flight', 'tourist', 'adventure', 'itinerary', 'sightseeing', 'explore', 'cultural', 'passport', 'visa', 'packing', 'local cuisine'],
    'Education': ['teacher', 'education', 'learning', 'school', 'university', 'study', 'research', 'knowledge', 'curriculum', 'exam', 'homework', 'online courses', 'degree', 'learning resources', 'tutoring', 'academic'],
    'Sports': ['sports', 'game', 'team', 'athlete', 'tournament', 'score', 'football', 'basketball', 'soccer', 'baseball', 'tennis', 'fitness', 'sportsmanship', 'coach', 'training'],
    'Fashion': ['fashion', 'clothing', 'style', 'trend', 'design', 'outfit', 'fashion show', 'accessories', 'hairstyle', 'makeup', 'wardrobe', 'shopping', 'brand', 'fashion industry', 'model'],
    'Food': ['food', 'restaurant', 'cuisine', 'recipe', 'cook', 'ingredient', 'cooking techniques', 'nutrition', 'food preparation', 'meal planning', 'healthy recipes', 'restaurant review', 'food blog', 'culinary', 'grocery'],
    'Politics': ['politics', 'government', 'policy', 'election', 'democracy', 'law', 'political news', 'current affairs', 'voting', 'political party', 'public policy', 'campaign', 'political debate', 'legislation', 'citizenship'],
    'Science': ['science', 'research', 'experiment', 'discovery', 'theory', 'scientist', 'biology', 'chemistry', 'physics', 'astronomy', 'environmental science', 'scientific method', 'lab', 'scientific journal', 'scientific inquiry'],
    'Relationships': ['relationship', 'love', 'friendship', 'dating', 'marriage', 'breakup', 'communication', 'trust', 'compatibility', 'intimacy', 'conflict resolution', 'relationship advice', 'relationship goals', 'relationship dynamics', 'relationship counseling'],
    'Art': ['art', 'painting', 'sculpture', 'gallery', 'creativity', 'expression', 'artistic techniques', 'art history', 'art exhibition', 'visual arts', 'modern art', 'art movements', 'art critique', 'artistic inspiration', 'art appreciation'],
    'Motivation': ['motivation', 'inspire', 'goal', 'ambition', 'perseverance', 'success', 'self-improvement', 'personal growth', 'achieving dreams', 'motivational quotes', 'positive mindset', 'overcoming obstacles', 'goal setting', 'self-discipline', 'empowerment'],
    'Environment': ['environment', 'sustainability', 'climate change', 'recycling', 'pollution', 'nature', 'green living', 'renewable energy', 'conservation', 'eco-friendly', 'carbon footprint', 'environmental activism', 'ecosystem', 'biodiversity', 'environmental impact'],
    'Technology News': ['tech news', 'innovation', 'gadget', 'start-up', 'software update', 'technology trends', 'tech reviews', 'tech events', 'latest gadgets', 'emerging technologies', 'tech industry news', 'future tech', 'AI advancements', 'tech controversies', 'digital transformation'],
    'Self-Care': ['self-care', 'relaxation', 'meditation', 'mindfulness', 'stress relief', 'self-care practices', 'self-love', 'self-reflection', 'self-care routines', 'self-care activities', 'wellness tips', 'mental health', 'self-care for busy schedules', 'self-care habits', 'self-care products'],
    'Career': ['career', 'job', 'interview', 'promotion', 'professional development', 'job search', 'resume writing', 'career growth', 'workplace skills', 'networking', 'career transitions', 'job satisfaction', 'career advancement', 'leadership', 'work-life balance'],
    'Parenting': ['parenting', 'child', 'baby', 'education', 'discipline', 'family', 'parenting tips', 'child development', 'parenting styles', 'positive parenting', 'parent-child relationships', 'raising children', 'parenting challenges', 'parenting support', 'parenting resources'],
    'Relationship Advice': ['relationship advice', 'communication', 'trust', 'conflict resolution', 'commitment', 'building healthy relationships', 'maintaining long-term relationships', 'relationship red flags', 'relationship boundaries', 'love languages', 'relationship milestones', 'relationship counseling', 'relationship goals', 'relationship problems', 'healthy communication'],
    'DIY Projects': ['DIY projects', 'crafts', 'home improvement', 'repairs', 'DIY home decor', 'DIY tutorials', 'upcycling', 'DIY woodworking', 'DIY gifts', 'DIY fashion', 'DIY beauty', 'DIY organization', 'DIY hacks', 'DIY renovation', 'DIY gardening'],
    'Animals': ['animals', 'pets', 'wildlife', 'domestic animals', 'animal care', 'animal behavior', 'endangered species', 'animal facts', 'animal habitats', 'animal conservation', 'animal welfare', 'animal adoption', 'animal training', 'animal rights', 'animal communication']
}


df = categorize_texts(df, 'short_description', keyword_dict)

In [9]:
df["intent"].value_counts()

General                96354
Informational          10020
Entertainment           9832
Problem-Solving         9120
Emotional               8835
Transactional           7854
Technology              7538
Art                     6814
Collaborative           6074
Navigational            4804
Fashion                 3790
Social                  3758
Health                  3737
Politics                3603
Persuasive              3511
Education               3398
Instructional           3305
Parenting               3133
Travel                  2706
Finance                 1999
Food                    1797
Relationships           1611
Sports                  1534
Science                 1249
Career                  1203
Motivation               843
Environment              657
Animals                  218
Self-Care                138
Relationship Advice       67
DIY Projects              24
Technology News            1
Name: intent, dtype: int64

**2. Topic Classification**

In [15]:
X = df["short_description"]
y = df["category"]

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=42,
                                                    stratify = y)

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

classifier = LinearSVC()
classifier.fit(X_train_vec, y_train)

accuracy = classifier.score(X_test_vec, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.4427766906886842


**3. Text Summarization**

In [33]:
# Text
text = "There are many techniques available to generate extractive summarization to keep it simple, I will be using an unsupervised learning approach to find the sentences similarity and rank them. Summarization can be defined as a task of producing a concise and fluent summary while preserving key information and overall meaning. One benefit of this will be, you don’t need to train and build a model prior start using it for your project. It’s good to understand Cosine similarity to make the best use of the code you are going to see. Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. Its measures cosine of the angle between vectors. The angle will be 0 if sentences are similar."
print(text)

There are many techniques available to generate extractive summarization to keep it simple, I will be using an unsupervised learning approach to find the sentences similarity and rank them. Summarization can be defined as a task of producing a concise and fluent summary while preserving key information and overall meaning. One benefit of this will be, you don’t need to train and build a model prior start using it for your project. It’s good to understand Cosine similarity to make the best use of the code you are going to see. Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. Its measures cosine of the angle between vectors. The angle will be 0 if sentences are similar.


In [34]:
# Extractive summarization
summary = summarizer.summarize(text)
print(summary)

Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them.


**4. Machine Translation**

In [40]:
def translate_text(text, target_language):
    translator = Translator(to_lang=target_language)
    translation = translator.translate(text)
    return translation

text = "I like to do natural language processing using python programming."
target_language = "ar"  # Translate to Arabic
translated_text = translate_text(text, target_language)
print(translated_text)

أحب القيام بمعالجة اللغة الطبيعية باستخدام برمجة بايثون.
