<a href="https://colab.research.google.com/github/likeshd/datascience_case_study/blob/main/text_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
import spacy
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the small English spaCy pipeline once; the NER helper reuses it.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)


In [11]:
# Load the articles dataset; the file is decoded as latin-1.
text = pd.read_csv("/content/articles.csv", encoding='latin-1')
# End on a bare expression so the notebook renders a rich table; print()
# falls back to plain text that wraps the columns across several blocks.
text.head(5)

                                             Article  \
0  Data analysis is the process of inspecting and...   
1  The performance of a machine learning algorith...   
2  You must have seen the news divided into categ...   
3  When there are only two classes in a classific...   
4  The Multinomial Naive Bayes is one of the vari...   

                                               Title  
0                  Best Books to Learn Data Analysis  
1         Assumptions of Machine Learning Algorithms  
2          News Classification with Machine Learning  
3  Multiclass Classification Algorithms in Machin...  
4        Multinomial Naive Bayes in Machine Learning  


In [5]:
# Summarize column names, non-null counts and dtypes of the loaded frame.
text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  34 non-null     object
 1   Title    34 non-null     object
dtypes: object(2)
memory usage: 672.0+ bytes


In [6]:
# Concatenate every title into one space-separated string for the word cloud
all_titles = text['Title'].tolist()
titles_text = ' '.join(all_titles)
titles_text


'Best Books to Learn Data Analysis Assumptions of Machine Learning Algorithms News Classification with Machine Learning Multiclass Classification Algorithms in Machine Learning Multinomial Naive Bayes in Machine Learning News Classification with Machine Learning Best Books to Learn NLP Send Instagram Messages using Python Pfizer Vaccine Sentiment Analysis using Python Squid Game Sentiment Analysis using Python Best Books to Learn Computer Vision Best Resources to Learn Python Best Python Frameworks to Build APIs Voice Recorder using Python Language Detection with Machine Learning Multilayer Perceptron in Machine Learning Types of Neural Networks Clustering Algorithms in Machine Learning For Loop Over Keys and Values in a Python Dictionary Health Insurance Premium Prediction with Machine Learning Mean Shift Clustering in Machine Learning BIRCH Clustering in Machine Learning Agglomerative Clustering in Machine Learning DBSCAN Clustering in Machine Learning K-Means Clustering in Machine L

In [9]:
# Build the word cloud from the combined titles
cloud_options = dict(width=1000, height=500, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(titles_text)
wordcloud


<wordcloud.wordcloud.WordCloud at 0x784237df6020>

In [10]:
# Render the word cloud as an image figure
fig = px.imshow(wordcloud, title='Word Cloud of Titles').update_layout(
    showlegend=False
)
fig.show()

In [13]:
# Sentiment Analysis
def _polarity(article):
    """Return the TextBlob polarity score for a single article string."""
    return TextBlob(article).sentiment.polarity


# One polarity score per article, stored alongside the source text.
text['Sentiment'] = text['Article'].apply(_polarity)
text

Unnamed: 0,Article,Title,Sentiment
0,Data analysis is the process of inspecting and...,Best Books to Learn Data Analysis,0.666667
1,The performance of a machine learning algorith...,Assumptions of Machine Learning Algorithms,0.020833
2,You must have seen the news divided into categ...,News Classification with Machine Learning,0.6
3,When there are only two classes in a classific...,Multiclass Classification Algorithms in Machin...,0.625
4,The Multinomial Naive Bayes is one of the vari...,Multinomial Naive Bayes in Machine Learning,-0.101429
5,You must have seen the news divided into categ...,News Classification with Machine Learning,0.6
6,Natural language processing or NLP is a subfie...,Best Books to Learn NLP,0.283333
7,By using a third-party application or API to m...,Send Instagram Messages using Python,0.05
8,Twitter is one of the most popular social medi...,Pfizer Vaccine Sentiment Analysis using Python,0.406667
9,The squid game is currently one of the most tr...,Squid Game Sentiment Analysis using Python,-0.108333


In [15]:
# Histogram of the per-article polarity scores
fig = px.histogram(
    data_frame=text,
    x='Sentiment',
    title='Sentiment Distribution',
)
fig.show()

In [16]:
# NER
def extract_named_entities(text):
    """Group the named entities spaCy finds in ``text`` by entity label.

    Runs the module-level ``nlp`` pipeline on ``text`` and returns a plain
    dict mapping each entity label to the list of entity strings carrying
    that label, in document order. Repeated mentions are kept. An input
    with no entities yields an empty dict.
    """
    grouped = {}
    for span in nlp(text).ents:
        # setdefault creates the list the first time a label is seen.
        grouped.setdefault(span.label_, []).append(span.text)
    return grouped


In [19]:
# Run the NER helper on every article and keep the per-article dict.
named_entities = text['Article'].apply(extract_named_entities)
text['Named_Entities'] = named_entities
text['Named_Entities'].head(10)


0                                  {'DATE': ['today']}
1                                                   {}
2                                                   {}
3          {'CARDINAL': ['only two', 'more than two']}
4    {'ORG': ['The Multinomial Naive Bayes', 'Naive...
5                                                   {}
6    {'ORG': ['NLP', 'NLP', 'NLP', 'NLP'], 'CARDINA...
7    {'ORDINAL': ['third'], 'ORG': ['API', 'Instagr...
8    {'PRODUCT': ['Twitter', 'Twitter'], 'CARDINAL'...
9            {'ORG': ['NetFlix'], 'CARDINAL': ['One']}
Name: Named_Entities, dtype: object

In [20]:
# Visualize NER
# Each row of 'Named_Entities' is a dict of label -> list of mentions.
# Iterating the dict directly only yields its label keys, so walk the value
# lists to count the actual entity mentions.
entity_counts = Counter(
    mention
    for entities in text['Named_Entities']
    for mentions in entities.values()
    for mention in mentions
)
# most_common() orders by descending count, so head(10) below really is the
# top 10; building from pairs also keeps both columns when nothing was found.
entity_df = pd.DataFrame(entity_counts.most_common(), columns=['Entity', 'Count'])

fig = px.bar(entity_df.head(10), x='Entity', y='Count', title='Top 10 Named Entities')
fig.show()

In [22]:
# Topic Modeling
# Bag-of-words counts: drop English stop words, terms appearing in more than
# 95% of articles or in fewer than 2, and cap the vocabulary at 1000 terms.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.95,
)
tf = vectorizer.fit_transform(text['Article'])

# Fit a 5-topic LDA model with a fixed seed; the result has one row per
# article and one column per topic.
lda_model = LatentDirichletAllocation(random_state=42, n_components=5)
lda_topic_matrix = lda_model.fit_transform(tf)


In [24]:
# Label each article with its highest-weighted topic
topic_names = [f"Topic {idx}" for idx in range(lda_model.n_components)]
dominant_idx = lda_topic_matrix.argmax(axis=1)
text['Dominant_Topic'] = [topic_names[idx] for idx in dominant_idx]
text['Dominant_Topic']


0     Topic 1
1     Topic 0
2     Topic 1
3     Topic 3
4     Topic 4
5     Topic 1
6     Topic 0
7     Topic 1
8     Topic 2
9     Topic 2
10    Topic 1
11    Topic 1
12    Topic 1
13    Topic 1
14    Topic 1
15    Topic 1
16    Topic 1
17    Topic 4
18    Topic 1
19    Topic 2
20    Topic 3
21    Topic 3
22    Topic 3
23    Topic 3
24    Topic 3
25    Topic 1
26    Topic 1
27    Topic 1
28    Topic 0
29    Topic 0
30    Topic 1
31    Topic 1
32    Topic 4
33    Topic 1
Name: Dominant_Topic, dtype: object

In [26]:
# Count articles per dominant topic. Name both columns explicitly: the
# default column names produced by value_counts().reset_index() differ
# between pandas 1.x ('index', 'Dominant_Topic') and pandas 2.x
# ('Dominant_Topic', 'count'), so relying on them breaks across versions.
topic_counts = (
    text['Dominant_Topic']
    .value_counts()
    .rename_axis('Dominant_Topic')
    .reset_index(name='Count')
)
fig = px.bar(topic_counts, x='Dominant_Topic', y='Count', title='Topic Distribution')
fig.show()