# Clustering News Articles 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.stem import PorterStemmer
porter = PorterStemmer()
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

In [2]:
df=pd.read_excel("Dataset/Data_Train.xlsx")

In [3]:
df.STORY=df.STORY.apply(lambda x:re.sub(r"[^A-Za-z]"," ",x))

In [4]:
def clean_data(row):
    tokens=row.split()
    word_list=[]
    for x in tokens:
        x=porter.stem(x.lower())
        if (x not in stop_words) and len(x)>2:
            word_list.append(x.lower())
    return (" ".join(word_list))   

In [5]:
df['story']=df.STORY.apply(lambda x:clean_data(x))

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

tf = vectorizer.fit_transform(df['story']).toarray()

tf_feature_names = vectorizer.get_feature_names()

In [7]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 4

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [8]:
model.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=4, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [9]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Document %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [10]:
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Document 0 words,Document 1 words,Document 2 words,Document 3 words
0,year,thi,smartphon,parti
1,market,said,phone,bjp
2,said,user,camera,elect
3,thi,use,appl,congress
4,india,film,samsung,said
5,price,also,pro,state
6,compani,like,display,seat
7,trade,app,iphon,modi
8,bank,new,devic,minist
9,growth,one,launch,poll


# Conclusion

By analyzing top 10 words of each document..  
Document0:- **technology related**   
Document1:- **Politics**  
Document2:- **Mobile Phones**  
Document3:- **Entertainment** (like movie..)  