In [66]:
import sentiment
import pandas as pd
import nltk
import pickle
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
print(sentiment.df.head(20))

                                        author  \
0                                   Al Jazeera   
1                                   Al Jazeera   
2                               Yashraj Sharma   
3                                         None   
4                             Al Jazeera Staff   
5                                   Al Jazeera   
6                               Yashraj Sharma   
7                             Ishadrita Lahiri   
8                                   Al Jazeera   
9                                         None   
10                                      ET Now   
11                          Sudha Ramachandran   
12                                          RT   
13  Fazleena Aziz, TAN SIN CHOW, R. ARAVINTHAN   
14                                        None   
15                                        None   
16                                        None   
17            Auqib Javeed - Srinagar, Kashmir   
18             Saurabh Sharma and Shivam Patel   


In [69]:
df = sentiment.df

In [70]:
tfdif = TfidfVectorizer(stop_words='english')
df['content'] = df['content'].fillna('')
tfidf_matrix = tfdif.fit_transform(df['content'])

In [71]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [72]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()
print(indices.head())

title
Indian government agency spent millions to promote BJP election slogans      0
Photos: BJP and opposition supporters celebrate India election results       1
Modi magic: Why Indian exit polls predict record BJP win                     2
India’s Modi wins election, but BJP suffers setback                          3
India election 2024: Why isn’t Modi’s BJP fielding candidates in Kashmir?    4
dtype: int64


In [73]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    news_indices = [i[0] for i in sim_scores]
    print(sim_scores)
    return df['title'].iloc[news_indices]

In [74]:
get_recommendations('India’s Modi wins election, but BJP suffers setback')


[(5, 0.27926115750228436), (51, 0.24518877032478648), (6, 0.22200051110166974), (1, 0.1896929081189981), (2, 0.1768741401063847), (53, 0.16783842360033438), (21, 0.15839324656041506), (12, 0.15038642126184995), (17, 0.1463729627590667), (0, 0.13779264535527272)]


5     India’s exit polls predict a big majority for ...
55    India election: how Narendra Modi’s BJP uses a...
6     Modi’s BJP to lose majority in India election ...
1     Photos: BJP and opposition supporters celebrat...
2     Modi magic: Why Indian exit polls predict reco...
59      NDA gets a third term but BJP short of majority
21    Indian Opposition Celebrates as Modi's BJP Fac...
12    Modi claims victory as BJP-led alliance secure...
17    Kashmir: Why Modi's BJP is not fighting electi...
0     Indian government agency spent millions to pro...
Name: title, dtype: object

In [52]:
df['tags'] = df['content'].copy()

In [53]:
ps = PorterStemmer()

In [54]:
def stems(text):
    l = []
    for i in text.split():
        l.append(ps.stem(i))

    return ' '.join(l)

In [55]:
df['tags'] = df['tags'].apply(stems)

In [56]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [57]:
vector = cv.fit_transform(df['tags']).toarray()

In [58]:
similartiy = cosine_similarity(vector)

In [59]:
similartiy.shape

(94, 94)

In [60]:
df[df['title'] == 'India election 2024: Why isn’t Modi’s BJP fielding candidates in Kashmir?'].index[0]

4

In [61]:
def recommend(news):
    index = df[df['title'] == news].index[0]
    distances = sorted(list(enumerate(similartiy[index])), reverse=True, key=lambda x: x[1])
    for i in distances[1:6]:
        print(df.iloc[i[0]].title)

In [62]:
recommend('India’s Modi wins election, but BJP suffers setback')

India’s exit polls predict a big majority for Modi’s BJP-led alliance
Modi’s BJP to lose majority in India election shock, needs allies for gov’t
Indian Opposition Celebrates as Modi's BJP Faces Shock Election Setback
India election: how Narendra Modi’s BJP uses and abuses religious minorities for political purposes
Photos: BJP and opposition supporters celebrate India election results


In [63]:
pickle.dump(df, open('news.pkl', 'wb'))
pickle.dump(similartiy, open('similarity.pkl', 'wb'))