In [1]:
import pandas as pd
import numpy as np 
from collections import OrderedDict
# Show up to 400 characters per cell so long headlines are not cut off in output.
pd.set_option('display.max_colwidth', 400)
import re
# Shared seed; passed as random_state to UMAP further down.
rs=10

from anytree import Node, RenderTree, PreOrderIter
from anytree.exporter import DotExporter, DictExporter

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

from tqdm import tqdm
import yake

# Read Clean Data

In [15]:
# Load the cleaned 2020 headline file (path is relative to the working directory).
df = pd.read_csv('../data/individual_news_2020.csv')

In [16]:
# Column dtypes and non-null counts before rows with missing values are dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226757 entries, 0 to 226756
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   News        226756 non-null  object
 1   clean_news  226565 non-null  object
dtypes: object(2)
memory usage: 3.5+ MB


In [17]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [18]:
# Re-check non-null counts after dropping rows with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226565 entries, 0 to 226756
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   News        226565 non-null  object
 1   clean_news  226565 non-null  object
dtypes: object(2)
memory usage: 5.2+ MB


In [19]:
# Renumber rows from 0; `drop` is not set, so the previous row labels are
# kept as an 'index' column.
df = df.reset_index()

# Embed Data

In [20]:
# Preview the first rows: raw headline ('News') next to its cleaned form ('clean_news').
df.head()

Unnamed: 0,index,News,clean_news
0,0,Shivin Narang injures his hand on the set of his show,shivin narang injures his hand the set his show
1,1,'Allergy cases on the rise in Bengaluru,'allergy cases the rise bengaluru
2,2,'A grand Hanukkah celebration held in the city,grand hanukkah celebration held the city
3,3,'I respect my competitors,respect competitors
4,4,because they bring out the best in me: Yash,because they bring out the best me: yash


In [21]:
# Spot-check five random cleaned headlines (unseeded, so the rows differ per run).
df['clean_news'].sample(5)

218063                       "'bihar shows improvement child health'"
121893    'kanpur: revenue department probes land grabbed vikas dubey
199497                                               robbed right too
105957                     'case surge prompts three-day shop closure
197688                             'cheats duped people small amounts
Name: clean_news, dtype: object

In [22]:
# Work on a 10,000-row subset. The draw is seeded with the notebook-wide seed
# `rs` so the saved sample, and every feature derived from it, is the same on
# each Restart & Run All.
data = df.sample(10000, random_state=rs)

In [23]:
# Sanity check: expect 10,000 sampled rows.
data.shape

(10000, 3)

In [24]:
# Eyeball five random rows of the subset (unseeded, so the rows differ per run).
data.sample(5)

Unnamed: 0,index,News,clean_news
111000,111114,'Chandigarh in favour of flyover on Sector 29-31 rotary,'chandigarh favour flyover sector 29-31 rotary
192466,192631,roads dry,roads dry
157568,157706,'Prepare anganwadi centres as pre-primary schools: Andhra Pradesh CM,'prepare anganwadi centres pre-primary schools: andhra pradesh
210690,210869,'Raga Blues: What’s missing this Margazhi season?,'raga blues: what’s missing this margazhi season?
96714,96820,kin not convinced,kin not convinced


In [25]:
# Renumber the sampled rows from 0 so positions line up with the feature
# matrices built later; `drop` is not set, so the old labels stay as a column.
# NOTE(review): an 'index' column already exists, so pandas presumably stores
# these labels under 'level_0' — confirm if downstream code relies on it.
data = data.reset_index()

In [26]:
# Persist the sampled rows; index=False leaves the row labels out of the file.
data.to_csv('../data/news_sample_10000.csv', index=False)

# BOW

In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def count_vect(field, train_data, min_df=5, ngram_range=(1, 2), max_features=1000):
    """Fit a bag-of-words vectorizer on one text column and vectorize that column.

    Parameters
    ----------
    field : str
        Name of the column in ``train_data`` holding the text.
    train_data : pandas.DataFrame
        Frame containing the documents to fit on.
    min_df, ngram_range, max_features
        Passed straight to ``CountVectorizer``; the defaults are the values
        that were previously hard-coded.

    Returns
    -------
    tuple
        ``(train_feature_set, vectorizer, fitted_matrix)``. The first and third
        items are the same document-term matrix object: it is the output of
        ``fit_transform`` on the training documents.
    """
    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range, max_features=max_features)
    # fit_transform already returns the matrix for the documents it was fitted
    # on, so a second transform() over the same rows would only repeat the work.
    doc_term_matrix = vectorizer.fit_transform(train_data[field].values)

    return doc_term_matrix, vectorizer, doc_term_matrix

In [32]:
# Bag-of-words features for the sample. Note: despite its name, `transformer1`
# receives the matrix returned by fit_transform, not a transformer object;
# `vec1` is the fitted CountVectorizer.
X_train_cov, vec1, transformer1 = count_vect('clean_news',data)

In [33]:
# Expect one row per sampled headline and at most max_features (1000) columns.
X_train_cov.shape

(10000, 1000)

In [34]:
# Convert to a dense array so it can be concatenated with the other feature blocks.
bow = X_train_cov.toarray()

# TFIDF

In [35]:
def tfidf_vect(field, train_data, min_df=5, ngram_range=(1, 2), max_features=1000):
    """Fit a TF-IDF vectorizer on one text column and vectorize that column.

    Parameters
    ----------
    field : str
        Name of the column in ``train_data`` holding the text.
    train_data : pandas.DataFrame
        Frame containing the documents to fit on.
    min_df, ngram_range, max_features
        Passed straight to ``TfidfVectorizer``; the defaults are the values
        that were previously hard-coded.

    Returns
    -------
    tuple
        ``(train_feature_set, vectorizer, fitted_matrix)``. The first and third
        items are the same TF-IDF matrix object: it is the output of
        ``fit_transform`` on the training documents.
    """
    tfidf_vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range, max_features=max_features)
    # fit_transform already returns the matrix for the documents it was fitted
    # on, so a second transform() over the same rows would only repeat the work.
    tfidf_matrix = tfidf_vectorizer.fit_transform(train_data[field].values)

    return tfidf_matrix, tfidf_vectorizer, tfidf_matrix

In [36]:
# TF-IDF features for the sample. Note: despite its name, `transformer`
# receives the matrix returned by fit_transform, not a transformer object;
# `vec2` is the fitted TfidfVectorizer.
X_train_tf, vec2, transformer = tfidf_vect('clean_news',data)

In [37]:
# Expect one row per sampled headline and at most max_features (1000) columns.
X_train_tf.shape

(10000, 1000)

In [38]:
# Convert to a dense array so it can be concatenated with the other feature blocks.
tfidf = X_train_tf.toarray()

# Sentence Transformers

In [27]:
import os
from sentence_transformers import SentenceTransformer

# Directory of the locally stored roberta-base-nli-stsb-mean-tokens model.
# Set SENTENCE_TRANSFORMER_MODEL_DIR to run this on another machine; the
# default keeps the location the notebook was originally run with.
model_dir = os.environ.get(
    'SENTENCE_TRANSFORMER_MODEL_DIR',
    '/Users/keerthana.s/projects/common/transformers/roberta-base-nli-stsb-mean-tokens',
)
t = SentenceTransformer(model_dir)

In [28]:
# Encode every sampled headline into a sentence embedding. With batch_size=500
# the 10,000 rows are processed as 20 batches (matches the progress bar output).
# NOTE(review): `num_workers` may not be accepted by every sentence-transformers
# release — confirm against the installed version.
embeddings = t.encode(data['clean_news'].values, show_progress_bar=True, num_workers=4, batch_size=500)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=20.0, style=ProgressStyle(description_width…




In [39]:
# Place the three feature blocks side by side (column-wise): bag-of-words
# counts, TF-IDF weights, then sentence embeddings.
features = np.hstack([bow, tfidf, embeddings])

In [40]:
# Column count should equal the summed widths of the three feature blocks.
features.shape

(10000, 2768)

# Scaling

In [41]:
from sklearn.preprocessing import StandardScaler
def get_normalised_data(data):
    """Return `data` passed through a freshly fitted StandardScaler with centring disabled.

    The scaler is created with ``with_mean=False`` and fitted on `data` itself;
    only the transformed values are returned (the scaler is not kept).
    """
    return StandardScaler(with_mean=False).fit_transform(data)

In [43]:
# Scale the combined feature matrix before dimensionality reduction.
scaled_data = get_normalised_data(features)

In [44]:
# Scaling must not change the shape of the feature matrix.
scaled_data.shape

(10000, 2768)

# Dimensionality reduction

In [45]:
import umap.umap_ as umap

# Fit UMAP on the scaled features, projecting down to 500 components.
# Seeded with the notebook-wide `rs`.
d = umap.UMAP(
    n_components=500,
    n_neighbors=15,
    min_dist=0.1,
    random_state=rs,
).fit(scaled_data)

In [46]:
# Wrap the fitted UMAP coordinates (d.embedding_) in a DataFrame, one row per headline.
embedding_df = pd.DataFrame(d.embedding_)

In [47]:
# Expect (number of sampled rows, n_components).
embedding_df.shape

(10000, 500)

In [53]:
# Inspect the first few reduced vectors.
embedding_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,9.967623,9.56259,9.891655,9.934966,9.858666,9.870971,9.33578,9.880396,9.76696,9.126382,...,5.258048,3.887737,7.38792,6.239135,3.592102,4.60873,4.412725,3.605367,3.166441,6.710532
1,9.960107,9.632978,9.879575,9.920382,9.865302,9.777955,9.549959,9.963574,9.764003,9.117813,...,5.293014,3.880162,7.345045,6.25039,3.578063,4.6484,4.329896,3.655612,3.170959,6.692613
2,9.965224,9.586465,9.901536,9.956074,9.860868,9.819626,9.237768,9.844108,9.769893,9.151678,...,5.271272,3.927061,7.362682,6.233048,3.630614,4.637338,4.333327,3.594735,3.139993,6.703682
3,9.953863,9.604853,9.891928,9.885715,9.866065,9.880277,9.356936,9.914603,9.903923,9.121755,...,5.267444,3.855801,7.397842,6.24277,3.586447,4.633599,4.421291,3.657853,3.189579,6.725579
4,9.964613,9.583738,9.871429,9.93675,9.831121,9.872887,9.395439,9.851462,9.919547,9.153717,...,5.276075,3.864719,7.38746,6.274598,3.547915,4.542058,4.387795,3.560054,3.140079,6.727359


In [54]:
# Persist the reduced features; index=False leaves the row labels out of the file.
embedding_df.to_csv('../data/transformed_data_10000.csv', index=False)