# Clustering with Features Selection

In [1]:
import json
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

from sklearn import preprocessing

from sklearn.metrics import silhouette_score
from kneed import KneeLocator

from scipy.interpolate import make_interp_spline, BSpline

from sklearn.decomposition import PCA

import math

from sklearn.feature_selection import SelectKBest

## Reading data

In [2]:
with open ("gmw_intersection.json", "r", encoding = "utf-8") as infile:
    gmw_intersection = json.load(infile)

In [3]:
with open("ns_subreddit_2021-01.json", "r", encoding = "utf-8") as infile1:
    ns_subreddit_jan = json.load(infile1)

with open("ns_subreddit_2021-02.json", "r", encoding = "utf-8") as infile2:
    ns_subreddit_feb = json.load(infile2)
    
with open("ns_subreddit_2021-03.json", "r", encoding = "utf-8") as infile3:
    ns_subreddit_march = json.load(infile3)

with open("ns_subreddit_2021-04.json", "r", encoding = "utf-8") as infile4:
    ns_subreddit_april = json.load(infile4)
    
with open("ns_subreddit_2021-05.json", "r", encoding = "utf-8") as infile5:
    ns_subreddit_may = json.load(infile5)
    
with open("ns_subreddit_2021-02.json", "r", encoding = "utf-8") as infile6:
    ns_subreddit_jun = json.load(infile6)

In [4]:
# Reading in all the subreddits in our files

with open("subreddit_ns_2021-01.json", "r", encoding = "utf-8") as infile1a:
    subreddit_ns_jan = json.load(infile1a)

with open("subreddit_ns_2021-02.json", "r", encoding = "utf-8") as infile2a:
    subreddit_ns_feb = json.load(infile2a)
    
with open("subreddit_ns_2021-03.json", "r", encoding = "utf-8") as infile3a:
    subreddit_ns_march = json.load(infile3a)
    
with open("subreddit_ns_2021-04.json", "r", encoding = "utf-8") as infile4a:
    subreddit_ns_april = json.load(infile4a)
    
with open("subreddit_ns_2021-05.json", "r", encoding = "utf-8") as infile5a:
    subreddit_ns_may = json.load(infile5a)
    
with open("subreddit_ns_2021-02.json", "r", encoding = "utf-8") as infile6a:
    subreddit_ns_jun = json.load(infile6a)

Now, let's set up a fixed ordering for the subreddits as features

In [7]:
subreddit_intersection = list(set(subreddit_ns_jan) & set(subreddit_ns_feb) &
      set(subreddit_ns_march) & set(subreddit_ns_april) &
      set(subreddit_ns_may) & set(subreddit_ns_jun))

In [8]:
len(subreddit_intersection)

6220

Let's create the vector representations for each news source.

In [11]:
ns_sr_rep = dict()
for ns in ns_subreddit_april:
    # the representation of a news source = # mentions in each subreddit
    ns_subreddit_april[ns] = Counter(ns_subreddit_april[ns])
    # making sure the ordering follows `subreddit_intersection`
    ns_sr_rep[ns] = [ns_subreddit_april[ns][subreddit_intersection[idx]] for idx in range(len(subreddit_intersection))]

# added Dec 12

# january
for ns_jan in ns_subreddit_jan:
    ns_subreddit_jan[ns_jan] = Counter(ns_subreddit_jan[ns_jan])
    if ns_jan in ns_sr_rep:
        # update
        ns_sr_rep[ns_jan] = [ns_sr_rep[ns_jan][idx] + ns_subreddit_jan[ns_jan][subreddit_intersection[idx]] 
                             for idx in range(len(subreddit_intersection))]   
    else:
        ns_sr_rep[ns_jan] = [ns_subreddit_jan[ns_jan][subreddit_intersection[idx]] 
                             for idx in range(len(subreddit_intersection))]

# february
for ns_feb in ns_subreddit_feb:
    ns_subreddit_feb[ns_feb] = Counter(ns_subreddit_feb[ns_feb])
    if ns_feb in ns_sr_rep:
        ns_sr_rep[ns_feb] = [ns_sr_rep[ns_feb][idx] + ns_subreddit_feb[ns_feb][subreddit_intersection[idx]] 
                             for idx in range(len(subreddit_intersection))]   
    else:
        ns_sr_rep[ns_feb] = [ns_subreddit_feb[ns_feb][subreddit_intersection[idx]] 
                             for idx in range(len(subreddit_intersection))]

# march
for ns_march in ns_subreddit_march:
    ns_subreddit_march[ns_march] = Counter(ns_subreddit_march[ns_march])
    if ns_march in ns_sr_rep:
        ns_sr_rep[ns_march] = [ns_sr_rep[ns_march][idx] + ns_subreddit_march[ns_march][subreddit_intersection[idx]] 
                               for idx in range(len(subreddit_intersection))]   
    else:
        ns_sr_rep[ns_march] = [ns_subreddit_march[ns_march][subreddit_intersection[idx]] 
                             for idx in range(len(subreddit_intersection))]
     

    
# may
for ns_may in ns_subreddit_may:
    ns_subreddit_may[ns_may] = Counter(ns_subreddit_may[ns_may])
    if ns_may in ns_sr_rep:
        ns_sr_rep[ns_may] = [ns_sr_rep[ns_may][idx] + ns_subreddit_may[ns_may][subreddit_intersection[idx]] 
                             for idx in range(len(subreddit_intersection))]   
    else:
        ns_sr_rep[ns_may] = [ns_subreddit_may[ns_may][subreddit_intersection[idx]] 
                             for idx in range(len(subreddit_intersection))]
        
# june
for ns_jun in ns_subreddit_jun:
    ns_subreddit_jun[ns_jun] = Counter(ns_subreddit_jun[ns_jun])
    if ns_jun in ns_sr_rep:
        ns_sr_rep[ns_jun] = [ns_sr_rep[ns_jun][idx] + ns_subreddit_jun[ns_jun][subreddit_intersection[idx]] 
                             for idx in range(len(subreddit_intersection))]   
    else:
        ns_sr_rep[ns_jun] = [ns_subreddit_jun[ns_jun][subreddit_intersection[idx]] 
                             for idx in range(len(subreddit_intersection))]

Now we normalize the news sources' representations using `numpy`'s normalize.

In [12]:
for ns in ns_sr_rep:
    rep = ns_sr_rep[ns]
    ns_sr_rep[ns] = preprocessing.normalize(np.array([rep]))

Then we select 500 features. Note that we need unsupervised feature selection method. Continue here: https://towardsdatascience.com/overview-of-feature-selection-methods-a2d115c7a8f7.