# Priliminary analysis for MC1

In [1]:
import pandas as pd 
import numpy as np
from globals import DATA_PATH
from matplotlib import pyplot as plt

In [117]:
articles_df = pd.read_csv("../data/news_articles.csv", encoding="utf-8", sep="|", parse_dates=["publish_date"])
articles_df.set_index("articleID", inplace=True)
articles_df.sort_index(inplace=True)

sources = articles_df["source"].unique()
articles_df.head()

Unnamed: 0_level_0,source,title,author,publish_date,location,content
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,The Orb,BUMP OF PROTESTS IN ABILA IN RESPONSE TO THE C...,,2012-09-08,"ABILA, Kronos",The thousands of people striped the roads Thur...
1,The Light of Truth,ENORMOUS IPO MAKES THE BILLIONAIRE OF SANJORGE,,2013-12-18,"CENTRUM, Tethys",The president and CEO of GAStech international...
2,Everyday News,Global boxes of the charity of dren of the dis...,,1999-11-16,,The new pleas campaign in the horizon to maint...
3,The Tulip,PROMINENT Citizen dies at 65,,1994-02-18,"CENTRUM, Tethys",Friday early after four years of diminishing h...
4,News Online Today,"ELODIS, KRONOS: HELP FROM AN UNEXPECTED SOURC...",,1998-04-26,,"NOTE: This article is the second in a series, ..."


### extracting time from content

In [118]:
import dateparser
import re
# chat generated and modified
TIME_PATTERNS = [
    r'^\d{4}\b',                        # 0932, 1452 at beginning of content
    r'\b\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)?', # 9:32, 14:52, 2:45 pm
    r'\b\d{1,2}\s*(?:AM|PM|am|pm)\b',      # 9 AM, 10 pm
    #r'\b\d{1,2}[:.]\d{2}\b',             # 9.32, 14.52
    r'\b\d{1,2} o\'clock\b'              # 9 o'clock
]
combined_pattern = re.compile('|'.join(TIME_PATTERNS))
def extract_time(content):
    match = re.search(combined_pattern, content)
    if match:
        time_str = match.group()
        parsed = dateparser.parse(time_str,settings={"RETURN_AS_TIMEZONE_AWARE": False})
        if parsed:
            return parsed.strftime("%H:%M")
    return np.nan

articles_df["time"] = articles_df["content"].apply(extract_time)
articles_df[articles_df["time"].notna()]


Unnamed: 0_level_0,source,title,author,publish_date,location,content,time
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6,World Journal,Four people died in a burning accident of road...,,2007-03-20,,One nine years - the old boy at the beginning ...,09:00
8,Modern Rubicon,ON SCENE BLOG,,2014-01-20,,1532. An employee of Kronos - of GAStech that ...,00:00
9,Kronos Star,Breaking: Emergency at GAStech Headquarters Bu...,"Cato Rossini, Marcella Trapani",2014-01-20,,Update 1:00 PM: Police have come and gone fro...,13:00
10,Homeland Illumination,VOICES - a blog about what is important to the...,Maha Salo,2014-01-20,,1018 - A fire alarm has gone off at GAStech he...,00:00
11,Tethys News,To break off itself: The emergency to GAStech ...,,2014-01-20,,"Update, 3:05 PM: A police P2o has left the sc...",15:05
...,...,...,...,...,...,...,...
832,The Orb,POK PROTESTS the END IN the ARRESTS,,2005-04-06,"ELODIS, Kronos",Fifteen members of the guards of the organizat...,08:00
838,Tethys News,To break off itself: The emergency to GAStech ...,,2014-01-20,,"Update, 5:30 PM: We have several relationship...",17:30
841,Centrum Sentinel,VOICES - a blog on what is important to people,,2014-01-20,,1837 - Are relationships not confirmed that it...,00:00
842,Homeland Illumination,VOICES - a blog about what is important to the...,Maha Salo,2014-01-20,,1802 - Homeland Illumination correspondent Pet...,00:00


*TODO: Peform similarity analysis on text and find out which are primary source and which are secondary*


*Evidence: article 618, 711, 764 are essentially a same article, which means that there is at least two secondary sources. However, no difference can tell from timesteamps, so probably a similarity analysis can reveal secondary sources if their articles are similar to many other sources*

In [119]:
import string

# split the words of content
articles_df["words"] = articles_df["content"].apply(lambda x: np.array(x.lower().translate(str.maketrans(string.punctuation, " "*len(string.punctuation))).split()))
articles_df["words"]


articleID
0      [the, thousands, of, people, striped, the, roa...
1      [the, president, and, ceo, of, gastech, intern...
2      [the, new, pleas, campaign, in, the, horizon, ...
3      [friday, early, after, four, years, of, dimini...
4      [note, this, article, is, the, second, in, a, ...
                             ...                        
840    [a, strong, rain, did, not, stop, thousands, o...
841    [1837, are, relationships, not, confirmed, tha...
842    [1802, homeland, illumination, correspondent, ...
843    [a, heavy, rain, loved, no, thousands, of, cit...
844    [1025, the, construction, is, showing, the, si...
Name: words, Length: 845, dtype: object

In [122]:
# compute TF-IDF
vocab = np.unique(np.concatenate(articles_df["words"].values))
n_articles = len(articles_df)
n_vocab = len(vocab)
n_source = len(sources)

word_index = {str(word): idx for idx, word in enumerate(vocab)}
tf = np.zeros((len(articles_df), len(vocab)))

for doc_idx, doc in enumerate(articles_df["words"].values):
    for word in doc:
        tf[doc_idx, word_index[word]] += 1
    tf[doc_idx] /= len(doc)  # normalize by total words in doc

df = np.count_nonzero(tf > 0, axis=0)  # doc freq per term
idf = np.log(n_articles / (df + 1e-10))

tf_idf = tf * idf

In [126]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tf_idf)
i_idx, j_idx = np.where((similarity_matrix > 0.5) & (np.eye(n_articles) == 0))

source_index = {str(s): idx for idx, s in enumerate(sources)} # source: idx
source_similarity_counts = np.zeros((n_source, n_source))
for i, j in zip(i_idx, j_idx):
    source_i, source_j = source_index[articles_df.loc[i]["source"]], source_index[articles_df.loc[j]["source"]]
    source_similarity_counts[source_i, source_j] += 1
source_similarity_counts


array([[ 4.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0., 25.,  0.,  0., 15.,
         0.,  0., 20.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        13.,  0.,  0.],
       [ 0.,  0.,  0., 11.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0., 12.,  0.,  0.,  0.,  0.,  0.,  1., 15.,  0., 11.,
         0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0., 11.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0., 12.,  0.,  0.,  9.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  6.],
       [ 0., 11.,  0.,  0.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  7.,  0.,  0.,  0.,  0.,  0.,  0., 13.,  0.,  9.,
         0.,  0.,  0.],
       [ 1.,  1.,  1.,  3.,  4.,  0.,  1.,  0., 26., 22.,  0.,  0.,  1.,
         2.,  2.,  2.,  0., 13., 21., 22.,  2.,  4.,  3., 16., 23., 13.,
        18.,  3.,  9.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 25.,  0.,  0.,  0.,  0.,
         0., 15.,  0.,  0.,  0., 12.,  0.,  0., 18.,  2.,  0.,  0.,  0.,
    

In [129]:
import networkx as nx

G = nx.Graph()

# Add nodes
for i in range(n_source):
    G.add_node(i, text=sources[i])  # or use an ID/title

# Add edges for similarities above threshold
for i in range(n_source):
    for j in range(i + 1, n_source):
        count = source_similarity_counts[i][j]
        G.add_edge(i, j)

In [130]:
import matplotlib.pyplot as plt

pos = nx.spring_layout(G, seed=42)
edges = G.edges(data=True)

nx.draw(G, pos, with_labels=True, edge_cmap=plt.cm.Blues)
plt.show()

KeyError: 'weight'