In [1]:
import pandas as pd 
from collections import defaultdict 
import re

In [2]:
def normalizer(s):
    """Return a lower-cased, punctuation-free, single-spaced copy of ``s``.

    Steps, in order:
      1. lower-case the text;
      2. replace every character that is neither a word character
         (letters, digits, underscore) nor whitespace with a space;
      3. squeeze each run of whitespace down to one space.

    Leading/trailing whitespace is not stripped, so the result may still
    begin or end with a single space.
    """
    lowered = s.lower()
    # Punctuation becomes a space (not the empty string) so that words
    # joined only by punctuation, e.g. "test!01", are kept apart.
    depunctuated = re.sub(r'[^\w\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', depunctuated)

In [3]:
# Quick sanity check: one input mixing upper case, a run of spaces and punctuation.
print(normalizer("This is a    test!01"))

this is a test 01


In [4]:
# Load the article table; the cells below read its 'title' and 'text' columns.
data = pd.read_csv("W1/data/all_data.csv")

In [5]:
# Display the loaded frame to check it was read as expected.
data

Unnamed: 0,id,url,title,text
0,4051460,https://en.wikipedia.org/wiki/Rank%20Insignia%...,Rank Insignia of the Islamic Revolutionary Gua...,This is a list of rank insignia used by the Is...
1,4183,https://en.wikipedia.org/wiki/Botany,Botany,"Botany, also called plant science (or plant sc..."
2,4070755,https://en.wikipedia.org/wiki/Russell%27s%20te...,Russell's teapot,"Russell's teapot is an analogy, formulated by ..."
3,4097790,https://en.wikipedia.org/wiki/I%20Am%20the%20R...,I Am the Resurrection (album),I Am the Resurrection: A Tribute to John Fahey...
4,4071549,https://en.wikipedia.org/wiki/Rolling%20hash,Rolling hash,A rolling hash (also known as recursive hashin...
5,3466,https://en.wikipedia.org/wiki/Brunei,Brunei,"Brunei ( , ), formally Brunei Darussalam (, Ja..."
6,4053264,https://en.wikipedia.org/wiki/The%20Patriotic%...,The Patriotic Traitors,The Patriotic Traitors: A History of Collabora...
7,4092916,https://en.wikipedia.org/wiki/Nick%20Gillingham,Nick Gillingham,"Nicholas Gillingham, (born 22 January 1967) i..."
8,4096543,https://en.wikipedia.org/wiki/Herbert%20M.%20A...,Herbert M. Allison,"Herbert Monroe Allison, Jr. (August 2, 1943 – ..."
9,4058979,https://en.wikipedia.org/wiki/Gordon%20Durie,Gordon Durie,Gordon Scott Durie (born 6 December 1965 in Pa...


In [6]:
def split_text(text):
    """Tokenise ``text``: clean it with ``normalizer``, then split on whitespace.

    Returns the list of tokens in their original order (duplicates kept).
    """
    return normalizer(text).split()

def create_vocabulary(data: pd.DataFrame) -> list:
    """Collect the distinct tokens found in the ``'text'`` column of ``data``.

    Args:
        data: frame with a ``'text'`` column; each value is tokenised with
            ``split_text``.

    Returns:
        A sorted list of the unique tokens across all rows (empty list when
        ``data`` has no rows).
    """
    vocabulary = set()
    # Only the 'text' column is needed, so iterate over it directly rather
    # than materialising every full row with iterrows().
    for text in data['text']:
        vocabulary.update(split_text(text))
    return sorted(vocabulary)
def get_articles(data: pd.DataFrame) -> list:
    """Return the values of the ``'title'`` column of ``data`` as a list.

    The order matches the row order of ``data``; duplicates are kept.
    """
    return data['title'].to_list()

In [7]:
# Build the sorted vocabulary over the text of every article in `data`.
vocab = create_vocabulary(data)

In [8]:
# Set up an all-zero term-document matrix:
# one row per vocabulary word, one column per article title.
columns = get_articles(data)
# Reuse the vocabulary already computed into `vocab` instead of scanning
# and tokenising the whole corpus a second time.
rows = vocab
df = pd.DataFrame(0, columns=columns, index=rows)

In [9]:
# Display the matrix right after creation (every cell was initialised to 0).
df

Unnamed: 0,Rank Insignia of the Islamic Revolutionary Guard Corps,Botany,Russell's teapot,I Am the Resurrection (album),Rolling hash,Brunei,The Patriotic Traitors,Nick Gillingham,Herbert M. Allison,Gordon Durie,...,Mike Junkin,Court of appeals (disambiguation),Isiro,Bishop,Now That's What I Call Music! 21,Tourism in the Isle of Man,Gauss (disambiguation),Service High School,Journal of Atmospheric and Oceanic Technology,Binary prefix
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ḥanīfa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
啞陳,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
文杜陵,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
沙胡重,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Fill the term-document matrix: df[title][word] = occurrences of `word`
# in that article's text.
#
# Each article's tokens are counted once and written with a single .loc
# assignment, instead of one DataFrame scalar read-modify-write per token.
# Assigning (rather than incrementing) also makes this cell safe to re-run:
# words absent from an article keep the 0 the matrix was initialised with.
# NOTE(review): assumes article titles are unique column labels - confirm.
for title, text in zip(data['title'], data['text']):
    words = split_text(text)
    if not words:
        continue  # no tokens in this article: its column stays all zeros
    word_counts = pd.Series(words).value_counts()
    df.loc[word_counts.index, title] = word_counts

In [11]:
# Display the matrix again now that the per-article word counts are filled in.
df

Unnamed: 0,Rank Insignia of the Islamic Revolutionary Guard Corps,Botany,Russell's teapot,I Am the Resurrection (album),Rolling hash,Brunei,The Patriotic Traitors,Nick Gillingham,Herbert M. Allison,Gordon Durie,...,Mike Junkin,Court of appeals (disambiguation),Isiro,Bishop,Now That's What I Call Music! 21,Tourism in the Isle of Man,Gauss (disambiguation),Service High School,Journal of Atmospheric and Oceanic Technology,Binary prefix
0,0,0,0,0,2,2,0,0,0,3,...,0,0,0,0,0,1,0,0,0,2
000,0,4,0,0,0,9,0,0,0,1,...,0,0,1,0,0,4,0,0,0,0
0001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
006,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ḥanīfa,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
啞陳,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
文杜陵,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
沙胡重,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Persist the term-document matrix (rows: vocabulary words, columns: article titles).
df.to_csv("W1/data/TD.csv")