In [34]:
import pandas as pd
df = pd.read_csv("enwiki-20170820.csv", nrows=500)

merged_df = df.groupby('ARTICLE_ID').agg({'TITLE': 'first', 'SECTION_TEXT': ' '.join})
merged_df.rename(columns={'SECTION_TEXT': 'FULL_TEXT'}, inplace=True)
# merged_df = merged_df.sort_values(by='TITLE')

#merged_df.to_csv('merged_articles.csv', index=False)

merged_df


Unnamed: 0_level_0,TITLE,FULL_TEXT
ARTICLE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
3,A,\n\n\n\n\n\n\nWriting cursive forms of A\n'''A...
25,ASCII,\n\n\n\n'''ASCII''' ( ) abbreviated from '''Am...
41,Aberdeen (disambiguation),\n'''Aberdeen''' is a city in Scotland United ...
6,Abraham Lincoln,\n\n\n\n\n\n'''Abraham Lincoln''' (; February ...
9,Academy Award for Best Production Design,\n\n\nThe '''Academy Award for Best Production...
10,Academy Awards,\n\n\n\n\nThe '''Academy Awards''' now known o...
5,Achilles,\n\nAchilles and the Nereid Cymothoe Attic red...
11,Actrius,\n\n\n\n'''''Actresses''''' (Catalan: '''''Act...
40,Ada,\n\n'''Ada''' may refer to:\n\n \n===Africa===...
31,Afroasiatic languages,\n\n\n\n'''Afroasiatic''' ('''Afro-Asiatic''')...


In [80]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

merged_df=pd.read_csv("pleasework.csv",header=None)
merged_df.columns = ['ARTICLE_ID', 'TITLE', 'FULL_TEXT']
def is_adverb(tag):
    return tag.startswith('RB')

def preprocess_text(text):
    porter_stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    tokens = nltk.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)
    
    tokens = [token.lower() for token in tokens]
 
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    tokens = [re.sub(r'[^a-zA-Z0-9]', '', token) for token in tokens]

    tokens = [token for token in tokens if token]

    stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]

    lemmatized_tokens = []
    for token, tag in tagged_tokens:
        if is_adverb(tag):
            lemmatized_tokens.append(lemmatizer.lemmatize(token, pos=wordnet.ADV))
        else:
            lemmatized_tokens.append(lemmatizer.lemmatize(token))

    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text

merged_df['FULL_TEXT'] = merged_df['FULL_TEXT'].apply(preprocess_text)


In [81]:
merged_df

Unnamed: 0,ARTICLE_ID,TITLE,FULL_TEXT
0,0,Anarchism,political philosophy based voluntary often sta...
1,1,Autism,disorder social interaction verbal communicati...
2,2,Albedo,percentage diffusely reflected sunlight relati...
3,3,A,writing cursive plural aes first letter first ...
4,4,Alabama,state southeastern region united bordered nort...
...,...,...,...
1110,1110,Antacid,antacid substance stomach acidity used relieve...
1111,1111,Antidiarrhoeal,drug drug medication symptomatic relief electr...
1112,1112,Áed mac Cináeda,mac born son mac king brother white chronicle ...
1113,1113,Abdul Hamid I,march sultan empire empire born younger son su...


In [82]:
merged_df.to_csv('merged_articles.csv', index=True)

In [83]:
from nltk.tokenize import word_tokenize
vocabulary = set()
for text in merged_df['FULL_TEXT']:
    tokens = word_tokenize(text)
    vocabulary.update(tokens)
sorted_vocabulary = sorted(vocabulary)

print("Vocabulary size:", len(sorted_vocabulary))
print("Sample vocabulary terms:", sorted_vocabulary[1500:2000])

Vocabulary size: 30392
Sample vocabulary terms: ['appalling', 'appanage', 'apparatus', 'apparel', 'apparent', 'apparently', 'apparition', 'appeal', 'appealing', 'appear', 'appearance', 'appease', 'appellant', 'appellate', 'appellation', 'appellee', 'append', 'appendage', 'appendectomy', 'appendice', 'appendicitis', 'appendicular', 'appendix', 'appetite', 'appetitive', 'appetizer', 'applaud', 'applause', 'apple', 'appliance', 'applicability', 'applicable', 'applicant', 'application', 'applicator', 'applied', 'apply', 'appoint', 'appointment', 'apportionment', 'appraisal', 'appraising', 'appreciable', 'appreciably', 'appreciate', 'appreciation', 'appreciative', 'apprehend', 'apprehension', 'apprehensive', 'apprentice', 'apprenticeship', 'approach', 'approaching', 'approbation', 'appropriate', 'appropriately', 'appropriateness', 'appropriation', 'approval', 'approve', 'approvingly', 'approximate', 'approximately', 'approximation', 'apraxia', 'apricot', 'apron', 'apse', 'apt', 'aptitude', 

In [93]:
vocabulary_with_ids = [(idx, term) for idx, term in enumerate(sorted_vocabulary)]
vocabulary_with_ids[:15]

Sample vocabulary terms with IDs (as list of tuples):


[(0, 'aa'),
 (1, 'aal'),
 (2, 'aardvark'),
 (3, 'aardwolf'),
 (4, 'aba'),
 (5, 'abac'),
 (6, 'abaca'),
 (7, 'abacist'),
 (8, 'abacus'),
 (9, 'abalone'),
 (10, 'abampere'),
 (11, 'abandon'),
 (12, 'abandoned'),
 (13, 'abandonment'),
 (14, 'abate')]

In [85]:
merged_df

Unnamed: 0,ARTICLE_ID,TITLE,FULL_TEXT
0,0,Anarchism,political philosophy based voluntary often sta...
1,1,Autism,disorder social interaction verbal communicati...
2,2,Albedo,percentage diffusely reflected sunlight relati...
3,3,A,writing cursive plural aes first letter first ...
4,4,Alabama,state southeastern region united bordered nort...
...,...,...,...
1110,1110,Antacid,antacid substance stomach acidity used relieve...
1111,1111,Antidiarrhoeal,drug drug medication symptomatic relief electr...
1112,1112,Áed mac Cináeda,mac born son mac king brother white chronicle ...
1113,1113,Abdul Hamid I,march sultan empire empire born younger son su...


In [92]:
from collections import Counter
def calculate_tf(document, vocabulary):
    tokens = word_tokenize(document.lower())
    term_frequency = Counter(tokens)
    tf_representation = [(term_id, term_frequency.get(term, 0)) for term_id, term in vocabulary_with_ids]
    
    return tf_representation

document_tf = [(article_id, calculate_tf(full_text, vocabulary)) for article_id, full_text in zip(merged_df['ARTICLE_ID'], merged_df['FULL_TEXT'])]
print(document_tf[0])


TF representation of the first document:
(0, [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 2), (14, 0), (15, 0), (16, 0), (17, 0), (18, 0), (19, 0), (20, 0), (21, 0), (22, 0), (23, 0), (24, 0), (25, 0), (26, 0), (27, 0), (28, 0), (29, 0), (30, 0), (31, 0), (32, 0), (33, 0), (34, 0), (35, 0), (36, 0), (37, 0), (38, 0), (39, 0), (40, 0), (41, 0), (42, 0), (43, 1), (44, 0), (45, 0), (46, 0), (47, 0), (48, 0), (49, 0), (50, 3), (51, 0), (52, 0), (53, 0), (54, 0), (55, 0), (56, 0), (57, 0), (58, 0), (59, 0), (60, 1), (61, 0), (62, 0), (63, 0), (64, 0), (65, 0), (66, 0), (67, 0), (68, 0), (69, 0), (70, 0), (71, 0), (72, 0), (73, 0), (74, 0), (75, 0), (76, 0), (77, 0), (78, 0), (79, 0), (80, 0), (81, 0), (82, 0), (83, 0), (84, 0), (85, 0), (86, 0), (87, 0), (88, 0), (89, 0), (90, 0), (91, 0), (92, 0), (93, 0), (94, 0), (95, 0), (96, 0), (97, 0), (98, 0), (99, 0), (100, 0), (101, 0), (102, 0), (103, 0), (104, 0), (105, 0), (106

In [94]:
print(len(document_tf[0][1]))

30392


In [95]:
document_tf_filtered = [(article_id, [(term_id, count) for term_id, count in tf_repr if count > 0]) for article_id, tf_repr in document_tf]
print(document_tf_filtered[0])

TF representation of the first document:
(0, [(13, 2), (43, 1), (50, 3), (60, 1), (108, 1), (123, 2), (148, 1), (151, 1), (177, 1), (178, 1), (179, 14), (180, 1), (183, 1), (195, 1), (201, 1), (230, 1), (259, 1), (279, 2), (283, 1), (286, 1), (292, 9), (298, 7), (299, 3), (300, 3), (301, 1), (302, 1), (326, 1), (343, 1), (349, 2), (355, 1), (425, 2), (452, 1), (455, 1), (482, 1), (483, 3), (519, 4), (520, 1), (534, 3), (535, 1), (566, 1), (623, 1), (634, 1), (654, 3), (661, 1), (689, 2), (789, 1), (832, 2), (833, 3), (857, 1), (868, 1), (876, 2), (885, 1), (888, 1), (902, 1), (904, 17), (915, 4), (920, 8), (945, 2), (1040, 10), (1049, 1), (1133, 1), (1135, 106), (1136, 152), (1138, 1), (1139, 14), (1256, 1), (1265, 1), (1279, 1), (1285, 1), (1316, 1), (1328, 1), (1329, 2), (1335, 1), (1366, 1), (1427, 1), (1504, 1), (1507, 1), (1563, 1), (1654, 1), (1660, 1), (1672, 1), (1679, 3), (1680, 1), (1711, 1), (1723, 4), (1728, 1), (1729, 5), (1747, 1), (1865, 1), (1867, 1), (1904, 5), (1905, 

In [97]:
print(len(document_tf_filtered[1][1]))

1228


In [99]:
# import numpy as np

# def calculate_idf(document_tf):
    
#     document_frequency = {}
#     for _, tf_repr in document_tf:
#         unique_terms = set(term_id for term_id, _ in tf_repr)
#         for term_id in unique_terms:
#             document_frequency[term_id] = document_frequency.get(term_id, 0) + 1
#     total_documents = len(document_tf)
#     idf_representation = {term_id: np.log10(total_documents / df) for term_id, df in document_frequency.items()}
    
#     return idf_representation


# idf_representation = calculate_idf(document_tf_filtered)

# print("IDF representation for the first few terms:")
# for term_id, idf in list(idf_representation.items())[:10]:
#     print(f"Term ID: {term_id}, IDF: {idf}")
# Function to calculate IDF for each term in the vocabulary
def calculate_idf(document_tf):

    document_frequency = {}
    for _, tf_repr in document_tf:
        unique_terms = set(term_id for term_id, _ in tf_repr)
        for term_id in unique_terms:
            document_frequency[term_id] = document_frequency.get(term_id, 0) + 1
            
    idf_representation = [(term_id, df) for term_id, df in document_frequency.items()]
    return idf_representation

idf_representation = calculate_idf(document_tf)




IDF representation for the first few terms:
Term ID: 0, Number of Documents: 1115
Term ID: 1, Number of Documents: 1115
Term ID: 2, Number of Documents: 1115
Term ID: 3, Number of Documents: 1115
Term ID: 4, Number of Documents: 1115
Term ID: 5, Number of Documents: 1115
Term ID: 6, Number of Documents: 1115
Term ID: 7, Number of Documents: 1115
Term ID: 8, Number of Documents: 1115
Term ID: 9, Number of Documents: 1115


In [100]:
print(idf_representation)

[(0, 1115), (1, 1115), (2, 1115), (3, 1115), (4, 1115), (5, 1115), (6, 1115), (7, 1115), (8, 1115), (9, 1115), (10, 1115), (11, 1115), (12, 1115), (13, 1115), (14, 1115), (15, 1115), (16, 1115), (17, 1115), (18, 1115), (19, 1115), (20, 1115), (21, 1115), (22, 1115), (23, 1115), (24, 1115), (25, 1115), (26, 1115), (27, 1115), (28, 1115), (29, 1115), (30, 1115), (31, 1115), (32, 1115), (33, 1115), (34, 1115), (35, 1115), (36, 1115), (37, 1115), (38, 1115), (39, 1115), (40, 1115), (41, 1115), (42, 1115), (43, 1115), (44, 1115), (45, 1115), (46, 1115), (47, 1115), (48, 1115), (49, 1115), (50, 1115), (51, 1115), (52, 1115), (53, 1115), (54, 1115), (55, 1115), (56, 1115), (57, 1115), (58, 1115), (59, 1115), (60, 1115), (61, 1115), (62, 1115), (63, 1115), (64, 1115), (65, 1115), (66, 1115), (67, 1115), (68, 1115), (69, 1115), (70, 1115), (71, 1115), (72, 1115), (73, 1115), (74, 1115), (75, 1115), (76, 1115), (77, 1115), (78, 1115), (79, 1115), (80, 1115), (81, 1115), (82, 1115), (83, 1115), (

In [103]:
# Function to calculate TF-IDF for each document
def calculate_tfidf(document_tf, idf_representation):
    tfidf_representation = []
    
    # Calculate TF-IDF for each document
    for article_id, tf_repr in document_tf:
        tfidf_repr = [(term_id, tf / idf) for term_id, tf in tf_repr for term_id_, idf in idf_representation if term_id == term_id_]
        tfidf_representation.append((article_id, tfidf_repr))
    
    return tfidf_representation

# Calculate TF-IDF for each document
tfidf_representation = calculate_tfidf(document_tf_filtered, idf_representation)

# Print TF-IDF representation of the first document
print("TF-IDF representation of the first document:")
print(tfidf_representation[0])


TF-IDF representation of the first document:
(0, [(13, 0.0017937219730941704), (43, 0.0008968609865470852), (50, 0.0026905829596412557), (60, 0.0008968609865470852), (108, 0.0008968609865470852), (123, 0.0017937219730941704), (148, 0.0008968609865470852), (151, 0.0008968609865470852), (177, 0.0008968609865470852), (178, 0.0008968609865470852), (179, 0.012556053811659192), (180, 0.0008968609865470852), (183, 0.0008968609865470852), (195, 0.0008968609865470852), (201, 0.0008968609865470852), (230, 0.0008968609865470852), (259, 0.0008968609865470852), (279, 0.0017937219730941704), (283, 0.0008968609865470852), (286, 0.0008968609865470852), (292, 0.008071748878923767), (298, 0.006278026905829596), (299, 0.0026905829596412557), (300, 0.0026905829596412557), (301, 0.0008968609865470852), (302, 0.0008968609865470852), (326, 0.0008968609865470852), (343, 0.0008968609865470852), (349, 0.0017937219730941704), (355, 0.0008968609865470852), (425, 0.0017937219730941704), (452, 0.0008968609865470852