In [1]:
!pip install -q pandas torch scikit-learn
!pip install sentence-transformers transformers
!pip install nltk
    

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.0-py3-none-any.whl (275 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.4.0


In [2]:
import pandas as pd
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm


In [3]:

# File paths for your Parquet files
file_paths = [
    '/kaggle/input/wikipedia-20230701/a.parquet',
    '/kaggle/input/wikipedia-20230701//b.parquet',
    '/kaggle/input/wikipedia-20230701/c.parquet'
]

# Read each file and store them in a list of DataFrames
dfs = [pd.read_parquet(file) for file in file_paths]

# Concatenate the DataFrames into one
df = pd.concat(dfs, ignore_index=True)

# Display the first few rows
print(df.head())

# Check the columns
print(df.columns)


         id                            title  \
0  49495844  A & B High Performance Firearms   
1   3579086                      A & C Black   
2  62397582            A & F Harvey Brothers   
3  15547032                      A & G Price   
4   8021609               A & M Karagheusian   

                                                text  \
0  A & B High Performance Firearms was a competit...   
1  A & C Black is a British book publishing compa...   
2  A & F Harvey Brothers, first Spinning Cotton M...   
3  A & G Price Limited is an engineering firm and...   
4  thumb|right|238px|A portion of the Karagheusia...   

                                          categories  
0  [Defunct firearms manufacturers, Defunct manuf...  
1  [Encyclopædia Britannica, Ornithological publi...  
2                                     [Cotton mills]  
3  [Locomotive manufacturers of New Zealand, Tham...  
4  [1904 establishments in the United States, Arm...  
Index(['id', 'title', 'text', 'categories'],

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
# Get the list of stop words
stop_words = set(stopwords.words('english'))


# Combine the fields into one text column
def combine_fields(row):
    categories = ', '.join(row['categories'])  # Convert list to string
    
    # Tokenize the text
    words = word_tokenize(row["text"])
    
    # Remove stop words
    filtered_text = ' '.join(list(filter(lambda word: word.lower() not in stop_words, words)))   
    
    return f"Title: {row['title']}. Categories: {categories}. Text: {filtered_text}"
    # return f"Title: {row['title']} Category: {row['categories']}"


# Batch processing function
def batch_process(df, batch_size=100):
    combined_results = []
    
    # Calculate the number of batches
    num_batches = (len(df) // batch_size) + (1 if len(df) % batch_size != 0 else 0)
    
    # Process each batch with tqdm for progress tracking
    for batch_num in tqdm(range(num_batches), desc="Processing Batches"):
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(df))
        
        # Get the current batch
        batch = df.iloc[start_idx:end_idx]
        
        # Apply the function to the batch
        batch_results = batch.apply(combine_fields, axis=1)
        
        # Append the results
        combined_results.extend(batch_results)
    
    return combined_results

batch_size = 100

# Apply the function to create a 'combined' column
tqdm.pandas()
df['combined'] = batch_process(df, batch_size=batch_size)
# 
# df['combined'] = df.progress_apply(combine_fields, axis=1)

# Preview the combined text
print(df['combined'].head())

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processing Batches:   0%|          | 0/11840 [00:00<?, ?it/s]

0    Title: A & B High Performance Firearms. Catego...
1    Title: A & C Black. Categories: Encyclopædia B...
2    Title: A & F Harvey Brothers. Categories: Cott...
3    Title: A & G Price. Categories: Locomotive man...
4    Title: A & M Karagheusian. Categories: 1904 es...
Name: combined, dtype: object


In [5]:
import gc

# Drop unnecessary columns
df = df.drop(columns=['title', 'text', 'categories'])

# Force garbage collection
gc.collect()

# Optionally reset index to clean up any lingering references
df = df.reset_index(drop=True)

# Check DataFrame memory usage
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1183906 entries, 0 to 1183905
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   id        1183906 non-null  object
 1   combined  1183906 non-null  object
dtypes: object(2)
memory usage: 18.1+ MB
None


In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from tqdm import tqdm
tqdm.pandas(disable=True)
# Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight & effective

# Convert the combined text column to a list
text_list = df['combined'].tolist()

# Initialize a list to store embeddings
embeddings = []

# Define batch size
batch_size = 512
tqdm.pandas(disable=True)

# Use tqdm to visualize progress during batch processing
for i in tqdm(range(0, len(text_list), batch_size), desc="Generating Embeddings"):
    # Process a batch of text
    batch = text_list[i:i + batch_size]
    
    # Generate embeddings for the current batch
    batch_embeddings = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
    
    # Append the batch embeddings to the main list
    embeddings.extend(batch_embeddings)

# Normalize embeddings for better similarity scores
embedding_matrix = normalize(embeddings)

# Add normalized embeddings to the DataFrame
df['embeddings'] = list(embedding_matrix)  # Convert 2D array to a list of 1D arrays for Pandas compatibility

# Print the first few rows and verify an embedding's shape
print(df.head())
print(df['embeddings'].iloc[0].shape)  # Should print (384,) for each embedding


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating Embeddings: 100%|██████████| 2313/2313 [43:46<00:00,  1.14s/it]


         id                                           combined  \
0  49495844  Title: A & B High Performance Firearms. Catego...   
1   3579086  Title: A & C Black. Categories: Encyclopædia B...   
2  62397582  Title: A & F Harvey Brothers. Categories: Cott...   
3  15547032  Title: A & G Price. Categories: Locomotive man...   
4   8021609  Title: A & M Karagheusian. Categories: 1904 es...   

                                          embeddings  
0  [0.005076993220085449, 0.014148154193647369, -...  
1  [-0.020476288057262163, -0.09883869838271596, ...  
2  [-0.08427029833660497, -0.01244955600900288, -...  
3  [-0.10381282871141384, -0.01448750310363473, 0...  
4  [-0.053385633809100275, -0.017941486034577337,...  
(384,)


In [7]:
print(df.columns)


Index(['id', 'combined', 'embeddings'], dtype='object')


In [8]:
df.head()

Unnamed: 0,id,combined,embeddings
0,49495844,Title: A & B High Performance Firearms. Catego...,"[0.005076993220085449, 0.014148154193647369, -..."
1,3579086,Title: A & C Black. Categories: Encyclopædia B...,"[-0.020476288057262163, -0.09883869838271596, ..."
2,62397582,Title: A & F Harvey Brothers. Categories: Cott...,"[-0.08427029833660497, -0.01244955600900288, -..."
3,15547032,Title: A & G Price. Categories: Locomotive man...,"[-0.10381282871141384, -0.01448750310363473, 0..."
4,8021609,Title: A & M Karagheusian. Categories: 1904 es...,"[-0.053385633809100275, -0.017941486034577337,..."


In [9]:
# # Check variance of embeddings
print(f"Variance of Embeddings: {np.var(embedding_matrix, axis=0).mean()}")


Variance of Embeddings: 0.0024311788196501563


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_articles(input_title, top_n=10):
    if input_title not in df['combined'].values:
        return "Article not found."
    
    input_idx = df.index[df['combined'] == input_title].item()
    input_embedding = embedding_matrix[input_idx]
    
    # Compute cosine similarity
    similarities = cosine_similarity([input_embedding], embedding_matrix).flatten()
    
    # Exclude the input article and get top matches
    sorted_indices = np.argsort(similarities)[::-1]
    valid_indices = [i for i in sorted_indices if i != input_idx][:top_n]
    
    return df.iloc[valid_indices]





In [15]:
# Test the recommendation system
from random import randint
for att in range(10):
    print('='*25, "sample: ", att, '='*25 )
    input_article = df['combined'][randint(0, len(df))]
    recommendations = recommend_articles(input_article)
    # print(recommendations)
    
    
    # Display recommendations
    print("input:  ", input_article.split("Categories:")[0].replace("Title: ", ""))
    for idx, i in (zip(recommendations["id"], recommendations["combined"])):
        print(idx, " : ", i.split("Categories:")[0].replace("Title: ", ""))

input:   Acrocercops didymella. 
29525464  :  Acrocercops plebeia. 
29512977  :  Acrocercops doloploca. 
29512907  :  Acrocercops irrorata. 
29513023  :  Acrocercops grammatacma. 
29525434  :  Acrocercops trapezoides. 
29525510  :  Acrocercops plectospila. 
29525158  :  Acrocercops leucomochla. 
29484246  :  Acrocercops albomaculella. 
29525532  :  Acrocercops spodophylla. 
29512960  :  Acrocercops crucigera. 
input:   Arnica cernua. 
19779845  :  Arnica spathulata. 
19779735  :  Arnica sororia. 
637284  :  Arnica. 
19735580  :  Arnica latifolia. 
46779050  :  Arnica gracilis. 
35694499  :  Arnica chamissonis. 
19379141  :  Arctostaphylos canescens. 
19779979  :  Arnica venosa. 
19735997  :  Arnica nevadensis. 
21592673  :  Ceanothus pumilus. 
input:   Bloom (store). 
36310361  :  Belle Foods. 
843787  :  Bloomingdale's. 
7394022  :  Bloom's restaurant. 
41179568  :  Best Market. 
3546474  :  Bloomin' Brands. 
23220202  :  Bloom Brothers Department Stores. 
2002297  :  Brookshire's Foo

In [11]:
from random import randint
print(len(df))
df['combined'][randint(0, len(df))]

1183906


"Title: Ahmed Mohamed Ahmed. Categories: 1980 births, Living people, Somalian footballers, Somalian football managers, Somalian expatriate sportspeople in Switzerland, Expatriate soccer coaches in the United States Virgin Islands, Expatriate footballers in Switzerland, FC Thun players, Somalia men's international footballers, Men's association football defenders, Expatriate footballers in Barbados, Somalian expatriate football managers, Somalian expatriate sportspeople in Barbados, Somalian expatriate sportspeople in the United States Virgin Islands, Place of birth missing (living people). Text: Ahmed Mohamed Ahmed also known Ahmed Gaab ( Arabic : أحمد محمد أحمد ; born 22 February 1980 ) Somali football coach former player . ==Coaching career== starting young age several positions Somali Football Federation 's observation department guidance then- Federation President Abdegani Saeed Arab , hired coaching knowledge football , became head coach Somalia national beach soccer team technica