# Read DATA

## Librerías

In [1]:
import pandas as pd
import numpy as np
import zipfile
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.svm import OneClassSVM
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial import distance

In [2]:
# Read abstracts.csv straight out of the zip archive, using the first CSV
# column as the DataFrame index.
file = 'data/abstracts.zip'
with zipfile.ZipFile(file) as zf, zf.open('abstracts.csv') as f:
    df_r = pd.read_csv(f, index_col=0)

In [3]:
# Report how many rows the raw table has, then show its column names.
numero_filas = len(df_r)
print("Papers", numero_filas)
df_r.columns

Papers 113008


Index(['cell_line', 'pubmedid', 'title', 'abstract'], dtype='object')

In [4]:
def get_cell_lines(dataframe, num):
    """Keep only the rows that belong to the first ``num`` distinct cell lines.

    Cell lines are taken in order of first appearance in the ``cell_line``
    column. The returned frame has a fresh 0..n-1 index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with a ``cell_line`` column.
    num : int
        Number of distinct cell lines to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows, re-indexed from zero.
    """
    selected = dataframe['cell_line'].unique()[:num]
    keep_mask = dataframe['cell_line'].isin(selected)
    return dataframe[keep_mask].reset_index(drop=True)

In [5]:
# Limit the working dataset to the first 25 distinct cell lines in df_r.
df = get_cell_lines(dataframe=df_r, num=25)
print(df)

     cell_line  pubmedid                                              title  \
0    CVCL_0028  33040078  Splicing factor SF3B1 promotes endometrial can...   
1    CVCL_0028  34476599  Sirtuin 2 promotes cell stemness and MEK/ERK s...   
2    CVCL_0028  35401936  Role of the prorenin receptor in endometrial c...   
3    CVCL_0028  32431202  NLRC5 promotes cell migration and invasion by ...   
4    CVCL_0028  24526410  GRP78 mediates cell growth and invasiveness in...   
..         ...       ...                                                ...   
290  CVCL_0197  28745235  Effects of Ascorbic Acid on Tax, NF-κB and MMP...   
291  CVCL_0197  12850476  Localization of HTLV-I tax proviral DNA in mon...   
292  CVCL_0197   1373751  Human T lymphotropic virus types I- and II-spe...   
293  CVCL_0197   2398525  Molecular cloning and characterization of a cD...   
294  CVCL_0197   1380895  Induction of strong homotypic adhesion in huma...   

                                              abstr

## Preprocesamiento

In [10]:
def preprocess_text(text):
    """Normalise a raw abstract into a space-separated string of lemmas.

    Steps, in order: lower-case the text, split it into ``\\w+`` tokens,
    drop English stop words, lemmatise the remaining tokens with WordNet,
    and finally drop every token that starts with a digit.

    Parameters
    ----------
    text : str or missing value
        The raw text. Any non-string input (for example the ``NaN`` that
        pandas uses for an empty CSV cell, or ``None``) is treated as an
        empty document instead of raising ``AttributeError``.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``''`` for missing
        or non-string input.
    """
    # Missing abstracts are floats (NaN) or None and have no .lower();
    # map them to an empty document so a column-wide .apply() cannot crash.
    if not isinstance(text, str):
        return ''

    # Build the NLTK helpers once and reuse them on later calls: loading the
    # stop-word list and constructing the lemmatizer per row is pure overhead.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = (
            RegexpTokenizer(r'\w+'),
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
        preprocess_text._resources = resources
    tokenizer, stop_words, lemmatizer = resources

    tokens = tokenizer.tokenize(text.lower())
    # Stop words are matched against the surface form, before lemmatisation.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # re.match anchors at the start, so this removes any token that *begins*
    # with a digit (pure numbers as well as tokens such as "3t3").
    tokens = [word for word in tokens if not re.match(r'\d+', word)]
    return ' '.join(tokens)

# Store the preprocessed version of every abstract in a new 'abstract_p' column.
df['abstract_p'] = df['abstract'].apply(preprocess_text)
# Disabled export of the frame to CSV, kept for reference:
#df.to_csv('data/abstracts.csv', index=False)
print(df)

     cell_line  pubmedid                                              title  \
0    CVCL_0028  33040078  Splicing factor SF3B1 promotes endometrial can...   
1    CVCL_0028  34476599  Sirtuin 2 promotes cell stemness and MEK/ERK s...   
2    CVCL_0028  35401936  Role of the prorenin receptor in endometrial c...   
3    CVCL_0028  32431202  NLRC5 promotes cell migration and invasion by ...   
4    CVCL_0028  24526410  GRP78 mediates cell growth and invasiveness in...   
..         ...       ...                                                ...   
290  CVCL_0197  28745235  Effects of Ascorbic Acid on Tax, NF-κB and MMP...   
291  CVCL_0197  12850476  Localization of HTLV-I tax proviral DNA in mon...   
292  CVCL_0197   1373751  Human T lymphotropic virus types I- and II-spe...   
293  CVCL_0197   2398525  Molecular cloning and characterization of a cD...   
294  CVCL_0197   1380895  Induction of strong homotypic adhesion in huma...   

                                              abstr

In [11]:
# Quick sanity check of preprocess_text on a single inflected word.
a = "plays"
print(preprocess_text(a))

play


In [12]:
# Count how many distinct cell lines are present in the working subset.
unique_values = df['cell_line'].unique()
print(len(unique_values))

25


## TF-IDF

In [13]:
# Fit a TF-IDF model (default settings) on the preprocessed abstracts.
# Documents are passed in df row order, so row i of X_tfidf describes row i of df.
vectorizer = TfidfVectorizer()
corpus = df['abstract_p'].tolist()
X_tfidf = vectorizer.fit_transform(corpus)

## SVDD

In [15]:
# Summarise every cell line as a sphere in TF-IDF space:
#   centre = mean of the TF-IDF rows of its abstracts,
#   radius = mean Euclidean distance from those rows to the centre.
# NOTE(review): despite the section title, no SVDD optimisation or
# OneClassSVM fit happens in this cell; it is a centroid / mean-distance summary.
category_centroids = {}
category_radii = {}

for category in df['cell_line'].unique():
    # All TF-IDF rows of this cell line (selected with a boolean mask);
    # these are every document of the class, not SVM support vectors.
    support_vectors = X_tfidf[df['cell_line'] == category]

    # NOTE(review): X_tfidf is presumably a SciPy sparse matrix, in which case
    # this mean is a dense 1 x n_features np.matrix rather than a 1-D array — confirm.
    centroid = np.mean(support_vectors, axis=0)
    category_centroids[category] = centroid

    # One Euclidean distance per document, measured to the class centre.
    distances = np.linalg.norm(support_vectors - centroid, axis=1)
    
    # The radius is the mean (not the maximum) distance, so member documents
    # that are farther than average lie outside their own sphere.
    radius = np.mean(distances)
    category_radii[category] = radius

# Print the centre vector and radius computed for each cell line.
for category, centroid in category_centroids.items():
    print(f"cell_line: {category}")
    print(f"Centro: {centroid}")
    print(f"Radio: {category_radii[category]}")
    print()

cell_line: CVCL_0028
Centro: [[0.         0.         0.         ... 0.         0.0044869  0.00701365]]
Radio: 0.9222394183005846

cell_line: CVCL_0080
Centro: [[0.         0.         0.         ... 0.         0.00642791 0.00685484]]
Radio: 0.9316592953966272

cell_line: CVCL_0081
Centro: [[0. 0. 0. ... 0. 0. 0.]]
Radio: 0.8949298686218217

cell_line: CVCL_0082
Centro: [[0. 0. 0. ... 0. 0. 0.]]
Radio: 0.8791341246784244

cell_line: CVCL_0107
Centro: [[0.         0.00663711 0.         ... 0.         0.         0.00363691]]
Radio: 0.9197991751784799

cell_line: CVCL_0110
Centro: [[0.         0.         0.00333432 ... 0.         0.         0.        ]]
Radio: 0.9321831124533062

cell_line: CVCL_0113
Centro: [[0. 0. 0. ... 0. 0. 0.]]
Radio: 0.8970772778178572

cell_line: CVCL_0115
Centro: [[0. 0. 0. ... 0. 0. 0.]]
Radio: 0.9101104428370347

cell_line: CVCL_0128
Centro: [[0. 0. 0. ... 0. 0. 0.]]
Radio: 0.840030033601982

cell_line: CVCL_0135
Centro: [[0.         0.         0.         ... 0. 

In [16]:
# Collect every ordered pair (category1, category2) for which the centre of
# category1 lies within the sphere of category2.
# NOTE(review): the test is "centre 1 inside sphere 2" (distance <= radius 2).
# It is neither a sphere-overlap test (distance <= r1 + r2) nor a full
# containment test (distance + r1 <= r2) — confirm which one is intended.
categorias_solapadas = []

for category1, centroid1 in category_centroids.items():
    for category2, centroid2 in category_centroids.items():
        # A category is never compared with itself.
        if category1 == category2:
            continue

        # Euclidean distance between the two sphere centres.
        distancia = np.linalg.norm(centroid1 - centroid2)

        # Skip the pair unless centre 1 falls within the radius of sphere 2.
        if not distancia <= category_radii[category2]:
            continue

        categorias_solapadas.append((category1, category2))

# Print the collected pairs.
for categoria1, categoria2 in categorias_solapadas:
    print(f"{categoria1} está dentro de {categoria2}")

CVCL_0028 está dentro de CVCL_0080
CVCL_0028 está dentro de CVCL_0081
CVCL_0028 está dentro de CVCL_0082
CVCL_0028 está dentro de CVCL_0107
CVCL_0028 está dentro de CVCL_0110
CVCL_0028 está dentro de CVCL_0113
CVCL_0028 está dentro de CVCL_0115
CVCL_0028 está dentro de CVCL_0128
CVCL_0028 está dentro de CVCL_0135
CVCL_0028 está dentro de CVCL_0136
CVCL_0028 está dentro de CVCL_0138
CVCL_0028 está dentro de CVCL_0141
CVCL_0028 está dentro de CVCL_0144
CVCL_0028 está dentro de CVCL_0153
CVCL_0028 está dentro de CVCL_0164
CVCL_0028 está dentro de CVCL_0165
CVCL_0028 está dentro de CVCL_0171
CVCL_0028 está dentro de CVCL_0181
CVCL_0028 está dentro de CVCL_0187
CVCL_0028 está dentro de CVCL_0193
CVCL_0028 está dentro de CVCL_0195
CVCL_0028 está dentro de CVCL_0196
CVCL_0028 está dentro de CVCL_0197
CVCL_0080 está dentro de CVCL_0028
CVCL_0080 está dentro de CVCL_0081
CVCL_0080 está dentro de CVCL_0082
CVCL_0080 está dentro de CVCL_0107
CVCL_0080 está dentro de CVCL_0110
CVCL_0080 está dentr