In [1]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import time

# Load abstracts DataFrame
df = pd.read_csv(filepath_or_buffer="data/abstracts.csv", index_col=0)

[nltk_data] Downloading package punkt to /home/icarrera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/icarrera/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import re
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# Text cleaning function
def clean_text(text):
    # Remove special characters and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()    
    # Remove leading and trailing white spaces
    text = text.strip()
    # Replace multiple spaces with a single space
    text = re.sub('\s+', ' ', text)
    # Tokenize text
    words = word_tokenize(text)
    # Eliminate stopwords y stem words
    stop_words = set(stopwords.words('english'))
    stemmed_words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join words into sentences
    cleaned_text = ' '.join(stemmed_words)
    return cleaned_text


In [3]:
# Preprocess abstracts
df.insert(1, "abstract_pr", df['abstract'].apply(clean_text))

In [4]:
df

Unnamed: 0,index,abstract_pr,cell_line,pubmedid,title,abstract
0,0,tripl neg breast cancer tnbc highli aggress su...,CVCL_0006,35715860,Tumor-associated macrophages promote epithelia...,Triple-negative breast cancer (TNBC) is a high...
1,1,thp repres leukemia cell line regist four diff...,CVCL_0006,35373342,THP-1 reference data: Proposal of an in vitro ...,THP-1 is a representative leukemia cell line a...
2,2,pattern hydroxi n trichlorophenyl naphthamid t...,CVCL_0006,35745634,Antistaphylococcal Activities and ADME-Related...,"Pattern 1-hydroxy-N-(2,4,5-trichlorophenyl)-2-..."
3,3,seri eleven benzyl intermedi eleven target com...,CVCL_0006,36232947,Study of Biological Activities and ADMET-Relat...,A series of eleven benzylated intermediates an...
4,4,seri thirti two anilid trifluoromethyl cinnam ...,CVCL_0006,36499415,Trifluoromethylcinnamanilide Michael Acceptors...,A series of thirty-two anilides of 3-(trifluor...
...,...,...,...,...,...,...
266785,8,crispr cluster regularli interspac short palin...,CVCL_ZZ97,33969526,In vivo CRISPR screening for novel noncoding R...,CRISPR (clustered regularly interspaced short ...
266786,9,traumat injuri often result axon sever initi o...,CVCL_ZZ97,29577375,Ca2+/calmodulin-dependent protein kinase II an...,Traumatic injury often results in axonal sever...
266787,10,oxid stress implic varieti neurodegen disord a...,CVCL_ZZ97,32954502,Noradrenaline protects neurons against H<sub>2...,Oxidative stress has been implicated in a vari...
266788,11,primari blast injuri caus direct impact overpr...,CVCL_ZZ97,36200530,Development of a novel bioengineered 3D brain-...,Primary blast injury is caused by the direct i...


## Tf-idf 1 (Term frequency – Inverse document frequency)

In [6]:
vectorizer = TfidfVectorizer()
# Apply TF-IDF to "abstract_pr"
tfidf_matrix = vectorizer.fit_transform(df['abstract_pr'])
# Extract features
features = vectorizer.get_feature_names_out()
# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

## Tf-idf 2 (Term frequency – Inverse document frequency)

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Create an instance of TfidfVectorizer /Crear una instancia de TfidfVectorizer
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the 'abstract_p' column/# Ajustar el vectorizador en la columna 'abstract_p
vectorizer.fit(df['abstract_pr'])
# Transform the 'abstract_p' column into TF-IDF features / Transformar la columna 'abstract_p' en características TF-IDF
tfidf_matrix = vectorizer.transform(df['abstract_pr'])
# Convert the TF-IDF matrix to a DataFrame / Convertir la matriz TF-IDF en un DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

In [6]:
tfidf_df

Unnamed: 0,aa,aaa,aaaf,aaatcaatatt,aab,aabcl,aac,aacocf,aacr,aacrjourn,...,zymosan,zynaxi,zyx,zyxin,zz,zzr,zzui,zzuneui,zzusahi,zzw
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## REDUCCIÓN DE DIMENSIONES CON PCA

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

 # Step 2: Reduce dimensions using PCA
pca = PCA(n_components=5)  # Specify the number of desired dimensions
tfidf_pca = pca.fit_transform(tfidf_matrix.toarray())

# Create a new DataFrame with the PCA results
df_pca = pd.DataFrame(data=tfidf_pca)

# Print the final DataFrame with TF-IDF and PCA results
print(df_pca)

               0         1         2         3         4
0       0.057996 -0.001682 -0.017167  0.018858  0.038214
1       0.019032  0.013439  0.004107  0.012282 -0.014082
2       0.005033  0.033706 -0.002056  0.030673 -0.024185
3       0.006142  0.043782 -0.021622  0.018907 -0.012791
4      -0.006364  0.057346 -0.015441  0.029560 -0.012861
...          ...       ...       ...       ...       ...
266785  0.120181  0.280024 -0.166574  0.146169 -0.292266
266786 -0.501375 -0.412417 -0.129923 -0.078504 -0.073478
266787 -0.162225  0.164514  0.423831  0.029649  0.103834
266788 -0.112572  0.187052  0.314679 -0.272013  0.093490
266789 -0.059619  0.149818 -0.087470  0.015213  0.187121

[266790 rows x 5 columns]


In [8]:
# Concatenate the PCA results with the original DataFrame
df_final = pd.concat([df['cell_line'], df_pca], axis=1)
df_final

Unnamed: 0,cell_line,0,1,2,3,4
0,CVCL_0006,0.057996,-0.001682,-0.017167,0.018858,0.038214
1,CVCL_0006,0.019032,0.013439,0.004107,0.012282,-0.014082
2,CVCL_0006,0.005033,0.033706,-0.002056,0.030673,-0.024185
3,CVCL_0006,0.006142,0.043782,-0.021622,0.018907,-0.012791
4,CVCL_0006,-0.006364,0.057346,-0.015441,0.029560,-0.012861
...,...,...,...,...,...,...
266785,CVCL_ZZ97,0.120181,0.280024,-0.166574,0.146169,-0.292266
266786,CVCL_ZZ97,-0.501375,-0.412417,-0.129923,-0.078504,-0.073478
266787,CVCL_ZZ97,-0.162225,0.164514,0.423831,0.029649,0.103834
266788,CVCL_ZZ97,-0.112572,0.187052,0.314679,-0.272013,0.093490


## SVDD APLICANDO PCA

In [9]:
# aplicando pca
from sklearn.svm import OneClassSVM
import numpy as np

# Step 3: Apply SVDD for outlier detection and subject identification

subjects = df['cell_line'].unique()

subject_centers_pca = {}
subject_radii_pca = {}

for subject in subjects:
    subject_indices = df[df['cell_line'] == subject].index

    # Extract TF-IDF vectors for the subject's texts
    subject_tfidf_matrix_pca = tfidf_pca[subject_indices]

    # Apply SVDD (OneClassSVM) for the subject's texts
    model = OneClassSVM(kernel='rbf', gamma='scale')
    model.fit(subject_tfidf_matrix_pca)

    # Filter inliers for the subject
    inlier_indices = subject_indices[model.predict(subject_tfidf_matrix_pca) == 1]
    inliers_df = df.loc[inlier_indices]

    # Calculate the center of the vectors for the subject
    subject_center_pca = np.mean(subject_tfidf_matrix_pca, axis=0)
    subject_centers_pca[subject] = subject_center_pca

    # Calculate the radius of the sphere that describes the subject
    subject_radius_pca = np.max(np.linalg.norm(subject_tfidf_matrix_pca - subject_center_pca, axis=1))
    subject_radii_pca[subject] = subject_radius_pca

# Filter the DataFrame to include only the inliers
df_inliers = df.loc[inliers_df.index]

## CENTROIDES CON PCA

In [10]:
import pandas as pd
import numpy as np

# Crear una lista para almacenar los resultados
result_data = []

# Llenar la lista con los resultados
for subject in subjects:
    center = np.squeeze(subject_centers_pca[subject])
    radius = np.squeeze(subject_radii_pca[subject])
    result_data.append({'cell_line': subject, 'center': center, 'radius': radius})

# Crear el dataframe a partir de la lista de resultados
result_df_pca = pd.DataFrame(result_data)

# Imprimir el dataframe de resultados
result_df_pca


Unnamed: 0,cell_line,center,radius
0,CVCL_0006,"[0.008978779111048439, 0.023425353761767027, -...",0.070312
1,CVCL_0024,"[0.02337680012648178, -0.011364185264599026, -...",0.052139
2,CVCL_0043,"[0.011487527481506488, 0.025402494463191438, -...",0.043485
3,CVCL_0046,"[0.015073659155474158, 0.014286755609635382, 0...",0.053614
4,CVCL_0067,"[0.014743355613286553, 0.010479769992344644, 0...",0.074357
...,...,...,...
21839,CVCL_ZZ76,"[0.013231947428114926, -0.05494187680642152, -...",0.894384
21840,CVCL_ZZ79,"[-0.0006030672469505027, -0.010575210762174872...",0.889204
21841,CVCL_ZZ83,"[-0.006925923016179529, -0.0005921389329829629...",0.892903
21842,CVCL_ZZ92,"[0.00901969607112853, -0.023780498723534913, -...",0.872042


## MATRIZ DISTANCIA CON PCA

In [11]:
import numpy as np
from scipy.spatial.distance import euclidean

n = len(result_data)  # Number of observations
distance_matrix = np.zeros((n, n))  # Initialize an empty distance matrix

# Calculate pairwise distances using a for loop
for i in range(n):
    for j in range(i + 1, n):
        center_i = np.asarray(result_data[i]['center'])
        center_j = np.asarray(result_data[j]['center'])
        distance = euclidean(center_i.flatten(), center_j.flatten())                  # Euclidean distance between flattened 'center' values
        distance_matrix[i, j] = distance
        distance_matrix[j, i] = distance

# Print the distance matrix
print(distance_matrix)


[[0.00000000e+00 4.22834569e-02 9.39595392e-03 ... 3.59178986e-02
  5.18959098e-02 3.59178986e-02]
 [4.22834569e-02 0.00000000e+00 4.17688219e-02 ... 3.29348369e-02
  3.68771553e-02 3.29348369e-02]
 [9.39595392e-03 4.17688219e-02 0.00000000e+00 ... 3.58269400e-02
  5.28737856e-02 3.58269400e-02]
 ...
 [3.59178986e-02 3.29348369e-02 3.58269400e-02 ... 0.00000000e+00
  4.38874887e-02 9.38804520e-17]
 [5.18959098e-02 3.68771553e-02 5.28737856e-02 ... 4.38874887e-02
  0.00000000e+00 4.38874887e-02]
 [3.59178986e-02 3.29348369e-02 3.58269400e-02 ... 9.38804520e-17
  4.38874887e-02 0.00000000e+00]]


In [12]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

# Compute the linkage matrix
Z = linkage(distance_matrix, method='average')

# Plot the dendrogram
plt.figure(figsize=(16, 9))
dendrogram(Z)
plt.xlabel('Subjects')
plt.ylabel('Distance')
plt.title('Dendrogram')
plt.show()


  Z = linkage(distance_matrix, method='average')


RecursionError: maximum recursion depth exceeded while getting the str of an object

<Figure size 1600x900 with 0 Axes>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

# Compute the linkage matrix
Z = linkage(distance_matrix, method='average')

# Plot the dendrogram
plt.figure(figsize=(1, 6))
dendrogram(Z)
plt.xlabel('Subjects')
plt.ylabel('Distance')
plt.title('Dendrogram')
plt.show()


  Z = linkage(distance_matrix, method='average')
