In [None]:
import pandas as pd
import os
from typing import List
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.decomposition import PCA
import numpy as np
from matplotlib.colors import ListedColormap
from imblearn.over_sampling import SMOTE
import seaborn as sns
import re


In [None]:
# Location of the Gene Ontology OBO file. Set the GO_OBO_PATH environment
# variable to use a copy stored elsewhere; the original local path stays the
# default so existing setups keep working.
go_file = os.environ.get(
    "GO_OBO_PATH",
    "/Users/kajolpatel/Desktop/Individual_Project/poc/go-basic.obo",
)

In [None]:
def parse_obo_file(file_path):
    """Parse the ``[Term]`` stanzas of an OBO file into a DataFrame.

    Each ``[Term]`` stanza becomes one row. Every ``key: value`` line inside
    a stanza becomes a column; a key that occurs more than once in the same
    stanza is stored as a list of its values in file order. Lines outside a
    ``[Term]`` stanza (the file header and any other stanza type) are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Path of the OBO file to read.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty ``[Term]`` stanza; cells are strings, lists of
        strings, or missing where a term lacks that key.
    """
    data = []
    current_term = {}
    in_term_block = False

    # Be explicit about the encoding so parsing does not depend on the
    # platform's default (OBO files are UTF-8).
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line == '[Term]':  # starting a new term block
                if current_term:
                    data.append(current_term)
                current_term = {}
                in_term_block = True
            elif line == '':
                in_term_block = False  # a blank line ends the current stanza
            elif in_term_block and ': ' in line:
                key, value = line.split(': ', 1)
                if key not in current_term:
                    current_term[key] = value
                elif isinstance(current_term[key], list):
                    current_term[key].append(value)
                else:
                    # Second occurrence of this key: promote the scalar to a list.
                    current_term[key] = [current_term[key], value]

    if current_term:  # flush the final term, which has no following [Term] line
        data.append(current_term)

    return pd.DataFrame(data)

# Load the ontology: one DataFrame row per [Term] stanza in the OBO file.
df = parse_obo_file(go_file)


In [None]:
# Expose the OBO `def` tag under the more readable column name `definition`.
df = df.rename({'def': 'definition'}, axis='columns')

In [None]:
# (rows, columns) of the parsed terms frame.
df.shape

In [None]:
# Preview the first three parsed terms.
df.head(3)

In [None]:
# is_a value of the first term in the frame.
df['is_a'].iloc[0]

In [None]:
# Put pandas' row and column display limits back to their defaults.
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')


In [None]:
# Shape of the terms that have an is_a value, then of those that have none.
print(df[df['is_a'].notna()].shape)
print(df[df['is_a'].isna()].shape)

Total records = 47856

42200 records have is_a relationship present

5656 records do not have is_a relationship present

#### Excluding the records which do not have is_a

In [None]:
# Keep only the terms whose is_a field is populated.
df = df[~df['is_a'].isna()]

#### Checking how many unique values is_a has

In [None]:
# Expand list-valued is_a cells so each entry gets its own row.
exploded_df = df.explode(column='is_a')


In [None]:
# How often each is_a value occurs across all terms.
exploded_df['is_a'].value_counts()

In [None]:
# Distinct non-missing is_a values, in order of first appearance.
unique_is_a = exploded_df.loc[exploded_df['is_a'].notna(), 'is_a'].unique()

In [None]:
# Number of distinct is_a values.
len(unique_is_a)

In [None]:
# Raise the row display limit so the full value_counts() table is shown untruncated.
pd.set_option('display.max_rows', len(exploded_df['is_a'].value_counts()))

In [None]:
# Same counts as earlier, now with display.max_rows raised to the number of distinct values.
exploded_df['is_a'].value_counts()

15547 unique is_a values

In [None]:
# Do not truncate long cell values, and leave the overall display width unset (None).
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

#### Checking if a GO term can have more than 9 is_a values

In [None]:
# Terms whose is_a field is a list holding more than nine entries.
df[df['is_a'].apply(lambda parents: isinstance(parents, list) and len(parents) > 9)]

#### Converting the is_a values to contain only GO term IDs, dropping the term names :)

In [None]:
def extract_go_terms(s):
    """Reduce raw ``is_a`` value(s) to bare GO identifiers.

    Parameters
    ----------
    s : str or list of str
        Raw ``is_a`` text such as ``'GO:0048308 ! organelle inheritance'``,
        or a list of such strings.

    Returns
    -------
    list of str or str
        For a list input, a flat list of every ``GO:`` + seven-digit id found
        across its items, in order. For a string input, the id itself when
        exactly one is found; otherwise the list of matches (empty when the
        string contains no id).
    """
    if isinstance(s, list):
        return [go_id for item in s for go_id in re.findall(r'GO:\d{7}', item)]

    go_terms = re.findall(r'GO:\d{7}', s)
    # A single parent stays a plain string; zero or several matches are
    # returned as a list so a missing id never raises IndexError.
    return go_terms[0] if len(go_terms) == 1 else go_terms

In [None]:
# Peek at the first two rows.
df.head(2)

In [None]:
# Narrow the frame to the term id, its definition text, and its is_a links.
df = df[['id','definition','is_a']]

In [None]:
# Replace each raw is_a value with its bare GO id(s). `assign` builds a new
# frame instead of writing through `.loc` into one derived from earlier
# slices, which avoids SettingWithCopyWarning.
df = df.assign(is_a=df['is_a'].apply(extract_go_terms))