# The_cell

In [2]:
# Define My Requirements
# The app should be able to:

# Take a user input for a protein name.
# Fetch 3D structure information from a reliable source (like the RCSB PDB or NCBI).
# Retrieve additional details from Wikipedia or NCBI.

In [None]:
# Set Up Your Environment
#Make sure you have Python installed along with any necessary libraries. You may need:

# requests for making API calls.
# json for handling JSON data.
# py3Dmol or matplotlib for 3D visualization (if applicable).

In [8]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [234]:
# Function to call user input

def get_user_input():
    return input("Enter the protein name (or type 'exit' to quit): ")


In [123]:
protein_input = get_user_input()

Enter the name of the cell or protein:  CTCF


In [None]:
# Function to fetch proteins name from data source
# Data Fetching:
# The fetch_structure_data function retrieves structure data from the RCSB PDB API.
# The fetch_wikipedia_info function fetches relevant information from Wikipedia using its API.

In [175]:
def fetch_structure_data(protein_name):
    # Fetch data from RCSB PDB
    url = f"https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/{protein_name}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data:
            pdb_id = data[0]['pdb_id'] if 'pdb_id' in data[0] else None
            rcsb_link = f"https://www.rcsb.org/structure/{pdb_id}" if pdb_id else None
            return data, rcsb_link
    return None, None

In [179]:
# function to fentch information from Wikipedia

def fetch_wikipedia_info(protein_name):
    url = f"https://en.wikipedia.org/w/api.php?action=query&format=json&titles={protein_name}&prop=extracts&exintro=&explaintext="
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
        
    return None

In [254]:
import requests

def fetch_protein_names():
    url = "https://rest.uniprot.org/uniprotkb/stream?fields=protein_name&format=tsv&query=organism_id:9606"
    response = requests.get(url)
    
    if response.status_code == 200:
        lines = response.text.strip().split("\n")
        
        # Check that there are results beyond the header row
        if len(lines) > 1:
            protein_names = []
            for line in lines[1:]:  # Skip the header row
                parts = line.split("\t")
                if len(parts) > 1:  # Check if there are enough elements after splitting
                    protein_names.append(parts[1])
            return protein_names
        
        else:
            print("No protein names found in the response.")
            return []
    else:
        print(f"Failed to fetch protein names. Status code: {response.status_code}")
        return []



In [None]:
# Test the function to ensure it handles the error properly
print(fetch_protein_names())

In [29]:
# Error Handling for User Input

In [26]:
pip install fuzzywuzzy


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [27]:
pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.0-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.0 (from python-Levenshtein)
  Downloading levenshtein-0.26.0.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.3/374.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.0->python-Levenshtein)
  Downloading rapidfuzz-3.10.1-cp312-cp312-macosx_10_13_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.0-py3-none-any.whl (9.4 kB)
Downloading rapidfuzz-3.10.1-cp312-cp312-macosx_10_13_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[

In [261]:
from fuzzywuzzy import process

def get_best_match(user_input, valid_proteins):
    # Use fuzzywuzzy to find the best match
    match_result = process.extractOne(user_input, valid_proteins)
    
    # Check if a match was found before unpacking
    if match_result is not None:
        match, score = match_result
        # Set a threshold score for acceptable matches (adjust as necessary)
        if score > 70:  # 70 is a common threshold, adjust as needed
            return match
    return None  # No suitable match found


In [263]:

def display_results(structure_data, rcsb_link, wiki_info):
    if structure_data:
        print("Structure Data:", structure_data)
        if rcsb_link:
            print("RCSB PDB Link:", rcsb_link)
    else:
        print("No structure data found.")

    if wiki_info and 'query' in wiki_info:
        pages = wiki_info['query']['pages']
        for page_id, page in pages.items():
            if 'extract' in page:
                print("Wikipedia Info:", page['extract'])
            else:
                print("No information found on Wikipedia.")


In [None]:
print ("Welcome to The Cell!")


protein_name = get_user_input()
valid_proteins = fetch_protein_names()
best_match = get_best_match(protein_name, valid_proteins)
structure_data = fetch_structure_data(protein_name)
wiki_info = fetch_wikipedia_info(protein_name)


while True:
    if protein_name.lower() == 'exit':
        break
        
    if best_match:
        print(f"Did you mean: {best_match}?")
        display_results(structure_data, rcsb_link, wiki_info)[:10]
         
    else:
        print("No matching protein found. Please check the name and try again.")
    


Welcome to The Cell!


Enter the protein name (or type 'exit' to quit):  ctcf


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
with open("output.txt", "w") as f:
    f.write(str(data))