# The_cell

In [2]:
# Define My Requirements
# The app should be able to:

# Take a user input for a protein name.
# Fetch 3D structure information from a reliable source (like the RCSB PDB or NCBI).
# Retrieve additional details from Wikipedia or NCBI.

In [None]:
# Set Up Your Environment
#Make sure you have Python installed along with any necessary libraries. You may need:

# requests for making API calls.
# json for handling JSON data.
# py3Dmol or matplotlib for 3D visualization (if applicable).

In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Function to call user input

def get_user_input():
    return input("Enter the protein name (or type 'exit' to quit): ")


In [5]:
protein_input = get_user_input()

Enter the protein name (or type 'exit' to quit):  p53


In [None]:
# Function to fetch proteins name from data source
# Data Fetching:
# The fetch_structure_data function retrieves structure data from the RCSB PDB API.
# The fetch_wikipedia_info function fetches relevant information from Wikipedia using its API.

In [50]:
def fetch_wikipedia_info(protein_name):
    """
    Fetch summary and link from Wikipedia for a given protein name.
    
    Parameters:
        protein_name (str): The name of the protein to search for.
        
    Returns:
        dict: A dictionary containing the summary and the Wikipedia link.
    """
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{protein_name.replace(' ', '_')}"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        return {
            "summary": data.get("extract", "No summary available."),
            "link": data.get("content_urls", {}).get("desktop", {}).get("page", "No link available.")
        }
    else:
        print(f"Failed to fetch Wikipedia info. Status code: {response.status_code}")
        return {}

In [78]:
import requests

def fetch_protein_names():
    url = "https://rest.uniprot.org/uniprotkb/stream?fields=protein_name&format=tsv&query=organism_id:9606"
    response = requests.get(url)
    
    if response.status_code == 200:
        lines = response.text.strip().split("\n")
        protein_names = [line.split("\t")[1] for line in lines[1:] if len(line.split("\t")) > 1]

        return protein_names  # Return all protein names without filtering
    else:
        print(f"Failed to fetch protein names. Status code: {response.status_code}")
        return []

In [None]:
# Test the function to ensure it handles the error properly
print(fetch_protein_names())

In [29]:
# Error Handling for User Input

In [13]:
pip install fuzzywuzzy


Note: you may need to restart the kernel to use updated packages.


In [14]:
pip install python-Levenshtein

Note: you may need to restart the kernel to use updated packages.


In [88]:
from fuzzywuzzy import process

def get_best_matches(user_input, valid_proteins, num_matches=5):
    """
    Find the best matches for the user input from the list of valid protein names.

    Parameters:
        user_input (str): The protein name input by the user.
        valid_proteins (list): List of valid protein names.
        num_matches (int): Number of best matches to return.

    Returns:
        list: A list of the best matching protein names or a message if no matches are found.
    """
    # Use fuzzywuzzy to find the best matches
    match_results = process.extract(user_input, valid_proteins, limit=num_matches)
    
    # Filter matches based on a score threshold (e.g., > 70)
    best_matches = [match for match, score in match_results if score > 70]
    
    if best_matches:
        return best_matches  # Return a list of matches
    else:
        return ["No suitable matches found. Please check the protein name and try again."]


In [90]:
def display_results(protein_name):
    """
    Fetch and display information from Wikipedia about the given protein.

    Parameters:
        protein_name (str): The name of the protein.
    """
    # Fetch the wiki information
    wiki_info = fetch_wikipedia_info(protein_name)  # Assuming this function fetches the relevant info

    if wiki_info:
        # Extract summary and link
        summary = wiki_info.get("summary", "No summary available.")
        wiki_link = wiki_info.get("link", "No link available.")
        
        print(f"\nInformation for {protein_name}:")
        
        # Split the summary into lines and show the first 10
        lines = summary.splitlines()
        for line in lines[:10]:  # Show the first 10 lines
            print(line.strip())
        
        # Display the link to the Wikipedia page
        print(f"Read more at: {wiki_link}")
    else:
        print("No Wikipedia information found.")

In [84]:
def display_results(protein_name):
    """
    Fetch and display the first 5 lines of information from Wikipedia about the given protein.

    Parameters:
        protein_name (str): The name of the protein.
    """
    # Fetch the wiki information
    wiki_info = fetch_wikipedia_info(protein_name)  # Fetch the relevant info

    if wiki_info:
        # Extract summary and link
        summary = wiki_info.get("summary", "No summary available.")
        wiki_link = wiki_info.get("link", "No link available.")
        
        print(f"\nInformation for {protein_name}:")
        
        # Split the summary into lines
        lines = summary.splitlines()
        
        # Display only the first 5lines of the summary
        num_lines_to_display = min(5, len(lines))  # Ensure we don't exceed the available lines
        for line in lines[:num_lines_to_display]:  
            print(line.strip())
        
        # Display the link to the Wikipedia page
        print(f"Read more at: {wiki_link}")
    else:
        print("No Wikipedia information found.")

In [None]:
###  with open("output.txt", "w") as f:
###    f.write(str(data))

In [None]:
print ("Welcome to The Cell!")


protein_name = get_user_input()
valid_proteins = fetch_protein_names()
best_matches = get_best_match(protein_name, valid_proteins)



while True:
    if protein_name.lower() == 'exit':
        break
        
    print("Best matches found:")
    for match in best_matches:  # Iterate through the list of matches
        display_results(match)  # Call the function for more details on each match
            
    # Exit the loop if no matches are found
    if best_matches[0] == "No suitable matches found. Please check the protein name and try again.":
        break
        

Welcome to The Cell!


Enter the protein name (or type 'exit' to quit):  p53


Best matches found:

Information for N:
N, or n, is the fourteenth letter of the Latin alphabet, used in the modern English alphabet, the alphabets of other western European languages, and others worldwide. Its name in English is en, plural ens.
Read more at: https://en.wikipedia.org/wiki/N

Information for o:
O, or o, is the fifteenth letter and the fourth vowel letter of the Latin alphabet, used in the modern English alphabet, the alphabets of other western European languages and others worldwide. Its name in English is o, plural oes.
Read more at: https://en.wikipedia.org/wiki/O
Failed to fetch Wikipedia info. Status code: 400
No Wikipedia information found.

Information for s:
S, or for lowercase, s, is the nineteenth letter of the Latin alphabet, used in the English alphabet, the alphabets of other western European languages and other latin alphabets worldwide. Its name in English is ess, plural esses.
Read more at: https://en.wikipedia.org/wiki/S

Information for u:
U, or u, is t