<a href="https://colab.research.google.com/github/kritikachugh/Python_web_scrapping/blob/master/Web_scrapping_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reading the CSV file containing gene IDs

csv file link : https://docs.google.com/spreadsheets/d/1kXNUGOYNtY1Fd2kzTNZOdB5e6x60u0DySd_fO0NnGxg/edit#gid=1785941004

### Upload csv file

In [2]:
# Standard-library CSV support; used below to read the input file and to write the output file.
import csv 
# Colab helper module providing the browser upload/download calls used in this notebook.
from google.colab import files
# Opens the upload widget; the recorded output below shows genes.csv being saved.
uploaded = files.upload()

Saving genes.csv to genes.csv


### Use csv reader for reading csv file and extracting field names

In [3]:
# Name of the uploaded CSV file; its first column holds the gene IDs.
filename = "genes.csv"
# Non-empty first-column values, collected in file order.
gene_stableids = []
# newline='' is how the csv module expects files to be opened, so that
# line breaks inside quoted fields are handled by the reader itself.
with open(filename, newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    # The first row is the header. Defaulting to [] keeps an empty file from
    # raising StopIteration.
    fields = next(csvreader, [])
    for row in csvreader:
        # csv.reader yields [] for blank lines, so check the row is non-empty
        # before indexing its first column.
        if row and row[0] != '':
            gene_stableids.append(row[0])
# line_num is the number of lines read from the file (header included),
# not the number of gene IDs that were kept.
print("Total no. of rows: %d"%(csvreader.line_num))

Total no. of rows: 190


# Extracting NCBI web links for all the gene IDs

## Concatenate the strings to form the complete NCBI url

In [4]:
# Base of the NCBI Gene search URL; a gene ID is appended as the search term.
NCBI_URL_PREFIX = 'https://www.ncbi.nlm.nih.gov/gene/?term='
# One search URL per gene ID, in the same order as gene_stableids.
ncbi_urls = [NCBI_URL_PREFIX + stable_id for stable_id in gene_stableids]
print(ncbi_urls)

['https://www.ncbi.nlm.nih.gov/gene/?term=9', 'https://www.ncbi.nlm.nih.gov/gene/?term=627', 'https://www.ncbi.nlm.nih.gov/gene/?term=643', 'https://www.ncbi.nlm.nih.gov/gene/?term=1050', 'https://www.ncbi.nlm.nih.gov/gene/?term=1051', 'https://www.ncbi.nlm.nih.gov/gene/?term=1122', 'https://www.ncbi.nlm.nih.gov/gene/?term=1132', 'https://www.ncbi.nlm.nih.gov/gene/?term=1735', 'https://www.ncbi.nlm.nih.gov/gene/?term=2526', 'https://www.ncbi.nlm.nih.gov/gene/?term=2532', 'https://www.ncbi.nlm.nih.gov/gene/?term=2693', 'https://www.ncbi.nlm.nih.gov/gene/?term=2831', 'https://www.ncbi.nlm.nih.gov/gene/?term=2832', 'https://www.ncbi.nlm.nih.gov/gene/?term=2837', 'https://www.ncbi.nlm.nih.gov/gene/?term=2867', 'https://www.ncbi.nlm.nih.gov/gene/?term=2876', 'https://www.ncbi.nlm.nih.gov/gene/?term=3017', 'https://www.ncbi.nlm.nih.gov/gene/?term=3350', 'https://www.ncbi.nlm.nih.gov/gene/?term=3351', 'https://www.ncbi.nlm.nih.gov/gene/?term=3439', 'https://www.ncbi.nlm.nih.gov/gene/?term=345

# Scraping the webpages

Scraping the webpages to extract information such as the gene name, gene symbol, aliases and exon count from each web link.

## Use BeautifulSoup for parsing HTML data

In [6]:
import requests # for making standard HTTP requests
from bs4 import BeautifulSoup # for parsing HTML data
import json # NOTE(review): not referenced in the visible cells - confirm whether it is still needed
from pandas import DataFrame as df # NOTE(review): alias not referenced in the visible cells - confirm whether it is still needed

## Use for loop for scraping the complete web-page

In [7]:
# Seconds to wait on a single request before giving up, so one stalled
# connection cannot hang the whole loop.
REQUEST_TIMEOUT_SECONDS = 30

# Parsed pages, one per entry of ncbi_urls and in the same order.
soups = []
# A Session reuses the connection across the many requests to the same host.
with requests.Session() as session:
  for ncbi_url in ncbi_urls:
    # Certificate verification is left at its default (enabled); the previous
    # verify=False accepted any certificate for the HTTPS connection.
    page = session.get(ncbi_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # The previous value 'ISO-885901' is not a valid codec name. The page
    # declares charset=utf-8, so decode it as UTF-8 explicitly.
    page.encoding = 'utf-8'
    soup = BeautifulSoup(page.text, 'html.parser')
    soups.append(soup)



## Use for loop for scraping the required information from the web-page

In [12]:
def _split_title(title_text):
  """Split a gene page title into (official_symbol, official_name).

  The text before '[Homo sapiens (human)]' is taken as "<symbol> <name>".
  Only the leading symbol is removed, so a name that repeats the symbol
  (for example "C3 complement C3") is kept intact. Both parts are returned
  without surrounding whitespace; the name is '' when nothing follows the
  symbol.
  """
  gene_part = title_text.strip().split('[Homo sapiens (human)]')[0].strip()
  parts = gene_part.split(' ', 1)
  symbol = parts[0]
  name = parts[1].strip() if len(parts) > 1 else ''
  return symbol, name


def _line_after_label(text, label):
  """Return the line following the first occurrence of label in text.

  Returns 'NA' when the label is absent or no line follows it. Without this
  guard, str.find's -1 would be used as a slice start and select the last
  character of the text.
  """
  found = text.find(label)
  if found == -1:
    return 'NA'
  following = text[found:].split('\n')
  return following[1] if len(following) > 1 else 'NA'


# Per-gene results. Every list receives exactly one entry per soup, so they
# stay positionally aligned with each other and with soups.
official_symbols = []
official_names = []
aliases  = []
exon_counts = []
for soup in soups:
  # Symbol and name come from the page <title>; a page without one gets 'NA'.
  title = soup.find('title')
  if title is not None:
    official_symbol, official_name = _split_title(title.text)
  else:
    official_symbol, official_name = 'NA', 'NA'
  official_symbols.append(official_symbol)
  official_names.append(official_name)

  # Aliases: the line after 'Also known as' in the first div with class 'section'.
  section = soup.find('div', attrs={'class':'section'})
  if section is not None:
    aliases.append(_line_after_label(section.text, 'Also known as'))
  else:
    aliases.append('NA')

  # Exon count: the line after 'Exon count:' in the first div with class 'gc_cont'.
  exon_section = soup.find('div', attrs={'class':'gc_cont'})
  if exon_section is not None:
    exon_counts.append(_line_after_label(exon_section.text, 'Exon count:'))
  else:
    exon_counts.append('NA')

# Writing this information into csv file

## Name the output file and define columns and rows

In [13]:
# File the collected gene details are written to.
output_filename = 'gene_detail.csv'
# Header row of the output CSV; the order matches the lists zipped below.
columns = ['gene_ids', 'ncbi_urls', 'symbol', 'name', 'aliases', 'exon_counts']
# zip pairs the per-gene lists positionally (stopping at the shortest one);
# each resulting tuple becomes one data row.
rows = [
    list(record)
    for record in zip(gene_stableids, ncbi_urls, official_symbols, official_names, aliases, exon_counts)
]

## Use csv writer for writing to csv file

In [14]:
# newline='' lets csv.writer control line endings itself; without it,
# platforms that translate '\n' on write end up with an extra '\r' per row.
# The encoding is set explicitly so the file does not depend on the locale default.
with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Header row first, then one row per gene.
    csvwriter.writerow(columns)
    csvwriter.writerows(rows)

## Download the csv file

In [15]:
# Same Colab helper module that was imported for the upload step.
from google.colab import files
# Ask the browser to download the CSV file written in the previous step.
files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Testing the code using a single URL (trial-and-error space)

In [16]:
# Single gene page used to try out the parsing steps below.
url = 'https://www.ncbi.nlm.nih.gov/gene/?term=627'
# Certificate verification stays at its default (enabled), and the timeout
# keeps a stalled connection from hanging the cell.
page = requests.get(url, timeout=30)
# The previous value 'ISO-885901' is not a valid codec name. The page
# declares charset=utf-8, so decode it as UTF-8 explicitly.
page.encoding = 'utf-8'
soup = BeautifulSoup(page.text, 'html.parser')



In [17]:
# Display the parsed page so its markup can be inspected.
soup

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head xmlns:xi="http://www.w3.org/2001/XInclude"><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<!-- meta -->
<meta content="isPagable:true, pageSize: 20,isZebra:true" name="ncbigrid"/><meta content="index,nofollow,noarchive" name="robots"/>
<meta content="entrez" name="ncbi_app"/><meta content="gene" name="ncbi_db"/><meta content="627[uid]" name="ncbi_term"/><meta content="full_report" name="ncbi_report"/><meta content="html" name="ncbi_format"/><meta content="20" name="ncbi_pagesize"/><meta content="default" name="ncbi_sortorder"/><meta content="1" name="ncbi_pageno"/><meta content="1" name="ncbi_resultcount"/><meta content="search" name="ncbi_op"/><meta content="full_report" name="ncbi_pdid"/><meta content="CE89EA70F65F7771_1480SID" na

In [18]:
# The text before '[Homo sapiens (human)]' in the page title is "<symbol> <name>".
title = soup.find('title')
title = title.text.strip()
title = title.split('[Homo sapiens (human)]')[0]
# Split off only the leading symbol. Replacing every occurrence of the symbol
# would also remove it from inside the name (e.g. "C3 complement C3").
title_parts = title.strip().split(' ', 1)
official_symbol = title_parts[0]
official_name = title_parts[1].strip() if len(title_parts) > 1 else ''

In [19]:
# Show the symbol and the name on separate lines.
print(official_symbol, official_name, sep='\n')

BDNF
 brain derived neurotrophic factor 


In [23]:
# Aliases are the line after 'Also known as' in the first div with class 'section'.
section = soup.find('div', attrs={'class':'section'})
section_text = section.text if section is not None else ''
found = section_text.find('Also known as')
# find() returns -1 when the label is missing; slicing from -1 would pick up
# the last character instead, so fall back to 'NA' in that case.
following_lines = section_text[found:].split('\n') if found != -1 else []
other_names = following_lines[1] if len(following_lines) > 1 else 'NA'
print(other_names)

ANON2; BULN2


In [24]:
# The exon count is the line after 'Exon count:' in the first div with class 'gc_cont'.
exon_section = soup.find('div', attrs={'class':'gc_cont'})
exon_text = exon_section.text if exon_section is not None else ''
exon_found = exon_text.find('Exon count:')
# find() returns -1 when the label is missing; slicing from -1 would pick up
# the last character instead, so fall back to 'NA' in that case.
exon_lines = exon_text[exon_found:].split('\n') if exon_found != -1 else []
exon_count = exon_lines[1] if len(exon_lines) > 1 else 'NA'
print(exon_count)

12
