# WEBSITE : https://www.geeksforgeeks.org/how-to-extract-data-from-common-file-formats-in-python/

# 1. Multiple Sheet Excel Files

In [None]:
# If the Excel file has a single sheet, then pd.read_excel('File.xlsx') is enough, just like pd.read_csv('File.csv') is for a CSV file.
# But that is not sufficient for multiple-sheet files, as in the example file, which has 3 sheets (Sheet1, Sheet2, Sheet3).
# In this case, it will just return the first sheet.

# pandas provides the Excel reader used below
import pandas as pd

# sheet_name takes a zero-based position, so 1 selects
# the second sheet of the workbook (Sheet2)
df = pd.read_excel('Sample1.xlsx', sheet_name=1)
df.head()

In [None]:
# let’s read a selected column of the same sheet:


# Sheet2 holds four columns (A, B, C, D); the usecols range
# below keeps only the first three, A through C
df = pd.read_excel(
    'Sample1.xlsx',
    sheet_name=1,
    usecols='A : C',
)
df.head()

In [None]:
# Now let’s read all the sheets together:

# Sheet1 contains columns A, B, C; Sheet2 contains A, B, C, D and Sheet3 contains B, D.
# We will see a simple example below on how to read all the 3 sheets together and merge them into common columns.

# sheet_name=None loads every sheet of the workbook and returns a
# dict that maps each sheet name to its own DataFrame
df = pd.read_excel('Sample1.xlsx', sheet_name=None)

# Stack all sheets row-wise in a single concat call; columns are aligned
# by name, so a column missing from a sheet is filled with NaN for that
# sheet's rows
df2 = pd.concat(list(df.values()), axis=0)
display(df2)

# 2. Extract Text From Images

In [None]:
# For enabling our python program to have Character recognition capabilities, we would be making use of pytesseract OCR library.
# The library could be installed onto our python environment by executing the following command in the command interpreter of the OS

# pip install pytesseract

# PIL reads the image file; pytesseract wraps the Tesseract OCR engine
from PIL import Image
import pytesseract

# Point pytesseract at the Tesseract executable.
# Note: replace this path with the location of tesseract.exe on your system
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Open the image inside a context manager so the underlying file handle
# is released once the text has been extracted
with Image.open(r'pic.png') as image:
    print(pytesseract.image_to_string(image))

# OUTPUT: GeeksforGeeks (the image contains the text GeeksforGeeks)

# 3. Extracting text from Doc File

In [None]:
# we will extract text from the doc file using docx module.
# pip install python-docx

# python-docx is imported under the name docx;
# Document() loads the Word file from disk
import docx
doc = docx.Document('csv/g.docx')

# The first paragraph of this document holds its title
print(doc.paragraphs[0].text)

# OUTPUT: My Name Aniket

In [None]:
# Then we’ll extract the different texts present(excluding the table).


# Collect the text of every paragraph in the doc file.
# doc.paragraphs is read once and iterated directly rather than
# being indexed again on every step
l = [paragraph.text for paragraph in doc.paragraphs]

# Paragraphs with no text yield empty strings, so drop them
l = [text for text in l if text]
print(l)

# OUTPUT :
# [‘My Name Aniket’, ‘               Hello I am Aniket’, ‘I am giving tutorial on how to extract text from MS Doc.’, ‘Please go through it carefully.’]

In [None]:
# Now we’ll extract the table

# Since there is only one table in our doc file we use index 0.
# For multiple tables you can loop over doc.tables instead
table = doc.tables[0]

# Build one inner list per table row, holding the text of the first
# paragraph of every cell in that row
list2 = [
    [cell.paragraphs[0].text for cell in row.cells]
    for row in table.rows
]
print(list2)

# OUTPUT : [['A', 'B', 'C'], ['12', 'aNIKET', '@@@'], ['3', 'SOM', '+12&']]

In [None]:
# Extracting Data From PDF File

# The task is to extract data (images, text) from a PDF in Python. We will extract the images from PDF files and save them using the PyMuPDF library.
# First, we have to install the PyMuPDF and Pillow libraries using pip.

# pip install PyMuPDF Pillow

#  we will extract data from the pdf version of the same doc file.

# import module
import fitz

# Reading our pdf file
docu = fitz.open('file.pdf')

# Collect the cleaned, non-empty lines of every page
text_list = []

# Iterating over the document yields its pages in order
for page in docu:

    # Extract the plain text of the page and process it line by line
    for line in page.get_text('text').split('\n'):

        # Remove the zero-width space character '\u200b'
        cleaned = line.replace('\u200b', '')

        # Keep only lines that still contain something besides whitespace
        if cleaned.strip():
            text_list.append(cleaned)
print(text_list)


# OUTPUT : [‘My Name Aniket ‘, ‘               Hello I am Aniket ‘, ‘I am giving tutorial on how to extract text from MS Doc. ‘, ‘Please go through it carefully. ‘, ‘A ‘, ‘B ‘, ‘C ‘, ’12 ‘, ‘aNIKET ‘, ‘@@@ ‘, ‘3 ‘, ‘SOM ‘, ‘+12& ‘]

In [None]:
# Extract image from PDF

# Walk the pages by number; the page number is used in the file name
for current_page in range(len(docu)):

    # Each entry describes one image referenced by that page
    for image in docu.get_page_images(current_page):

        # The first item is the image's XREF, the number that
        # identifies the image object inside the PDF file
        xref = image[0]

        # Build a pixmap from the image stored at that XREF
        pix = fitz.Pixmap(docu, xref)

        # PNG cannot store CMYK data, so convert such pixmaps
        # to RGB before writing them out
        if pix.n - pix.alpha >= 4:
            pix = fitz.Pixmap(fitz.csRGB, pix)

        # Storing the image as .png
        pix.save('page %s - %s.png' % (current_page, xref))


In [None]:
# The image is stored in our current working directory with a name in the format "page <page_no> - <xref>.png". In our case, its name is "page 0 - 7.png".

# Now let’s view the image.
# pyplot supplies both the image reader and the display call
import matplotlib.pyplot as plt

# Load the extracted PNG from disk and render it
img = plt.imread('page 0 - 7.png')
plt.imshow(img)

# OUTPUT : the image extracted from Aniket's file is displayed as a plot
# REF : https://www.geeksforgeeks.org/how-to-extract-data-from-common-file-formats-in-python/

# What is XML parsing?

In [None]:
# In general, the process of reading the data from an XML file and analyzing its logical components is known as Parsing.
# Therefore, when we refer to reading a xml file we are referring to parsing the XML document.
# In this article, we would take a look at several libraries that could be used for the purpose of xml parsing.

In [None]:
# we would take a look at four different ways to read XML documents using different XML modules.

# 1. MiniDOM(Minimal Document Object Model)
# 2. BeautifulSoup alongside the lxml parser
# 3. Element tree
# 4. Simple API for XML (SAX)

# Website : https://www.studytonight.com/python-howtos/how-to-read-xml-file-in-python

In [None]:
# convert .txt to csv file

In [3]:
import pandas as pd

# Read the given text file (parsed as comma-separated values,
# the read_csv default) and create a dataframe from it
dataframe1 = pd.read_csv("GeeksforGeeks.txt")

# Store this dataframe in a csv file; index=False leaves the
# row index out of the written file
dataframe1.to_csv('GeeksforGeeks.csv', index=False)