Working with Text Files

In [3]:
# Load the entire contents of sample.txt as one UTF-8 decoded string
with open('sample.txt', mode='r', encoding='utf-8') as source:
    text_data = source.read()

# Show exactly what was read so it can be compared with the file on disk
print("Raw Text:\n", text_data)

Raw Text:
 Hello, this is a sample text file.
This is the second line.


In [5]:
# Write the text loaded earlier (text_data) to stored_text.txt;
# mode 'w' replaces the file's contents if it already exists
with open('stored_text.txt', mode='w', encoding='utf-8') as destination:
    destination.write(text_data)

Working with CSV Files

In [8]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('reviews.csv')

# Extract Review column (first 10 rows).
# head() with no argument returns only 5 rows, so the count is passed explicitly.
print("Reviews:\n", df['Review'].head(10))

# Save the whole Review column to a text file, one row per review,
# without the index or the column header.
# Note: to_csv still applies CSV quoting, so a review containing a comma or a
# double quote is written wrapped in double quotes.
df['Review'].to_csv('stored_reviews.txt', index=False, header=False)

Reviews:
 0    The product is amazing!
1     Worst experience ever!
Name: Review, dtype: object


In [10]:
# Extract ID column (first 10 rows) from the DataFrame loaded from reviews.csv.
# head() with no argument returns only 5 rows, so the count is passed explicitly.
print("ID:\n", df['ID'].head(10))

ID:
 0    1
1    2
Name: ID, dtype: int64


Working with Excel Files

In [12]:
# Load reviews.xlsx into a DataFrame
df_excel = pd.read_excel('reviews.xlsx')

# Take the two leading rows once and reuse them for both display and export
first_two_rows = df_excel.head(2)
print("First two rows:\n", first_two_rows)

# Export those rows as CSV-formatted text (header kept, index dropped)
first_two_rows.to_csv('extracted_excel.txt', index=False)

First two rows:
    ID                   Review
0   1  The product is amazing!
1   2   Worst experience ever!


Working with JSON Files

In [15]:
import json

# Deserialize social_data.json into a Python object
with open('social_data.json', mode='r', encoding='utf-8') as json_source:
    data = json.load(json_source)

# Look up the 'city' entry once, then display it and persist it
city = data['city']
print("Extracted City:", city)

# Write the city value on its own to stored_city.txt
with open('stored_city.txt', mode='w', encoding='utf-8') as city_sink:
    city_sink.write(city)

Extracted City: New York


In [17]:
# Read the 'comment' entry from the JSON object loaded earlier and display it
comment = data['comment']
print("Extracted Comment:", comment)

Extracted Comment: Loving this new AI tool!


Working with XML Files

In [20]:
import xml.etree.ElementTree as ET

# Parse the XML file
tree = ET.parse('news.xml')
root = tree.getroot()

# Collect the text of each <article>'s <content> child in a single pass.
# findtext() with default='' returns an empty string when <content> is
# missing or has no text, so the string concatenation below never sees None.
contents = [
    article.findtext('content', default='')
    for article in root.findall('article')
]

for content in contents:
    print("Extracted Content:", content)

# Store the extracted content to a file, each entry followed by a newline
with open('stored_content.txt', 'w', encoding='utf-8') as file:
    for content in contents:
        file.write(content + '\n')

Extracted Content: AI is playing a key role in automation...


In [24]:
# Print the text of the <title> child of every <article> directly under the root
article_nodes = root.findall('article')
for article in article_nodes:
    title_node = article.find('title')
    print("Extracted Title:", title_node.text)

Extracted Title: AI is transforming industries


Working with PDF Documents

In [27]:
pip install PyPDF2

Note: you may need to restart the kernel to use updated packages.


In [29]:
import PyPDF2

# Read the PDF file
with open('document.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    # Extract each page's text exactly once; the result is reused for both
    # the emptiness check and the join below.
    page_texts = [page.extract_text() for page in reader.pages]

# Join the pages that yielded text, separated by newlines; pages whose
# extraction result is empty or None are skipped.
text = "\n".join(page_text for page_text in page_texts if page_text)

# Print the extracted text
print("Extracted PDF Text:\n", text)

# Store the extracted text in a file
with open('stored_pdf_text.txt', 'w', encoding='utf-8') as output:
    output.write(text)

Extracted PDF Text:
 This is a sample PDF document.  
AI is transforming industries and automation.  


Exercise

In [36]:
# Extract all pages of Business_Proposal.pdf
with open('Business_Proposal.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    # Extract each page's text exactly once; the result is reused for both
    # the emptiness check and the join below.
    page_texts = [page.extract_text() for page in reader.pages]

# Join the pages that yielded text, separated by newlines; pages whose
# extraction result is empty or None are skipped.
text = "\n".join(page_text for page_text in page_texts if page_text)

# Print the extracted text
print("Extracted PDF Text:\n", text)

Extracted PDF Text:
 Business Proposal  
The Revolution is Coming  
Leverage agile frameworks to provide a robust synopsis for high level  
overviews. Iterative approaches to corporate strategy foster collaborative  
thinking to further the overall value proposition. Organically grow the  
holistic world view of disruptive innovation via workplace diversity and  
empowerment.  
Bring to the table win -win survival strategies to ensure proactive  
domination. At the end of the day, going forward, a new normal that has  
evolved from generation X is on the runway heading towards a streamlined  
cloud solution. User generated content in real -time will have multi ple 
touchpoints for offshoring.  
Capitalize on low hanging fruit to identify a ballpark value added activity to  
beta test. Override the digital divide with additional clickthroughs from  
DevOps. Nanotechnology immersion along the information highway will  
close the loop on focusing solely on the bottom line.  
Podcasting op

In [38]:
# Print the text of every page of Business_Proposal.pdf under its own heading
with open('Business_Proposal.pdf', 'rb') as f:
    pdf_reader = PyPDF2.PdfReader(f)

    # enumerate yields a zero-based position alongside each page object;
    # the heading adds 1 so pages are labelled starting from 1
    for page_number, page in enumerate(pdf_reader.pages):
        page_text = page.extract_text()
        print(f"Page {page_number + 1}:\n{page_text}\n")

Page 1:
Business Proposal  
The Revolution is Coming  
Leverage agile frameworks to provide a robust synopsis for high level  
overviews. Iterative approaches to corporate strategy foster collaborative  
thinking to further the overall value proposition. Organically grow the  
holistic world view of disruptive innovation via workplace diversity and  
empowerment.  
Bring to the table win -win survival strategies to ensure proactive  
domination. At the end of the day, going forward, a new normal that has  
evolved from generation X is on the runway heading towards a streamlined  
cloud solution. User generated content in real -time will have multi ple 
touchpoints for offshoring.  
Capitalize on low hanging fruit to identify a ballpark value added activity to  
beta test. Override the digital divide with additional clickthroughs from  
DevOps. Nanotechnology immersion along the information highway will  
close the loop on focusing solely on the bottom line.  
Podcasting operational cha

In [42]:
# Extract Page 2 of Business_Proposal.pdf
with open('Business_Proposal.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    # reader.pages is zero-indexed, so index 1 is the second page
    page_two = reader.pages[1]
    # Extract from page_two itself rather than a `page` variable left over
    # from another cell, so this cell works on a fresh kernel and always
    # reads the second page.
    page_two_extract = page_two.extract_text()

print("Page 2:\n", page_two_extract)

Page 2:
 AUTHORS:  
Amy Baker, Finance Chair, x345, abaker@ourcompany.com  
Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com  
Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com  
