# Working with Files in Python

## String Formatter 

In [3]:
name = 'KGP Talkie is my youtube channel name'
follower = '30k'

In [5]:
print('name', 'follower')
print(name, follower)

name follower
KGP Talkie is my youtube channel name 30k


In [6]:
ds = [('python', 50), ('tensorflow', 100), ('nlp', 200)]

In [8]:
print(ds)

[('python', 50), ('tensorflow', 100), ('nlp', 200)]


In [11]:
for info in ds:
    print(info[0], info[1])

python 50
tensorflow 100
nlp 200


In [14]:
for info in ds:
    print(f'{info[0]:{20}} {info[1]}')

python               50
tensorflow           100
nlp                  200


In [25]:
#>, <, ^
for info in ds:
    print(f'{info[0]:{20}} {info[1]:.>{5}}')

python               ...50
tensorflow           ..100
nlp                  ..200


## Working with Files 

In [44]:
file = open('data/data.txt', 'w')

data = 'this is sentence one'
file.write(data)
file.close()

In [43]:
len(data)

20

In [46]:
file = open('data/data.txt', 'a')

data = '\t this is sentence three'
file.write(data)
file.close()

In [48]:
file = open('data/data1.txt', 'a') # a if file is there then r+w otherwise w

data = 1
file.write(str(data))
file.close()

In [52]:
file = open('data/data2.txt', 'w') # a if file is there then r+w otherwise w

data = [1, 'one', 'this is two', 2.3]
for d in data:
    file.write(str(d))
    file.write(',')
file.close()

In [54]:
with open('data/data3.txt', 'w') as file:
    data = [1, 'one', 'this is two', 2.3]
    for d in data:
        file.write(str(d))
        file.write('\n')

## Working with Read mode 

In [65]:
file = open('data/data3.txt', 'r')

In [76]:
file.seek(0)
file.read().splitlines()

['1', 'one', 'this is two', '2.3']

In [83]:
file.seek(0)
file.readlines()

['1\n', 'one\n', 'this is two\n', '2.3\n']

In [84]:
file.seek(0)
data = file.read().splitlines()
file.close()

In [85]:
data

['1', 'one', 'this is two', '2.3']

In [91]:
eval(data[-1])

2.3

In [92]:
for index, d in enumerate(data):
    try:
        data[index] = eval(d)
    except:
        pass

In [93]:
data

[1, 'one', 'this is two', 2.3]

## Reading and Writing .CSV and .TSV Files with Pandas 

In [94]:
import pandas as pd

In [96]:
pd.read_csv('data/data3.txt', header= None)

Unnamed: 0,0
0,1
1,one
2,this is two
3,2.3


In [100]:
l = [(1, 'one'), (2, 'two'), (3, 'three')]
df = pd.DataFrame(l, columns=['digit', 'figure'])
df.to_csv('data/digit.csv', sep = ',', index = False)

In [101]:
pd.read_csv('data/digit.csv')

Unnamed: 0,digit,figure
0,1,one
1,2,two
2,3,three


In [102]:
pd.read_csv('data/moviereviews.tsv', sep = '\t')

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [104]:
df.to_csv('data/digit.tsv', sep = '\t', index = None)

# Reading and Writing .XLSX Files with Pandas

In [110]:
df.to_excel('data/digit_sheet.xlsx', index = None, sheet_name='digit')

In [111]:
df.to_excel('data/digit_sheet.xlsx', index = None, sheet_name='digit1')

In [113]:
writer = pd.ExcelWriter('data/digit_sheet.xlsx', engine = 'xlsxwriter')
df.to_excel(writer, index = None, sheet_name='digit1')
df.to_excel(writer, index = None, sheet_name='digit2')
writer.save()
writer.close()

## Reading and Writing .JSON Files 

JSON (JavaScript Object Notation) is a popular data format used for representing structured data. It's common to transmit and receive data between a server and web application in JSON format.

In [114]:
import json

In [118]:
data_dict = {"one": "1", "two":"2"}

In [119]:
type(data_dict)

dict

In [121]:
data_str = '{"one": "1", "two":"2"}'
type(data_str)

str

In [None]:
#load(), loads(), dump(), dumps()

In [123]:
json.loads(data_str)

{'one': '1', 'two': '2'}

In [124]:
json.dumps(data_dict)

'{"one": "1", "two": "2"}'

In [128]:
file = open('data/data.json', 'w')
json.dump(data_str, file)
file.close()

In [129]:
file = open('data/data.json', 'r')
json_data = json.load(file)
file.close

<function TextIOWrapper.close()>

In [131]:
json.loads(json_data)

{'one': '1', 'two': '2'}

## Reading Files from URL Links 

https://datahub.io/core/global-temp/r/monthly.json

https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/ecommerce.csv


In [138]:
pd.read_csv('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/ecommerce.csv')

Unnamed: 0,ID,order_date,delivery_date
0,1,5/24/98,2/5/99
1,2,4/22/92,3/6/98
2,4,2/10/91,8/26/92
3,5,7/21/92,11/20/97
4,7,9/2/93,6/10/98
...,...,...,...
496,990,6/24/91,2/2/96
497,991,9/9/91,3/30/98
498,993,11/16/90,4/27/98
499,994,6/3/93,6/13/93


In [139]:
pd.read_json('https://datahub.io/core/global-temp/r/monthly.json')

Unnamed: 0,Date,Mean,Source
0,2016-12-06,0.7895,GCAG
1,2016-12-06,0.8100,GISTEMP
2,2016-11-06,0.7504,GCAG
3,2016-11-06,0.9300,GISTEMP
4,2016-10-06,0.7292,GCAG
...,...,...,...
3283,1880-03-06,-0.1800,GISTEMP
3284,1880-02-06,-0.1229,GCAG
3285,1880-02-06,-0.2100,GISTEMP
3286,1880-01-06,0.0009,GCAG


## Extract Text Data From PDF 

In [148]:
# !pip install PyPDF2

In [149]:
import PyPDF2 as pdf

In [150]:
file = open('data/NLP.pdf', 'rb')

In [151]:
reader = pdf.PdfFileReader(file)



In [152]:
reader

<PyPDF2.pdf.PdfFileReader at 0x2b74c4a5588>

In [153]:
help(reader)

Help on PdfFileReader in module PyPDF2.pdf object:

class PdfFileReader(builtins.object)
 |  
 |  Initializes a PdfFileReader object.  This operation can take some time, as
 |  the PDF stream's cross-reference tables are read into memory.
 |  
 |  :param stream: A File object or an object that supports the standard read
 |      and seek methods similar to a File object. Could also be a
 |      string representing a path to a PDF file.
 |  :param bool strict: Determines whether user should be warned of all
 |      problems and also causes some correctable problems to be fatal.
 |      Defaults to ``True``.
 |      ``sys.stderr``).
 |      ``True``).
 |  
 |  Methods defined here:
 |  
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  cacheGetIndirectObject(self, generation, idnum)
 |  
 |  cacheIndirectObject(self, generation, idnum, obj)
 |  
 |  decrypt(self, password)
 |      When using an encrypted / secured PDF file with the PDF Standard
 |      encryp

In [154]:
reader.getIsEncrypted()

False

In [155]:
reader.getDocumentInfo()

{'/ModDate': 'D:20060227152126Z',
 '/CreationDate': 'D:20060227151709Z',
 '/Title': '#n',
 '/Creator': 'Acrobat PDFMaker 6.0 for Word',
 '/Producer': 'Acrobat Distiller 6.0 (Windows)',
 '/Author': 's',
 '/SourceModified': 'D:20060227151632'}

In [156]:
reader.getNumPages()

19

In [157]:
page1  = reader.getPage(0).extractText()

In [158]:
page1

'Lkit: A Toolkit for Natuaral Language Interface Construction 2. Natural Language Processing (NLP) This section provides a brief history of NLP, introduces some of the main problems involved in extracting meaning from human languages and examines the kind of activities performed by NLP systems.   2.1. Background Natural language processing systems take strings of words (sentences) as their input and produce structured representations capturing the meaning of those strings as their output. The nature of this output depends heavily on the task at hand. A natural language understanding system serving as an interface to a database might accept questions in English which relate to the kind of data held by the database. In this case the meaning of the input (the output of the system) might be expressed  in terms of structured SQL queries which can be directly submitted to the database.  The first use of computers to manipulate natural languages was in the 1950s with attempts to automate tran

In [159]:
with open('data/pdf_text.txt', 'w') as file1:
    for i in range(reader.getNumPages()):
        page  = reader.getPage(i).extractText()
        file1.write(page)
        file1.write('\n')