## Various ways of importing data into Python with pandas

### Delimited files (CSV, TSV)

In [1]:
# Import the packages up front; for now we only need pandas.
import pandas as pd

# Read the comma-separated file into a DataFrame.
# For a tab-separated (.tsv) file, additionally pass sep='\t'.
data = pd.read_csv(filepath_or_buffer='insurance.csv')

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### From relational databases (SQL)

In [6]:
import pymysql as sql

# Connect to the MySQL server. Here it runs on this machine, but `host`
# can just as well point at a remote server.
# NOTE(review): the root account with an empty password is hard-coded;
# for anything beyond a local demo, read credentials from the environment.
conn=sql.connect(host='localhost',user='root',password='',db="capstone")

try:
    # A cursor is needed before any statement can be sent to the database.
    c=conn.cursor()
    try:
        # Run the query ...
        c.execute('select * from lab_profile;')
        # ... and fetch every result row into memory.
        all_rows=c.fetchall()
    finally:
        # Release the cursor even if the query raised.
        c.close()
finally:
    # Always close the connection so it is not leaked when the query fails.
    conn.close()

# Build a DataFrame from the fetched rows.
# NOTE(review): the column names are hard-coded and assume `select *` returns
# exactly these four columns in this order -- confirm against the table schema.
# NOTE(review): the `password` column ends up in the displayed output below.
df=pd.DataFrame(list(all_rows),columns=["id","name","location","password"])
df.head()

Unnamed: 0,id,name,location,password
0,123,jaga,jal,thisispass


### Scraping data from websites

In [41]:
import requests,bs4

# Scraping usually needs site-specific customisation before the data is usable.
url='https://www.amazon.in/Samsung-Galaxy-Storage-Additional-Exchange/dp/B07KXBMYCW/ref=br_msw_pdt-3?_encoding=UTF8&smid=A14CZOWI0VEHLG&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_s=&pf_rd_r=1KZ2D3KPDEM30DF6XTCB&pf_rd_t=36701&pf_rd_p=cc9b62a5-2189-486a-89b4-4eda80243fbe&pf_rd_i=desktop'

# Fetch the page. The timeout keeps the cell from hanging forever on an
# unresponsive server, and raise_for_status() turns an HTTP error (4xx/5xx)
# into an exception instead of silently parsing an error page.
response=requests.get(url,timeout=30)
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text,"lxml")

# Select every <a> element; this selector is the part to customise per website.
div=soup.select("a")

# Store each element's markup as a plain string, so the frame holds text
# rather than live bs4 Tag objects (which are themselves iterable).
data=pd.DataFrame([str(tag) for tag in div],columns=['links'])
data.head()

Unnamed: 0,links
0,"<a id=""nav-top""></a>"
1,"<a class=""skip-link"" id=""skiplink"" tabindex=""3..."
2,"<a aria-label=""Amazon"" class=""nav-logo-link"" h..."
3,"<a aria-label="""" class=""nav-sprite nav-logo-ta..."
4,"<a class=""nav-a nav-a-2 icp-link-style-2"" href..."


### Getting data from an API

In [49]:
import json
import pprint

# APIs differ, but most of them respond with JSON, which maps directly onto
# Python dicts and lists and is therefore easy to convert into a DataFrame.
# The timeout prevents the cell from hanging on an unresponsive server, and
# raise_for_status() raises on an HTTP error instead of parsing an error body.
response=requests.get('https://www.metaweather.com/api/location/search/?query=san',timeout=30)
response.raise_for_status()
r=response.text

# Some APIs require a key and a specific request format before they return
# data; follow the provider's documentation for those steps.
rdict=json.loads(r)

# The decoded payload here is a list of records (one dict per location), so
# each dict key becomes a column.
apidata=pd.DataFrame.from_dict(rdict,orient='columns')
apidata.head()

Unnamed: 0,latt_long,location_type,title,woeid
0,"37.777119, -122.41964",City,San Francisco,2487956
1,"32.715691,-117.161720",City,San Diego,2487889
2,"37.338581,-121.885567",City,San Jose,2488042
3,"29.424580,-98.494614",City,San Antonio,2487796
4,"36.974018,-122.030952",City,Santa Cruz,2488853


### From PDF documents

In [64]:
import PyPDF2 as pdf

# NOTE(review): PdfFileReader / getNumPages / getPage / extractText are the
# legacy PyPDF2 names; PyPDF2 3.x replaced them with PdfReader,
# len(reader.pages), reader.pages[i] and page.extract_text() -- confirm
# which version is installed before running this cell.

# Zero-based index of the page to extract (2 is the third page).
page_index=2

# Open the PDF in binary mode; the with-block closes the file even if one of
# the PyPDF2 calls below raises.
with open('bank.pdf','rb') as pdoc:
    # Hand the open file to PyPDF2's reader.
    pdfreader=pdf.PdfFileReader(pdoc)

    # Report the page count and check it before asking for a page, so a
    # missing page fails with a clear message.
    num_pages=pdfreader.getNumPages()
    print(num_pages)
    if page_index>=num_pages:
        raise IndexError('page index %d is out of range; the document has %d pages' % (page_index,num_pages))

    page=pdfreader.getPage(page_index)

    # Extract the page text while the file is still open.
    pdfdata=page.extractText()

# The result is one raw string; turning it into a DataFrame still needs
# document-specific string parsing, which is not done here.
print(pdfdata)

8
 Page 3 of 8 €€€€€€€€ €€€€€€€€  11-Mar-2019 11-Mar-2019907014005183 ATM Cash-SACWJ757-NEWVIJAYNAGARJALANDHARPBIN-11/03/19 14:26:21/4464 36052,000.0014,861.67 15-Mar-2019 15-Mar-2019907408816595 PAYTMNoidaIN-15/03/19 14:14:08/4464 3605115.0014,746.67 16-Mar-2019 16-Mar-2019907510095448 PAYTMNOIDAIN-16/03/19 15:40:47/4464 3605190.0014,556.67 17-Mar-2019 17-Mar-2019907608687960 PAYTMNoidaIN-17/03/19 13:43:07/4464 3605195.0014,361.67 17-Mar-2019 17-Mar-2019907614749215 MB-IMPS CREDIT 06700450- 17/03/19 14:00:22 36051.0714,362.74 19-Mar-2019 19-Mar-2019907815014169 ONE97COMMUNICATIONSLNOIDAIN-19/03/19 20:49:36/4464 360550.0014,312.74 20-Mar-2019 20-Mar-2019907906329890 ONE97COMMUNICATIONSLNOIDAIN-20/03/19 12:04:00/4464 360550.0014,262.74 20-Mar-2019 20-Mar-2019907914246053 ONE97COMMUNICATIONSLNOIDAIN-20/03/19 20:25:13/4464 360540.0014,222.74 21-Mar-2019 21-Mar-2019908007065702 PAYTM1204770770IN-21/03/19 13:15:15/4464 360580.0014,142.74 21-Mar-2019 21-Mar-2019908012267901 ONE97COMMUNICATIO