In [57]:
# Pandas
 
# pandas is a fast, powerful, flexible and easy to use open source data analysis and
# manipulation tool, built on top of the Python programming language.
 
# Pandas is the backbone of most data science projects.
 
'''
Pandas is an open-source Python library providing high-performance,
easy-to-use data structures and data analysis tools for the Python
programming language. Python with Pandas is used in a wide range of
fields including academic and commercial domains including finance,
economics, Statistics, analytics, etc.
 
In 2008, developer Wes McKinney started developing pandas when in need
of high performance, flexible tool for analysis of data.
 
Prior to Pandas, Python was majorly used for data munging and preparation.
It had very little contribution towards data analysis.
Pandas solved this problem. Using Pandas, we can accomplish five typical
steps in the processing and analysis of data, regardless of the
origin of data — load, prepare, manipulate, model, and analyze.
 
Key Features of Pandas
 
a.) Tools for loading data into in-memory data objects from different file formats.
 
b.) Data alignment and integrated handling of missing data.
 
c.) Label-based slicing, indexing and subsetting of large data sets.
 
d.) Columns from a data structure can be deleted or inserted.
 
e.) Group by data for aggregation and transformations.
 
f.) High performance merging and joining of data.
 
g.) Time Series functionality.
 
 
Numpy vs Pandas
 
a.) The Pandas module mainly works with the tabular data, whereas the NumPy module works with
the numerical data.
 
b.) Pandas provides powerful tools such as DataFrame and Series that are mainly used
for analyzing data, whereas NumPy offers a powerful object called the array (ndarray).
 
c.) The Pandas covered the broader application because it is mentioned in 73 company stacks and
46 developer stacks, whereas in NumPy, 62 company stacks and 32 developer stacks are being
mentioned.
 
d.) Pandas has a better performance for 500K rows or more.  NumPy has a better performance for
50K rows or less.
 
'''
 
# Pandas
 
# a.) Series  --> a one-dimensional labelled array
 
# b.) DataFrame --> a two-dimensional labelled table (rows and columns)



'\nPandas is an open-source Python library providing high-performance,\neasy-to-use data structures and data analysis tools for the Python\nprogramming language. Python with Pandas is used in a wide range of\nfields including academic and commercial domains including finance,\neconomics, Statistics, analytics, etc.\n \nIn 2008, developer Wes McKinney started developing pandas when in need\nof high performance, flexible tool for analysis of data.\n \nPrior to Pandas, Python was majorly used for data munging and preparation.\nIt had very little contribution towards data analysis.\nPandas solved this problem. Using Pandas, we can accomplish five typical\nsteps in the processing and analysis of data, regardless of the\norigin of data — load, prepare, manipulate, model, and analyze.\n \nKey Features of Pandas\n \na.) Tools for loading data into in-memory data objects from different file formats.\n \nb.) Data alignment and integrated handling of missing data.\n \nc.) Label-based slicing, ind

In [58]:
import numpy as np
import pandas as pd
import openpyxl


In [59]:
# One-dimensional NumPy array holding 10, 20, ..., 50.
arr = np.arange(10, 60, 10)
print(arr)

[10 20 30 40 50]


In [60]:
# Build a Series from a plain Python list; pandas supplies the default 0..n-1 index.
arr = list(range(1, 6))
s = pd.Series(data=arr)
print(type(s), s, sep='\n')

<class 'pandas.core.series.Series'>
0    1
1    2
2    3
3    4
4    5
dtype: int64


In [61]:
# Same values, now addressed by the string labels 'a'..'e'.
s = pd.Series(data=arr, index=list('abcde'))
print(s)
# The first element, looked up by label and then by position.
print(s.loc['a'])
print(s.iloc[0])

a    1
b    2
c    3
d    4
e    5
dtype: int64
1
1


In [62]:
# DataFrame: each dictionary key becomes a column, each list its values.
data = dict(
    Name=['Tom', 'Jerry', 'Mickey', 'Mini'],
    Age=[28, 34, 29, 42],
)
df = pd.DataFrame(data=data)
print(df)


     Name  Age
0     Tom   28
1   Jerry   34
2  Mickey   29
3    Mini   42


In [63]:
# Employee records as a plain dictionary keyed by column name.
emp_data = dict(
    Name=['Tom', 'Jerry', 'Mickey', 'Mini'],
    Age=[28, 34, 29, 42],
)
print(emp_data)


{'Name': ['Tom', 'Jerry', 'Mickey', 'Mini'], 'Age': [28, 34, 29, 42]}


In [64]:
# Convert the employee dictionary into a DataFrame and show its type and contents.
data = pd.DataFrame(data=emp_data)
print(type(data), data, sep='\n')

<class 'pandas.core.frame.DataFrame'>
     Name  Age
0     Tom   28
1   Jerry   34
2  Mickey   29
3    Mini   42


In [65]:
# Same employee table, but with custom row labels instead of the default 0..n-1 index.
emp_data = {
    'Name': ['Tom', 'Jerry', 'Mickey', 'Mini'],
    'Age': [28, 34, 29, 42],
}
row_labels = ['rank' + str(position) for position in range(1, 5)]
data = pd.DataFrame(emp_data, index=row_labels)
print(data)

         Name  Age
rank1     Tom   28
rank2   Jerry   34
rank3  Mickey   29
rank4    Mini   42


In [66]:
# (number of rows, number of columns) of the DataFrame
data.shape

(4, 2)

In [67]:
# Iterating over a DataFrame yields its column labels, not its rows.
for column_label in data.columns:
    print(column_label)

Name
Age


In [68]:
# Index object holding the column labels
data.columns

Index(['Name', 'Age'], dtype='object')

In [69]:
# Underlying array of the 'Name' column (bracket access works for any column label).
print(data['Name'].values)

['Tom' 'Jerry' 'Mickey' 'Mini']


In [70]:
# Select a subset of columns by label; all rows are kept.
data.loc[:, ['Name', 'Age']]

Unnamed: 0,Name,Age
rank1,Tom,28
rank2,Jerry,34
rank3,Mickey,29
rank4,Mini,42


In [71]:
# Locate a single row by its index label with .loc; the row is displayed as a Series.

data.loc['rank1']

Name    Tom
Age      28
Name: rank1, dtype: object

In [72]:
# pd.date_range(start, periods=n) builds n consecutive timestamps;
# the default frequency is one calendar day.
dates = pd.date_range(start='20210101', periods=10)
dates

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10'],
              dtype='datetime64[ns]', freq='D')

In [73]:
# 10x4 table of uniform random numbers in [0, 1), one row per date in `dates`.
# A seeded generator is used so the table is identical on every Restart & Run All;
# the unseeded np.random.rand call produced different values on each run.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(rng.random((10, 4)), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2021-01-01,0.14489,0.196298,0.370795,0.927151
2021-01-02,0.630267,0.618875,0.848007,0.598516
2021-01-03,0.66566,0.62667,0.494317,0.028689
2021-01-04,0.217685,0.0466,0.456141,0.192345
2021-01-05,0.767268,0.571329,0.736081,0.83696
2021-01-06,0.798342,0.811145,0.425513,0.947585
2021-01-07,0.982514,0.533214,0.803808,0.219324
2021-01-08,0.964802,0.838129,0.00085,0.586761
2021-01-09,0.943248,0.923705,0.433031,0.621856
2021-01-10,0.617104,0.610305,0.722308,0.810854


In [74]:
# Six-employee table with Id, Name, Salary and Dept columns.
emp_info = {
    'Id': list(range(101, 107)),
    'Name': ['John', 'Smith', 'Nurul', 'Tom', 'Harry', 'Smith'],
    'Salary': [1000, 2000, 3000, 1200, 2800, 3200],
    'Dept': ['Finance', 'Sales', 'HR', 'IT', 'Marketing', 'Sales'],
}
data = pd.DataFrame(data=emp_info)
data


Unnamed: 0,Id,Name,Salary,Dept
0,101,John,1000,Finance
1,102,Smith,2000,Sales
2,103,Nurul,3000,HR
3,104,Tom,1200,IT
4,105,Harry,2800,Marketing
5,106,Smith,3200,Sales


In [75]:
# Keep only the rows where both conditions hold: salary above 1500 AND department is Sales.
high_salary = data['Salary'] > 1500
in_sales = data['Dept'] == 'Sales'
data[high_salary & in_sales]

Unnamed: 0,Id,Name,Salary,Dept
1,102,Smith,2000,Sales
5,106,Smith,3200,Sales


In [76]:
# Pandas
# Pandas is a powerful and flexible Python package that allows you to work with labeled and
# time series data. It also provides statistics methods, enables plotting, and more.
# One crucial feature of pandas is its ability to write and read Excel, CSV, and many other types of files.
 
# Reading An Excel File
# Reading A CSV File
# Reading tab-delimited file
# Reading A JSON File
# Uploading Local Files In Jupyter / Google Colab

In [77]:
# Reading an Excel file
# pd.read_excel() reads an Excel workbook into a DataFrame; here the workbook is fetched from a URL.
excel_url = "https://raw.githubusercontent.com/yash240990/Python/master/sampleExcelData.xlsx"
data = pd.read_excel(excel_url)
data

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,Merle,Draisey,mdraisey0@squarespace.com,Bigender,20.70.199.51
1,2,Casey,Sheach,csheach1@deviantart.com,Male,226.126.229.167
2,3,Darya,Leivers,dleivers2@imgur.com,Female,247.209.88.235
3,4,Fancie,Casebourne,fcasebourne3@g.co,Female,172.34.162.239
4,5,Jenilee,Vsanelli,jvsanelli4@fc2.com,Female,200.104.246.2
...,...,...,...,...,...,...
995,996,Lutero,Yurov,lyurovrn@un.org,Male,95.146.16.187
996,997,Verla,Peery,vpeeryro@com.com,Female,172.90.253.234
997,998,Fran,Langran,flangranrp@ft.com,Male,123.179.99.154
998,999,Gabbi,Cummins,gcumminsrq@jugem.jp,Female,190.13.174.159


In [78]:
# Reading a JSON file: pd.read_json() parses the document at the URL into a DataFrame.
json_url = "https://raw.githubusercontent.com/yash240990/Python/master/simple.json"
data = pd.read_json(json_url)
data

Unnamed: 0,id,name,math,physics,chemistry
0,A001,Tom,60,66,61
1,A002,James,89,76,51
2,A003,Jenny,79,90,78


In [79]:
# Open the workbook once as an ExcelFile so its sheets can be listed and read individually.
workbook_url = "https://raw.githubusercontent.com/yash240990/Python/master/sampleExcelData.xlsx"
excelFile = pd.ExcelFile(workbook_url)
excelFile.sheet_names

['data', 'Sheet2']

In [80]:
# Read one specific worksheet from the already-opened workbook.
data = excelFile.parse(sheet_name='Sheet2')
data

Unnamed: 0,first_name,last_name,email
0,Merle,Draisey,mdraisey0@squarespace.com
1,Casey,Sheach,csheach1@deviantart.com
2,Darya,Leivers,dleivers2@imgur.com
3,Fancie,Casebourne,fcasebourne3@g.co
4,Jenilee,Vsanelli,jvsanelli4@fc2.com
5,Charmian,Drinkhall,cdrinkhall5@theglobeandmail.com
6,Janessa,Kivell,jkivell6@linkedin.com
7,Mirna,Heinsh,mheinsh7@spotify.com
8,Darby,Lax,dlax8@people.com.cn
9,Meryl,Riehm,mriehm9@princeton.edu


In [81]:
# Reading a CSV file: pd.read_csv() accepts a URL as well as a local path.

csv_url = "https://raw.githubusercontent.com/yash240990/Python/master/Position_Salaries.csv"
data = pd.read_csv(csv_url)
data

Unnamed: 0,Position,Level,Salary
0,Business Analyst,1,45000
1,Junior Consultant,2,50000
2,Senior Consultant,3,60000
3,Manager,4,80000
4,Country Manager,5,110000
5,Region Manager,6,150000
6,Partner,7,200000
7,Senior Partner,8,300000
8,C-level,9,500000
9,CEO,10,1000000


In [82]:
# Reading a local file: the path is relative to the notebook's working directory,
# so the file must be uploaded / present there before this cell runs.

local_csv = "sampleData.csv"
data = pd.read_csv(local_csv)
data

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,Gnni,Mourton,gmourton0@amazon.com,Female,47.124.205.97
1,2,Lucio,Paquet,lpaquet1@gov.uk,Male,98.92.52.144
2,3,Renell,Tregust,rtregust2@pinterest.com,Genderfluid,139.140.13.123
3,4,Ranice,Playhill,rplayhill3@ovh.net,Female,219.95.73.247
4,5,Nicolle,Altamirano,naltamirano4@i2i.jp,Female,73.78.28.203
...,...,...,...,...,...,...
995,996,Aleta,Kikke,akikkern@rakuten.co.jp,Female,37.200.29.239
996,997,Dick,Torregiani,dtorregianiro@irs.gov,Male,139.211.118.62
997,998,Alisander,Gilluley,agilluleyrp@virginia.edu,Male,246.68.125.194
998,999,Irwin,Forrington,iforringtonrq@friendfeed.com,Genderqueer,156.23.105.225
