# [Pandas](https://pandas.pydata.org/) 
is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

In [1]:
import pandas as pd
import numpy as np

|DATA STRUCTURE|DIMENSIONALITY|SPREADSHEET ANALOG|
|---|---|---|
|Series|1D|Column|
|DataFrame|2D|Single Sheet|
|Panel (deprecated in pandas 0.20, removed in 0.25)|3D|Multiple Sheets|

In [2]:
# Build a Series holding the integers 0..9, then summarise it
s = pd.Series(data=np.arange(10))
s.describe()

count    10.00000
mean      4.50000
std       3.02765
min       0.00000
25%       2.25000
50%       4.50000
75%       6.75000
max       9.00000
dtype: float64

In [3]:
# 5x3 frame filled row by row with 0..14; the columns are labelled A, B, C
df = pd.DataFrame(np.arange(15).reshape(5, 3), columns=list('ABC'))
# Bare last expression (as in the other cells) instead of print(df), so the
# notebook can use its rich display for the frame
df

    A   B   C
0   0   1   2
1   3   4   5
2   6   7   8
3   9  10  11
4  12  13  14


In [4]:
# Create a DataFrame from a list of tuples: each tuple becomes one row.
# No column names are given, so the columns are labelled 0..3.
data = [
    ('p1', 't1', 1, 2),
    ('p1', 't2', 3, 4),
    ('p2', 't1', 5, 6),
    ('p2', 't2', 7, 8),
    ('p2', 't3', 2, 8),
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2,3
0,p1,t1,1,2
1,p1,t2,3,4
2,p2,t1,5,6
3,p2,t2,7,8
4,p2,t3,2,8


In [5]:
# Show the dtype of every column: the string columns (0, 1) are stored as
# object, the integer columns (2, 3) as int64
df.dtypes

0    object
1    object
2     int64
3     int64
dtype: object

In [6]:
# Build a DataFrame from a dict: each key (A, B, C) becomes a column and
# the rows get the default integer index 0..4
df = pd.DataFrame(
    {
        'A': [1, 2, 1, 4, 3],
        'B': [12, 14, 11, 16, 18],
        'C': list('aabab'),
    }
)
df

Unnamed: 0,A,B,C
0,1,12,a
1,2,14,a
2,1,11,b
3,4,16,a
4,3,18,b


In [7]:
# describe() reports summary statistics for the numeric columns only;
# C is not a numerical column, so it is excluded from the output
df.describe()

Unnamed: 0,A,B
count,5.0,5.0
mean,2.2,14.2
std,1.30384,2.863564
min,1.0,11.0
25%,1.0,12.0
50%,2.0,14.0
75%,3.0,16.0
max,4.0,18.0


In [8]:
# For a non-numeric column, describe() reports the number of observations
# (count), the number of distinct values (unique), the most frequent value
# (top, i.e. the mode) and how often that value occurs (freq)
df['C'].describe()

count     5
unique    2
top       a
freq      3
Name: C, dtype: object

# Appending a new row to DataFrame

In [9]:
# Start from an empty DataFrame that only defines the columns A, B, C
df = pd.DataFrame(columns=list('ABC'))
df

Unnamed: 0,A,B,C


In [10]:
# Append a row by assigning a single cell: row label 0 does not exist yet,
# so .loc creates it; B and C receive no value and stay missing
df.loc[0, 'A'] = 1
df

Unnamed: 0,A,B,C
0,1,,


In [11]:
# Append a whole row from a list; the values fill the columns in order
# (A=2, B=3, C=4)
df.loc[1] = [2, 3, 4]
df

Unnamed: 0,A,B,C
0,1,,
1,2,3.0,4.0


In [12]:
# Append a row from a dict whose keys are the column names (the key order
# here differs from the column order).
# NOTE(review): B and C are both 9, so the displayed row cannot show whether
# the values were matched by key or by position — distinct values would make
# this example self-checking.
df.loc[2] = {'A': 3, 'C': 9, 'B': 9}
df

Unnamed: 0,A,B,C
0,1,,
1,2,3.0,4.0
2,3,9.0,9.0


In [13]:
# Caution: '1' is a string label, which is not the same as the existing
# integer label 1, so this adds a NEW row labelled '1' instead of
# overwriting row 1 (see the extra last row in the output).
# To overwrite the existing row, assign to its actual label:
#     df.loc[1] = [5, 6, 7]
df.loc['1'] = [5, 6, 7]
df

Unnamed: 0,A,B,C
0,1,,
1,2,3.0,4.0
2,3,9.0,9.0
1,5,6.0,7.0


# Appending DataFrames

In [15]:
# Stack df_2 underneath df_1.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement and gives the same result here.
# Columns are aligned by name: a column missing from one frame is filled
# with NaN, and each frame keeps its own row labels (0, 1, 0).
df_1 = pd.DataFrame({'A': ['a1', 'a2'], 'B': ['b1', 'b2']})
df_2 = pd.DataFrame({'B': ['b1'], 'C': ['c1']})
df_3 = pd.concat([df_1, df_2])
df_3

Unnamed: 0,A,B,C
0,a1,b1,
1,a2,b2,
0,,b1,c1


In [16]:
# Same concatenation, but ignore_index=True discards the original row
# labels and renumbers the result 0..n-1.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df_4 = pd.concat([df_1, df_2], ignore_index=True)
df_4

Unnamed: 0,A,B,C
0,a1,b1,
1,a2,b2,
2,,b1,c1


# Using .iloc
The iloc indexer for Pandas Dataframe is used for integer-location based
indexing / selection by position

In [22]:
# Sample people table used by the .iloc / .loc examples.
# The column order is pinned explicitly: positional (.iloc) selection
# depends on it, and a plain dict gives insertion order on current
# Python/pandas (First_Name first) rather than the alphabetical order
# that the recorded outputs show.
df = pd.DataFrame(
    {
        'First_Name': ["Ahmed", "Hassan", "Maged", "Omar", "Amr"],
        'Last_Name': ["Osam", "Khaled", "Sayed", "Mohamed", "Ahmed"],
        'Age': [20, 30, 27, 32, 46],
        'City': ['Cairo', 'Mansoura', 'Alex', 'Benha', 'Cairo'],
        'Company': ['IBM', 'DXC', 'Orange', 'DXC', 'Orange'],
    },
    columns=['Age', 'City', 'Company', 'First_Name', 'Last_Name'],
)
df

Unnamed: 0,Age,City,Company,First_Name,Last_Name
0,20,Cairo,IBM,Ahmed,Osam
1,30,Mansoura,DXC,Hassan,Khaled
2,27,Alex,Orange,Maged,Sayed
3,32,Benha,DXC,Omar,Mohamed
4,46,Cairo,Orange,Amr,Ahmed


In [23]:
# Every row (:) of the last column (-1 counts from the end); selecting one
# column by a single integer position returns a Series
df.iloc[:,-1]

0       Osam
1     Khaled
2      Sayed
3    Mohamed
4      Ahmed
Name: Last_Name, dtype: object

# Multiple row and column selections using iloc and DataFrame

In [28]:
# Lists of positions: rows 0, 3, 4 and columns 0, 2, 4
df.iloc[[0,3,4], [0,2,4]]

Unnamed: 0,Age,Company,Last_Name
0,20,IBM,Osam
3,32,DXC,Mohamed
4,46,Orange,Ahmed


In [30]:
# Positional slices exclude the end point: rows 0-1 and columns 2-3
df.iloc[0:2, 2:4]

Unnamed: 0,Company,First_Name
0,IBM,Ahmed
1,DXC,Hassan


# .iloc returns:
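- a Pandas Series when a single row (or a single column) is selected with one integer, e.g. `df.iloc[2]` or `df.iloc[:, -1]`,
- a Pandas DataFrame when multiple rows are selected,
- a Pandas DataFrame when multiple columns are selected in full,
- a Pandas DataFrame if you pass a single-valued list, e.g. `df.iloc[[2]]`.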

In [38]:
# A single integer selects one row and returns it as a Series
df.iloc[2]

Age               27
City            Alex
Company       Orange
First_Name     Maged
Last_Name      Sayed
Name: 2, dtype: object

In [37]:
# A one-element list selects row 2 but keeps the result a DataFrame
df.iloc[[2]]

Unnamed: 0,Age,City,Company,First_Name,Last_Name
2,27,Alex,Orange,Maged,Sayed


In [31]:
# .iloc is strictly positional, so a column label is rejected with a
# ValueError. The error is the point of this cell; it is caught and printed
# so that it does not stop a "Restart & Run All".
try:
    df.iloc[0:2, "First_Name"]
except ValueError as err:
    print("ValueError:", err)

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

# Using .loc
The Pandas loc indexer can be used with DataFrames for two different use cases:
- Selecting rows by label/index
- Selecting rows with a boolean / conditional lookup

In [34]:
# .loc selects by label; unlike .iloc, the label slice 0:2 includes its end
# point, so rows 0, 1 and 2 are returned
df.loc[0:2, "First_Name"]

0     Ahmed
1    Hassan
2     Maged
Name: First_Name, dtype: object

In [35]:
# .loc is label based: the columns are labelled with strings, so the
# integer 1 is not a valid column label. The error is the point of this
# cell; it is caught and printed so that it does not stop a
# "Restart & Run All".
# The recorded output shows a TypeError; KeyError is caught as well because
# newer pandas versions appear to report a missing label that way —
# TODO confirm against the installed version.
try:
    df.loc[0:2, 1]
except (TypeError, KeyError) as err:
    print("{}: {}".format(type(err).__name__, err))

TypeError: cannot do label indexing on <class 'pandas.indexes.base.Index'> with these indexers [1] of <class 'int'>

# Save and Load DataFrames

#### Save dataframe to pickled pandas object (Special Format for PANDAS)
df.to_pickle(file_name) 

#### Load dataframe from pickled pandas object (pandas-specific format; only unpickle files from a source you trust)
df= pd.read_pickle(file_name) 

#### Load dataframe from a text (CSV) file
df=pd.read_csv(file_name)

#### Save dataframe to a text (CSV) file
df.to_csv(file_name)

##### Define the separator as ';'
pd.read_csv('data_file.csv', sep=';') 

##### Use a column (here column 0) as the row index
pd.read_csv('data_file.csv', index_col=0)