# Beginner's Data Analysis with Pandas

As a first step, we import the libraries required for this example. Here, we use the pandas library.
Pandas is a fast, powerful, flexible, and easy-to-use open-source data analysis and manipulation tool built on top of the Python programming language.
To install pandas, follow the instructions at https://pandas.pydata.org/getting_started.html

In [1]:
import pandas as pd   # Make pandas available under the alias pd.
# Create a DataFrame from a dict that maps each column name to a list of values.
data = {'name': ['Leo', 'Leo', 'Leo', 'Messi', 'Di Caprio'], 'year': [2013, 2014, 2015, 2016, 2015], 'points': [1.5, 1.7, 3.6, 2.4, 2.9]} 
df = pd.DataFrame(data) # The dict keys become the columns; the rows get a default integer index (0..4).
print(df)  # Plain-text rendering of the table.
display(df)  # NOTE(review): display() is not defined in this file; presumably the IPython/Jupyter built-in -- confirm.

        name  year  points
0        Leo  2013     1.5
1        Leo  2014     1.7
2        Leo  2015     3.6
3      Messi  2016     2.4
4  Di Caprio  2015     2.9


Unnamed: 0,name,year,points
0,Leo,2013,1.5
1,Leo,2014,1.7
2,Leo,2015,3.6
3,Messi,2016,2.4
4,Di Caprio,2015,2.9


In [2]:
# Print the row index (the labels along the row direction).
print(df.index) 

RangeIndex(start=0, stop=5, step=1)


In [3]:
# Print the column index (the labels along the column direction).
print(df.columns)  

Index(['name', 'year', 'points'], dtype='object')


In [4]:
# Get the values of the whole DataFrame as a 2-D array (one inner row per DataFrame row).
print(df.values) 

[['Leo' 2013 1.5]
 ['Leo' 2014 1.7]
 ['Leo' 2015 3.6]
 ['Messi' 2016 2.4]
 ['Di Caprio' 2015 2.9]]


In [5]:
# Give a name to the row index and to the column index.
# Both names show up in the header of the printed table.
df.index.name = 'Num' 
df.columns.name = 'Info' 
print(df)

Info       name  year  points
Num                          
0           Leo  2013     1.5
1           Leo  2014     1.7
2           Leo  2015     3.6
3         Messi  2016     2.4
4     Di Caprio  2015     2.9


In [6]:
# Create another DataFrame, df2, from the same data with an explicit column order and custom row labels.
# 'penalty' is not a key of data, so that column is filled with NaN.
df2 = pd.DataFrame(data, columns=['year', 'name', 'points', 'penalty'], 
index=['one', 'two', 'three', 'four', 'five']) 
print(df2) 

       year       name  points penalty
one    2013        Leo     1.5     NaN
two    2014        Leo     1.7     NaN
three  2015        Leo     3.6     NaN
four   2016      Messi     2.4     NaN
five   2015  Di Caprio     2.9     NaN


In [7]:
# describe(): shows summary statistics (count, mean, std, min, quartiles, max)
# for the DataFrame's numeric columns -- here 'year' and 'points'.
print(df2.describe()) 

              year    points
count     5.000000  5.000000
mean   2014.600000  2.420000
std       1.140175  0.864292
min    2013.000000  1.500000
25%    2014.000000  1.700000
50%    2015.000000  2.400000
75%    2015.000000  2.900000
max    2016.000000  3.600000


In [8]:
# Start over with a fresh data set: one list of values per column name.
data = dict(
    names=["Jack", "Jack", "Jack", "Charles", "Charles"],
    year=[2014, 2015, 2016, 2015, 2016],
    points=[1.5, 1.7, 3.6, 2.4, 2.9],
)

# Build the DataFrame with custom row labels and an explicit column order.
# "penalty" has no entry in data, so that column starts out as NaN.
df = pd.DataFrame(
    data,
    index=["one", "two", "three", "four", "five"],
    columns=["year", "names", "points", "penalty"],
)
print(df)

       year    names  points penalty
one    2014     Jack     1.5     NaN
two    2015     Jack     1.7     NaN
three  2016     Jack     3.6     NaN
four   2015  Charles     2.4     NaN
five   2016  Charles     2.9     NaN


In [9]:
print(df['year']) # Select a single column by its name.
print(df.year) # Attribute access is another way to select the same column.
print(df[['year','points']]) # Pass a list of names to select several columns at once.

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64
one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64
       year  points
one    2014     1.5
two    2015     1.7
three  2016     3.6
four   2015     2.4
five   2016     2.9


In [10]:
df['penalty'] = 0.5 # Assigning a single value to a selected column sets it in every row.
print(df) 

       year    names  points  penalty
one    2014     Jack     1.5      0.5
two    2015     Jack     1.7      0.5
three  2016     Jack     3.6      0.5
four   2015  Charles     2.4      0.5
five   2016  Charles     2.9      0.5


In [11]:
# A Python list (or a NumPy array) can also be assigned to a column: one value per row.
df['penalty'] = [0.1, 0.2, 0.3, 0.4, 0.5] 
print(df) 

       year    names  points  penalty
one    2014     Jack     1.5      0.1
two    2015     Jack     1.7      0.2
three  2016     Jack     3.6      0.3
four   2015  Charles     2.4      0.4
five   2016  Charles     2.9      0.5


In [12]:
import numpy as np
# Add another column, named 'zeros', to the DataFrame.
# Despite its name, the column holds the values of np.arange(5), i.e. 0 through 4.
df['zeros'] = np.arange(5)
print(df)

       year    names  points  penalty  zeros
one    2014     Jack     1.5      0.1      0
two    2015     Jack     1.7      0.2      1
three  2016     Jack     3.6      0.3      2
four   2015  Charles     2.4      0.4      3
five   2016  Charles     2.9      0.5      4


In [13]:
# A Series can also be assigned as a column.
# Values are matched to rows by index label; rows without a matching label get NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two','four','five']) 
df['debt'] = val 
print(df) 

       year    names  points  penalty  zeros  debt
one    2014     Jack     1.5      0.1      0   NaN
two    2015     Jack     1.7      0.2      1  -1.2
three  2016     Jack     3.6      0.3      2   NaN
four   2015  Charles     2.4      0.4      3  -1.5
five   2016  Charles     2.9      0.5      4  -1.7


In [14]:
# New columns can be computed from existing ones.
df['net_points'] = df['points'] - df['penalty']  # element-wise difference per row
df['high_points'] = df['net_points'] > 2.0  # boolean column: True where net_points exceeds 2.0
print(df) 

       year    names  points  penalty  zeros  debt  net_points  high_points
one    2014     Jack     1.5      0.1      0   NaN         1.4        False
two    2015     Jack     1.7      0.2      1  -1.2         1.5        False
three  2016     Jack     3.6      0.3      2   NaN         3.3         True
four   2015  Charles     2.4      0.4      3  -1.5         2.0        False
five   2016  Charles     2.9      0.5      4  -1.7         2.4         True


In [15]:
# Delete columns with del, one column per statement.
del df['high_points'] 
del df['net_points'] 
del df['zeros'] 
print(df) 

       year    names  points  penalty  debt
one    2014     Jack     1.5      0.1   NaN
two    2015     Jack     1.7      0.2  -1.2
three  2016     Jack     3.6      0.3   NaN
four   2015  Charles     2.4      0.4  -1.5
five   2016  Charles     2.9      0.5  -1.7


In [16]:
# Name the row index 'Order' and the column index 'Info';
# both names appear in the header of the printed table.
df.index.name = 'Order' 
df.columns.name = 'Info' 
print(df) 

Info   year    names  points  penalty  debt
Order                                      
one    2014     Jack     1.5      0.1   NaN
two    2015     Jack     1.7      0.2  -1.2
three  2016     Jack     3.6      0.3   NaN
four   2015  Charles     2.4      0.4  -1.5
five   2016  Charles     2.9      0.5  -1.7


In [17]:
# Indexing
# Rows and columns can be selected with .loc (by label) or .iloc (by position).
# Here: select the row labelled 'two'.
print(df.loc['two']) 

Info
year       2015
names      Jack
points      1.7
penalty     0.2
debt       -1.2
Name: two, dtype: object


In [18]:
print(df.loc['two':'four', 'points']) 
# In .loc[rows, columns], the part before the comma selects rows and the part after it selects columns.
# A label slice includes both endpoints: rows 'two' through 'four'.

Order
two      1.7
three    3.6
four     2.4
Name: points, dtype: float64


In [19]:
print(df.loc[:,'year']) # All rows of column 'year'; same result as df['year'].

Order
one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64


In [20]:
# All rows, restricted to the listed columns.
print(df.loc[:,['year','names']]) 

Info   year    names
Order               
one    2014     Jack
two    2015     Jack
three  2016     Jack
four   2015  Charles
five   2016  Charles


In [21]:
# Label slices on both axes: rows 'three' through 'five', columns 'year' through 'penalty' (endpoints included).
print(df.loc['three':'five','year':'penalty'])

Info   year    names  points  penalty
Order                                
three  2016     Jack     3.6      0.3
four   2015  Charles     2.4      0.4
five   2016  Charles     2.9      0.5


In [22]:
# Insert a new row by assigning to a row label that does not exist yet.
# The list supplies one value per column, in column order.
# NOTE: in the printed result the 'year' column is shown as floats (e.g. 2014.0) after this assignment.
df.loc['six',:] = [2013,'Jun',4.0,0.1,2.1] 
print(df) 

Info     year    names  points  penalty  debt
Order                                        
one    2014.0     Jack     1.5      0.1   NaN
two    2015.0     Jack     1.7      0.2  -1.2
three  2016.0     Jack     3.6      0.3   NaN
four   2015.0  Charles     2.4      0.4  -1.5
five   2016.0  Charles     2.9      0.5  -1.7
six    2013.0      Jun     4.0      0.1   2.1


In [23]:
# Using .iloc: select by integer position (counting from 0) instead of by label.
# Position 3 is the fourth row, labelled 'four'.
print(df.iloc[3]) 

Info
year          2015
names      Charles
points         2.4
penalty        0.4
debt          -1.5
Name: four, dtype: object


In [24]:
# Position slices exclude the end position.
# First print: rows at positions 3 and 4, columns at positions 0 and 1.
# Second print: all rows, columns at positions 1 through 3.
print(df.iloc[3:5, 0:2]) 
print(df.iloc[:,1:4]) 

Info     year    names
Order                 
four   2015.0  Charles
five   2016.0  Charles
Info     names  points  penalty
Order                          
one       Jack     1.5      0.1
two       Jack     1.7      0.2
three     Jack     3.6      0.3
four   Charles     2.4      0.4
five   Charles     2.9      0.5
six        Jun     4.0      0.1
