In [2]:
pip install -U ydata-profiling




In [3]:
import matplotlib 
import pandas as pd
from ydata_profiling import ProfileReport
import numpy as np


In [4]:
df = pd.read_csv('Football.csv')
df.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016


# A Working with series

### - Creating series

In [7]:
#pd.__version__
lst = [1,2,3,4,5,6]   # series through list
pd.Series(lst)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [8]:
# series through numpy array
arr = np.array([1,2,3,4,5,6])
pd.Series(arr)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int32

In [9]:
# Give index from our side
#pd.Series(data = ['Eshant','Pranjal', 'Ashish','Manish'], index = [1,2,3,4])

In [10]:
pd.Series(index = ['Eshant','Pranjal', 'Ashish','Manish'], data = [1,2,3,4])

Eshant     1
Pranjal    2
Ashish     3
Manish     4
dtype: int64

In [11]:
# Series with dictionary
steps = {'day1' : 4000, 'day2' : 3000, 'day3' : 2000}
pd.Series(steps)

day1    4000
day2    3000
day3    2000
dtype: int64

In [12]:
# Using a repeat function in series and index would be same
pd.Series(5).repeat(3)

0    5
0    5
0    5
dtype: int64

In [13]:
# to make the index accurate
pd.Series(5).repeat(3).reset_index(drop = True)
#type(pd.Series(5).repeat(3).reset_index())

0    5
1    5
2    5
dtype: int64

In [14]:
# 10 should be repeated 5 times and 20 should be repeated 2 times
s = pd.Series([10,20]).repeat([5,2]).reset_index(drop = True)#(drop=True) removes the existing index and resets it to the default integer index.
s

0    10
1    10
2    10
3    10
4    10
5    20
6    20
dtype: int64

### 1 Accessing elements

In [16]:
print(s[1])
print(s[6])

10
20


In [17]:
s[2:-2]   #By last n numbers (start - end-1) by -2 skipping

2    10
3    10
4    10
dtype: int64

### 2 Aggregate function on pandas Series

In [19]:
#Pandas Series.aggregate() function aggregate using one or more operations over the specified axis in the given series object.


In [20]:
sr = pd.Series([1,2,3,4,5,6,7])
#sr.agg([min,max,sum])
print(sr.sum())
print(sr.min())
print(sr.max())

28
1
7


### 3 Series absolute function

In [22]:
sr = pd.Series([1,-2,3,-4,5,-6,7]) #Pandas Series.abs() method is used to get the absolute numeric value of each element in Series/DataFrame

sr.abs()

0    1
1    2
2    3
3    4
4    5
5    6
6    7
dtype: int64

### 4 Appending series

In [24]:
sr1 = pd.Series([1,-2,3])
sr2 = pd.Series([1,2,3])

In [25]:
sr3 = pd.concat([sr1,sr2])
#sr3 = pd.concat([sr2,sr1])
sr3.reset_index(drop = True) # without (drop = True) it will produce dataframe

0    1
1   -2
2    3
3    1
4    2
5    3
dtype: int64

### 5 Astype function

Pandas astype() is the one of the most important methods. It is used to change data type of a series. When data frame is made from a csv file, the columns are imported and data type is set automatically which many times is not what it actually should have.

In [28]:
print(sr1)
print(type(sr1))
print(type(sr1[0]))
print(sr1.astype('float'))

0    1
1   -2
2    3
dtype: int64
<class 'pandas.core.series.Series'>
<class 'numpy.int64'>
0    1.0
1   -2.0
2    3.0
dtype: float64


### 6 Between function

Pandas between() method is used on series to check which values lie between first and second argument

In [31]:
sr1 = pd.Series([1,2,30,4,5,6,7,8,9,20])
sr1.reset_index(drop= True)

0     1
1     2
2    30
3     4
4     5
5     6
6     7
7     8
8     9
9    20
dtype: int64

In [32]:
sr1.between(10,50)

0    False
1    False
2     True
3    False
4    False
5    False
6    False
7    False
8    False
9     True
dtype: bool

### 7 All strings functions can be used to extract or modify texts in a series
##### Upper and Lower Function
##### Len function
##### Strip Function
##### Split Function
##### Contains Function
##### Replace Function
##### Count Function
##### Startswith and Endswith Function
##### Find Finction

In [35]:
ser = pd.Series(["Eshant Das" , "Data Science" , "Geeks for Geeks" , 'Hello World' , 'Machine Learning'])

In [36]:
#Upper and Lower Function
print(ser.str.upper())
print('-'*30)
print(ser.str.lower())

0          ESHANT DAS
1        DATA SCIENCE
2     GEEKS FOR GEEKS
3         HELLO WORLD
4    MACHINE LEARNING
dtype: object
------------------------------
0          eshant das
1        data science
2     geeks for geeks
3         hello world
4    machine learning
dtype: object


In [37]:
# Length  function
for i in ser:
    print(len(i))

10
12
15
11
16


In [38]:
# Strip function
ser = pd.Series(["  Eshant Das" , "Data Science" , "Geeks for Geeks" , 'Hello World' , 'Machine Learning  '])

for i in ser:
    print(i , len(i))

  Eshant Das 12
Data Science 12
Geeks for Geeks 15
Hello World 11
Machine Learning   18


2 extra spaces has been removed|

In [40]:
ser = ser.str.strip()

for i in ser:
    print(i , len(i))

Eshant Das 10
Data Science 12
Geeks for Geeks 15
Hello World 11
Machine Learning 16


In [41]:
# Split function
print(ser.str.split())
print(ser.str.split()[0])
print(ser.str.split()[1])

0          [Eshant, Das]
1        [Data, Science]
2    [Geeks, for, Geeks]
3         [Hello, World]
4    [Machine, Learning]
dtype: object
['Eshant', 'Das']
['Data', 'Science']


In [42]:
print(pd.Series(['10/2/2981','10/3/2981','10/4/2981']).str.split('/'))
print(pd.Series(['10/2/2981','10/3/2981','10/4/2981']).str.split('/')[0])
print(pd.Series(['10/2/2981','10/3/2981','10/4/2981']).str.split('/')[1])
print(pd.Series(['10/2/2981','10/3/2981','10/4/2981']).str.split('/')[2])


0    [10, 2, 2981]
1    [10, 3, 2981]
2    [10, 4, 2981]
dtype: object
['10', '2', '2981']
['10', '3', '2981']
['10', '4', '2981']


In [43]:
# Contains function
ser = pd.Series(["Eshant Das","Data@Science","Geeks for Geeks",'Hello@World','Machine Learning'])

ser.str.contains('@')

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [44]:
#Replace function
ser.str.replace('@',' ')

0          Eshant Das
1        Data Science
2     Geeks for Geeks
3         Hello World
4    Machine Learning
dtype: object

In [45]:
#Count function
print(ser.str.count('a'))
print(ser.str.count('Eshant Das'))

0    2
1    2
2    0
3    0
4    2
dtype: int64
0    1
1    0
2    0
3    0
4    0
dtype: int64


In [46]:
# Startswith
ser.str.startswith('D')

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [47]:
# Endswith
ser.str.endswith('s')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [48]:
# Find function
ser = pd.Series(["Eshant Das","Data@Science","Geeks for Geeks",'Hello@World','Machine Learning'])
ser.str.find('Geeks')

0   -1
1   -1
2    0
3   -1
4   -1
dtype: int64

### 8 Converting a series to list

In [50]:
#Pandas tolist() is used to convert a series to list. Initially the series is of type pandas.core.series.
ser.to_list()

['Eshant Das',
 'Data@Science',
 'Geeks for Geeks',
 'Hello@World',
 'Machine Learning']

# B Data Frames

### 1 Creating Data frames

In the real world, a Pandas DataFrame will be created by loading the datasets from existing storage, storage can be SQL Database, CSV file, and Excel file. Pandas DataFrame can be created from the lists, dictionary, and from a list of dictionary etc. Dataframe can be created in different ways here are some ways by which we create a dataframe:

##### from list

In [55]:
#DataFrame can be created using a single list or a list of lists
lst = ['Geeks', 'For', 'Geeks', 'is', 'portal', 'for', 'Geeks']

pd.DataFrame(lst)

Unnamed: 0,0
0,Geeks
1,For
2,Geeks
3,is
4,portal
5,for
6,Geeks


In [56]:
print(pd.DataFrame(lst))
type(pd.DataFrame(lst))

        0
0   Geeks
1     For
2   Geeks
3      is
4  portal
5     for
6   Geeks


pandas.core.frame.DataFrame

##### From multi-dimension list

In [58]:
lst = [['tom',10],['jerry',12],['spike',14]]
pd.DataFrame(lst)

Unnamed: 0,0,1
0,tom,10
1,jerry,12
2,spike,14


To create DataFrame from dict of narray/list, all the narray must be of same length. If index is passed then the length index should be equal to the length of arrays. If no index is passed, then by default, index will be range(n) where n is the array length.

In [70]:
data = {'name':['Tom', 'nick', 'krish', 'jack'], 'age':[20, 21, 19, 18]}
print(pd.DataFrame(data))

    name  age
0    Tom   20
1   nick   21
2  krish   19
3   jack   18


##### From multiple dictionary values creating data frames

In [85]:
data = { 'Name'         :['Jai', 'Princi', 'Gaurav', 'Anuj'],
         'Age'          :[27, 24, 22, 32],
         'Address'      :['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'],
         'Qualification':['Msc', 'MA', 'MCA', 'Phd']}
df = pd.DataFrame(data)
print(df)
print(df[['Name','Age']])

     Name  Age    Address Qualification
0     Jai   27      Delhi           Msc
1  Princi   24     Kanpur            MA
2  Gaurav   22  Allahabad           MCA
3    Anuj   32    Kannauj           Phd
     Name  Age
0     Jai   27
1  Princi   24
2  Gaurav   22
3    Anuj   32


### 2 Slicing in df using iloc and loc

###### Pandas comprises many methods for its proper functioning. loc() and iloc() are one of those methods. These are used in slicing data from the Pandas DataFrame. They help in the convenient selection of data from the DataFrame in Python. They are used in filtering the data according to some conditions.



In [102]:
data = {'one'   : pd.Series([1, 2, 3, 4]),
        'two'   : pd.Series([10, 20, 30, 40]),
        'three' : pd.Series([100, 200, 300, 400]),
        'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


#### Basic loc function
##### The loc() function is label based data selecting method which means that we have to pass the name of the row or column which we want to select. This method includes the last element of the range passed in it, unlike iloc(). loc() can accept the boolean data unlike iloc(). Many operations can be performed using the loc() method like

In [107]:
print(df.loc[0:])
print('*'*25)
print(df.loc[1:3])
print('*'*25)
print(df.loc[:2])
print('*'*25)
print(df.loc[1:2, 'two' : 'three']) # selecting specific columns

   one  two  three  four
0    1   10    100  1000
1    2   20    200  2000
2    3   30    300  3000
3    4   40    400  4000
*************************
   one  two  three  four
1    2   20    200  2000
2    3   30    300  3000
3    4   40    400  4000
*************************
   one  two  three  four
0    1   10    100  1000
1    2   20    200  2000
2    3   30    300  3000
*************************
   two  three
1   20    200
2   30    300


#### Basic iloc function
##### The iloc() function is an indexed-based selecting method which means that we have to pass an integer index in the method to select a specific row/column. This method does not include the last element of the range passed in it unlike loc(). iloc() does not accept the boolean data unlike loc().

In [110]:
df.iloc[1 : -1, 1:-1 ] # removing first and last

Unnamed: 0,two,three
1,20,200
2,30,300


In [112]:
# you can see index 3 of both row and column has not been added here so 1 was inclusize but 3 is exclusive in the case of ilocs
# another example
print(df.iloc[:,2:3])
print(df.iloc[:,:])

   three
0    100
1    200
2    300
3    400
   one  two  three  four
0    1   10    100  1000
1    2   20    200  2000
2    3   30    300  3000
3    4   40    400  4000


In [114]:
# select specific rows
df.iloc[[0,2],[1,3]]

Unnamed: 0,two,four
0,10,1000
2,30,3000


#### slicing based on conditions

In [117]:
#So we could extract only those data for which the value is more than 20
#For the columns we have used comma(,) to extract specifc columns which is 'three' and 'four'

In [119]:
df.loc[df['two'] > 20, ['three','four']] # Using Conditions works with loc basically

Unnamed: 0,three,four
2,300,3000
3,400,4000


In [121]:
df.loc[df['three'] < 300, ['one','four']]

Unnamed: 0,one,four
0,1,1000
1,2,2000


### 3 Column addition

In [124]:
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


We can add a column in many ways. Let us discuss three ways how we can add column here

- Using List 
- Using Pandas Series
- Using an existing Column (we can modify that column in the way we want and that modified part can also be displayed)

In [127]:
l = [22,33,44,55]   # using list
df['five'] = l
df

Unnamed: 0,one,two,three,four,five
0,1,10,100,1000,22
1,2,20,200,2000,33
2,3,30,300,3000,44
3,4,40,400,4000,55


In [129]:
sr = pd.Series([11,222,333,222])  # using pandas series
df['six'] = sr
df

Unnamed: 0,one,two,three,four,five,six
0,1,10,100,1000,22,11
1,2,20,200,2000,33,222
2,3,30,300,3000,44,333
3,4,40,400,4000,55,222


In [131]:
df['seven'] = df['one'] * 20
df

Unnamed: 0,one,two,three,four,five,six,seven
0,1,10,100,1000,22,11,20
1,2,20,200,2000,33,222,40
2,3,30,300,3000,44,333,60
3,4,40,400,4000,55,222,80


### 4 Column deletion

In [134]:
# using del
del df['six']
df

Unnamed: 0,one,two,three,four,five,seven
0,1,10,100,1000,22,20
1,2,20,200,2000,33,40
2,3,30,300,3000,44,60
3,4,40,400,4000,55,80


In [136]:
# using pop
df.pop('five')

df

Unnamed: 0,one,two,three,four,seven
0,1,10,100,1000,20
1,2,20,200,2000,40
2,3,30,300,3000,60
3,4,40,400,4000,80


### 5 Adding of rows

##### In a Pandas DataFrame, you can add rows by using the concat method. You can also create a new DataFrame with the desired row values and use the concat to add the new row to the original dataframe. Here's an example of adding a single row to a dataframe:

In [140]:
df1 = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])

In [147]:
df3 = pd.concat([df1,df2], ignore_index=True)

df3

Unnamed: 0,a,b
0,1,2
1,3,4
2,5,6
3,7,8


### 6 Pandas drop function

Python is a great language for doing data analysis, primarily because of the fantastic ecosystem of data-centric Python packages. Pandas is one of those packages and makes importing and analyzing data much easier.

Pandas provide data analysts a way to delete and filter data frame using .drop() method. Rows or columns can be removed using index label or column name using this method.

Syntax: DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors=’raise’)

Parameters:

labels: String or list of strings referring row or column name. axis: int or string value, 0 ‘index’ for Rows and 1 ‘columns’ for Columns. index or columns: Single label or list. index or columns are an alternative to axis and cannot be used together. level: Used to specify level in case data frame is having multiple level index. inplace: Makes changes in original Data Frame if True. errors: Ignores error if any value from the list doesn’t exists and drops rest of the values when errors = ‘ignore’

Return type: Dataframe with dropped values

In [181]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}


df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [183]:
df.drop([0,1], axis = 0, inplace = True) #axis =0 => Rows (row wise) and inplace = True used for saving the dataframe
df

Unnamed: 0,one,two,three,four
2,3,30,300,3000
3,4,40,400,4000


In [185]:
df.drop(['one','three'], axis = 1, inplace = True) #axis =1 => Columns (column wise) and inplace = True used for saving the dataframe
df

Unnamed: 0,two,four
2,30,3000
3,40,4000


### 7 Transpose of a dataframe

In [188]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [191]:
df.T

Unnamed: 0,0,1,2,3
one,1,2,3,4
two,10,20,30,40
three,100,200,300,400
four,1000,2000,3000,4000


### 8 DataFrame different functionalities

- **Axes function**
- The .axes attribute in a Pandas DataFrame returns a list with the row and column labels of the DataFrame. The first element of the list is the row labels (index), and the second element is the column labels.

In [219]:
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [221]:
df.axes

[RangeIndex(start=0, stop=4, step=1),
 Index(['one', 'two', 'three', 'four'], dtype='object')]

- **Ndim function**
- The .ndim attribute in a Pandas DataFrame returns the number of dimensions of the dataframe, which is always 2 for a DataFrame (row-and-column format).



In [224]:
df.ndim

2

- **dtypes function**
- The .dtypes attribute in a Pandas DataFrame returns the data types of the columns in the DataFrame. The result is a Series with the column names as index and the data types of the columns as values.

In [238]:
df.dtypes

one      int64
two      int64
three    int64
four     int64
dtype: object

- **shape function**
- The .shape attribute in a Pandas DataFrame returns the dimensions (number of rows, number of columns) of the DataFrame as a tuple.

In [242]:
df.shape

(4, 4)

- **head() function**
- The .head() method in a Pandas DataFrame returns the first n rows (by default, n=5) of the DataFrame. This method is useful for quickly examining the first few rows of a large DataFrame to get a sense of its structure and content.

In [263]:
d = { 'Name'  :pd.Series(['Tom','Jerry','Spike','Popeye','Olive','Bluto','Mickey']),
      'Age'   :pd.Series([10,12,14,30,28,33,15]),
      'Height':pd.Series([3.25,1.11,4.12,5.47,6.15,6.67,2.61])}

df = pd.DataFrame(d)
df

Unnamed: 0,Name,Age,Height
0,Tom,10,3.25
1,Jerry,12,1.11
2,Spike,14,4.12
3,Popeye,30,5.47
4,Olive,28,6.15
5,Bluto,33,6.67
6,Mickey,15,2.61


In [265]:
print(df.head())  # head function is used for 5 values
print(df.head(3))

     Name  Age  Height
0     Tom   10    3.25
1   Jerry   12    1.11
2   Spike   14    4.12
3  Popeye   30    5.47
4   Olive   28    6.15
    Name  Age  Height
0    Tom   10    3.25
1  Jerry   12    1.11
2  Spike   14    4.12


- **tail function**
- The .tail() method in a Pandas DataFrame returns the last n rows (by default, n=5) of the DataFrame. This method is useful for quickly examining the last few rows of a large DataFrame to get a sense of its structure and content.

In [268]:
print(df.tail())
print(df.empty)

     Name  Age  Height
2   Spike   14    4.12
3  Popeye   30    5.47
4   Olive   28    6.15
5   Bluto   33    6.67
6  Mickey   15    2.61
False


- **Empty function**
- The .empty attribute in a Pandas DataFrame returns a Boolean value indicating whether the DataFrame is empty or not. A DataFrame is considered empty if it has no rows.

In [271]:
df = pd.DataFrame()

df.empty

True

### 9 Statistical or mathematical function

Sum
    $\;\;\;\;\;\;$  $\;\;\;\;\;\;$ Mean
    $\;\;\;\;\;\;$  $\;\;\;\;\;\;$ Median
    $\;\;\;\;\;\;$  $\;\;\;\;\;\;$ Mode
    $\;\;\;\;\;\;$  $\;\;\;\;\;\;$ Variance
    $\;\;\;\;\;\;$  $\;\;\;\;\;\;$ Min
    $\;\;\;\;\;\;$  $\;\;\;\;\;\;$ Max
    $\;\;\;\;\;\;$  $\;\;\;\;\;\;$ Standard Deviation

In [275]:
data = {'one'   : pd.Series([1, 2, 3, 4]),
        'two'   : pd.Series([10, 20, 30, 40]),
        'three' : pd.Series([100, 200, 300, 400]),
        'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


- **Sum**

In [278]:
df.sum()

one         10
two        100
three     1000
four     10000
dtype: int64

In [282]:
df.sum(axis = 1)  # column wise

0    1111
1    2222
2    3333
3    4444
dtype: int64

- **Mean**

In [285]:
df.mean()

one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64

In [287]:
df.mean(axis =  1)

0     277.75
1     555.50
2     833.25
3    1111.00
dtype: float64

- **Median**

In [290]:
df.median()

one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64

- **Mode**

In [314]:
de = pd.DataFrame({'A': [1, 2, 2, 3, 4, 4, 4, 5], 'B': [10, 20, 20, 30, 40, 40, 50, 60]})
print(de)
print('A' , de['A'].mode())
print('B' , de['B'].mode())

   A   B
0  1  10
1  2  20
2  2  20
3  3  30
4  4  40
5  4  40
6  4  50
7  5  60
A 0    4
Name: A, dtype: int64
B 0    20
1    40
Name: B, dtype: int64


- **Variance**

In [317]:
df.var()

one      1.666667e+00
two      1.666667e+02
three    1.666667e+04
four     1.666667e+06
dtype: float64

- **Min**

In [323]:
print(df)
print(df.min(axis = 0))
print(df.min(axis = 1))

   one  two  three  four
0    1   10    100  1000
1    2   20    200  2000
2    3   30    300  3000
3    4   40    400  4000
one         1
two        10
three     100
four     1000
dtype: int64
0    1
1    2
2    3
3    4
dtype: int64


- **Max**

In [332]:
print(df)
print(df.max(axis = 0))  #column wise  HERE IT IS OPPOSITE USE OF AXIS = 0,1
print(df.max(axis = 1))  # row wise

   one  two  three  four
0    1   10    100  1000
1    2   20    200  2000
2    3   30    300  3000
3    4   40    400  4000
one         4
two        40
three     400
four     4000
dtype: int64
0    1000
1    2000
2    3000
3    4000
dtype: int64


- **Standard deviation**

In [335]:
df.std()

one         1.290994
two        12.909944
three     129.099445
four     1290.994449
dtype: float64

### 10 Describe function

The describe() method in a Pandas DataFrame returns descriptive statistics of the data in the DataFrame. It provides a quick summary of the central tendency, dispersion, and shape of the distribution of a set of numerical data.

The default behavior of describe() is to compute descriptive statistics for all numerical columns in the DataFrame. If you want to compute descriptive statistics for a specific column, you can pass the name of the column as an argument.

In [343]:
data = {'one'  : pd.Series([1, 2, 3, 4]),
        'two'  : pd.Series([10, 20, 30, 40]),
        'three': pd.Series([100, 200, 300, 400]),
        'four' : pd.Series([1000, 2000, 3000, 4000]),
        'five' : pd.Series(['A','B','C','D'])}


df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four,five
0,1,10,100,1000,A
1,2,20,200,2000,B
2,3,30,300,3000,C
3,4,40,400,4000,D


In [345]:
df.describe()

Unnamed: 0,one,two,three,four
count,4.0,4.0,4.0,4.0
mean,2.5,25.0,250.0,2500.0
std,1.290994,12.909944,129.099445,1290.994449
min,1.0,10.0,100.0,1000.0
25%,1.75,17.5,175.0,1750.0
50%,2.5,25.0,250.0,2500.0
75%,3.25,32.5,325.0,3250.0
max,4.0,40.0,400.0,4000.0


### 11 Pipe functions

##### Pipe function

The pipe() method in a Pandas DataFrame allows you to apply a function to the DataFrame, similar to the way the apply() method works. The difference is that pipe() allows you to chain multiple operations together by passing the output of one function to the input of the next function.

In [417]:
data = {'one'  : pd.Series([1, 2, 3, 4]),
        'two'  : pd.Series([10, 20, 30, 40]),
        'three': pd.Series([100, 200, 300, 400]),
        'four' : pd.Series([1000, 2000, 3000, 4000])}
def add_(i,j):
    return i + j
def sub_(i,j):
    return i - j    


df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [419]:
df.pipe(add_, 10)   #Add function

Unnamed: 0,one,two,three,four
0,11,20,110,1010
1,12,30,210,2010
2,13,40,310,3010
3,14,50,410,4010


In [421]:
df.pipe(sub_, 10)  #Subract function

Unnamed: 0,one,two,three,four
0,-9,0,90,990
1,-8,10,190,1990
2,-7,20,290,2990
3,-6,30,390,3990


In [423]:
def mean_(col):
    return col.mean()
def square(i):  
    return i ** 2

In [425]:
print(df.pipe(mean_))
print(df.pipe(mean_).pipe(square))

one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64
one            6.25
two          625.00
three      62500.00
four     6250000.00
dtype: float64


In [427]:
df.pipe(square)

Unnamed: 0,one,two,three,four
0,1,100,10000,1000000
1,4,400,40000,4000000
2,9,900,90000,9000000
3,16,1600,160000,16000000


##### Apply function
The apply() method in a Pandas DataFrame allows you to apply a function to the DataFrame, either to individual elements or to the entire DataFrame. The function can be either a built-in Python function or a user-defined function.

In [452]:
data = {'one'  : pd.Series([1, 2, 3, 4]),
        'two'  : pd.Series([10, 20, 30, 40]),
        'three': pd.Series([100, 200, 300, 400]),
        'four' : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [454]:
print(df.apply(np.mean))
print(df.apply(np.max))
print(df.apply(np.min))

one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64
one         4
two        40
three     400
four     4000
dtype: int64
one         1
two        10
three     100
four     1000
dtype: int64


In [456]:
df.apply(lambda x: x.max() - x.min())

one         3
two        30
three     300
four     3000
dtype: int64

##### Appply map function
The map() method in a Pandas DataFrame allows you to apply a function to each element of a specific column of the DataFrame. The function can be either a built-in Python function or a user-defined function.

In [461]:
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [475]:
df.map(lambda x: x*100)

Unnamed: 0,one,two,three,four
0,100,1000,10000,100000
1,200,2000,20000,200000
2,300,3000,30000,300000
3,400,4000,40000,400000


applymap and apply are both functions in the pandas library used for applying a function to elements of a pandas DataFrame or Series.

applymap is used to apply a function to every element of a DataFrame. It returns a new DataFrame where each element has been modified by the input function.

apply is used to apply a function along any axis of a DataFrame or Series. It returns either a Series or a DataFrame, depending on the axis along which the function is applied and the return value of the function. Unlike applymap, apply can take into account the context of the data, such as the row or column label.

So, applymap is meant for element-wise operations while apply can be used for both element-wise and row/column-wise operations.

In [503]:
df = pd.DataFrame({ 'A': [1.2, 3.4, 5.6], 
                    'B': [7.8, 9.1, 2.3]})
print(df)
df_1 = df.map(np.int64)
print(df_1)

df_2 = df.apply(lambda row : row.mean(), axis = 0)
print(df_2)
df_3 = df.apply(lambda row : row.mean(), axis = 1)
print(df_3)

     A    B
0  1.2  7.8
1  3.4  9.1
2  5.6  2.3
   A  B
0  1  7
1  3  9
2  5  2
A    3.4
B    6.4
dtype: float64
0    4.50
1    6.25
2    3.95
dtype: float64


### 12 Reindex function

The reindex function in Pandas is used to change the row labels and/or column labels of a DataFrame. This function can be used to align data from multiple DataFrames or to update the labels based on new data. The function takes in a list or an array of new labels as its first argument and, optionally, a fill value to replace any missing values. The reindexing can be done along either the row axis (0) or the column axis (1). The reindexed DataFrame is returned.

**This is row wise re indexing** 

In [518]:
data = {'one'  : pd.Series([1, 2, 3, 4]),
        'two'  : pd.Series([10, 20, 30, 40]),
        'three': pd.Series([100, 200, 300, 400]),
        'four' : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
print(df)
print("*"*25)
print(df.reindex([1,2,3,0]))

   one  two  three  four
0    1   10    100  1000
1    2   20    200  2000
2    3   30    300  3000
3    4   40    400  4000
*************************
   one  two  three  four
1    2   20    200  2000
2    3   30    300  3000
3    4   40    400  4000
0    1   10    100  1000


**This is column wise indexing**

In [533]:
data = {'Name' : ['John', 'Jane', 'Jim', 'Joan'],
        'Age'  : [25, 30, 35, 40],
        'City' : ['New York', 'Los Angeles', 'Chicago', 'Houston']}

df = pd.DataFrame(data)   #original
print("Orignal")
print(df)
print("*"*25)
print("Rows and columns wise re-indexing")
df1 = df.reindex([1,2,3,0], columns = ['Name','City','Age'])  # both  rows and  columns re-indexing
print(df1)
print("*"*25)
print("Columns-wise")
df2 = df.reindex(columns = ['Name','City','Age'])  # column wise
print(df2)

Orignal
   Name  Age         City
0  John   25     New York
1  Jane   30  Los Angeles
2   Jim   35      Chicago
3  Joan   40      Houston
*************************
Rows and columns wise re-indexing
   Name         City  Age
1  Jane  Los Angeles   30
2   Jim      Chicago   35
3  Joan      Houston   40
0  John     New York   25
*************************
Columns-wise
   Name         City  Age
0  John     New York   25
1  Jane  Los Angeles   30
2   Jim      Chicago   35
3  Joan      Houston   40


### 13 Renaming columns
The rename function in Pandas is used to change the row labels and/or column labels of a DataFrame. It can be used to update the names of one or multiple rows or columns by passing a dictionary of new names as its argument. The dictionary should have the old names as keys and the new names as values

In [537]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [539]:
df.rename(columns = {'one' : 'One','two': 'Two', 'three' : 'Three', 'four' : 'Four'}, 
           inplace = True, index = {0:'a',1:'b',2:'c',4:'d'})
df

Unnamed: 0,One,Two,Three,Four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
3,4,40,400,4000


### 14 Sorting 
Pandas provides several methods to sort a DataFrame based on one or more columns.

- sort_values: This method sorts the DataFrame based on one or more columns. The default sorting order is ascending, but you can change it to descending by passing the ascending argument with a value of False. bash

In [547]:
data = { 'one'   : pd.Series([11, 51, 31, 41]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 500, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,11,10,100,1000
1,51,20,200,2000
2,31,30,500,3000
3,41,40,400,4000


- **Sort with  respect to specific column**

In [558]:
df.sort_values(by = 'one')

Unnamed: 0,one,two,three,four
0,11,10,100,1000
2,31,30,500,3000
3,41,40,400,4000
1,51,20,200,2000


- **Sort in specific order**

In [570]:
df.sort_values(by = 'one' , ascending = False)

Unnamed: 0,one,two,three,four
1,51,20,200,2000
3,41,40,400,4000
2,31,30,500,3000
0,11,10,100,1000


- **Sort in specific order based on multiple columns**

In [579]:
df.sort_values(by = ['one', 'two'])

Unnamed: 0,one,two,three,four
0,11,10,100,1000
2,31,30,500,3000
3,41,40,400,4000
1,51,20,200,2000


- **Sort in specific order based on sorting  algorithms**
- quicksort
- mergesort
- heapsort

In [582]:
df.sort_values(by = 'one', kind = 'heapsort')

Unnamed: 0,one,two,three,four
0,11,10,100,1000
2,31,30,500,3000
3,41,40,400,4000
1,51,20,200,2000


### 15 Groupby function
The groupby function in pandas is used to split a dataframe into groups based on one or more columns. It returns a DataFrameGroupBy object, which is similar to a DataFrame but has some additional methods to perform operations on the grouped data.

In [652]:
cricket = {'Team'   : ['India', 'India', 'Australia', 'Australia', 'SA', 'SA', 'SA', 'SA', 'NZ', 'NZ', 'NZ', 'India'],
           'Rank'   : [2, 3, 1,2, 3,4 ,1 ,1,2 , 4,1,2],
           'Year'   : [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
           'Points' : [876,801,891,815,776,784,834,824,758,691,883,782]}

df = pd.DataFrame(cricket)
df

Unnamed: 0,Team,Rank,Year,Points
0,India,2,2014,876
1,India,3,2015,801
2,Australia,1,2014,891
3,Australia,2,2015,815
4,SA,3,2014,776
5,SA,4,2015,784
6,SA,1,2016,834
7,SA,1,2017,824
8,NZ,2,2016,758
9,NZ,4,2014,691


In [654]:
df.groupby('Team').groups

{'Australia': [2, 3], 'India': [0, 1, 11], 'NZ': [8, 9, 10], 'SA': [4, 5, 6, 7]}

- Austrealia is present in index 2 and 3
- India is present in index 0,1 and 11 and so on

- **To search for specific country with specific year**

In [658]:
df.groupby(['Team','Year']).get_group(('Australia',2014))

Unnamed: 0,Team,Rank,Year,Points
2,Australia,1,2014,891


`If the data is not present then we will be getting an error`

- **Adding some statistical computation on top of groupby**

In [662]:
df.groupby('Team').sum()['Points']

Team
Australia    1706
India        2459
NZ           2332
SA           3218
Name: Points, dtype: int64

This means we have displayed the teams which are having the maximum sum in Poitns

- **Let us sort it to get it in a better way**

In [666]:
df.groupby('Team').sum()['Points'].sort_values(ascending = False)

Team
SA           3218
India        2459
NZ           2332
Australia    1706
Name: Points, dtype: int64

- **Checking multiple stats for points team wise**

In [669]:
groups = df.groupby('Team')

groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])

  groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])
  groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])
  groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])
  groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])
  groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])


Unnamed: 0_level_0,sum,mean,std,max,min
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Australia,1706,853.0,53.740115,891,815
India,2459,819.666667,49.702448,876,782
NZ,2332,777.333333,97.449132,883,691
SA,3218,804.5,28.769196,834,776


- **filter function along with groupby**

In [672]:
print(df.groupby('Team').filter(lambda x : len(x) == 4))

  Team  Rank  Year  Points
4   SA     3  2014     776
5   SA     4  2015     784
6   SA     1  2016     834
7   SA     1  2017     824


- The data of South Africa are present equal to 4 times that is why South Africa is being displayed here

In [675]:
df.groupby('Team').filter(lambda x : len(x) == 3)

Unnamed: 0,Team,Rank,Year,Points
0,India,2,2014,876
1,India,3,2015,801
8,NZ,2,2016,758
9,NZ,4,2014,691
10,NZ,1,2015,883
11,India,2,2017,782


- The data of India and New Zealand are present 3 times so that is why they are being displayed here

In [678]:
df.groupby('Team').filter(lambda x : len(x) == 2)

Unnamed: 0,Team,Rank,Year,Points
2,Australia,1,2014,891
3,Australia,2,2015,815


- The data of Australia is present 2 times so that is why they are being displayed here

# C Working with csv files and basic data Analysis Using Pandas

### 1 Reading CSV

In [705]:
df = pd.read_csv('Football.csv')

df.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016


- Reading CSV files from github repositories
- **NOTE**: The link of the page should be copied when the file is in raw format

In [708]:
link = 'https://raw.githubusercontent.com/AshishJangra27/Data-Analysis-with-Python-GFG/main/3.%20Data%20Preprocessing%20-%20Removing%20Null%20Value%20Rows/googleplaystore.csv'

#df = pd.read_csv(link)
#df.head()

### 2 Pandas info() function
Pandas dataframe.info() function is used to get a concise summary of the dataframe. It comes really handy when doing exploratory analysis of the data. To get a quick overview of the dataset we use the dataframe.info() function.

Syntax: DataFrame.info(verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None)

In [711]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Country                  660 non-null    object 
 1   League                   660 non-null    object 
 2   Club                     626 non-null    object 
 3   Player Names             660 non-null    object 
 4   Matches_Played           660 non-null    int64  
 5   Substitution             660 non-null    int64  
 6   Mins                     660 non-null    int64  
 7   Goals                    660 non-null    int64  
 8   xG                       660 non-null    float64
 9   xG Per Avg Match         660 non-null    float64
 10  Shots                    660 non-null    int64  
 11  OnTarget                 660 non-null    int64  
 12  Shots Per Avg Match      660 non-null    float64
 13  On Target Per Avg Match  660 non-null    float64
 14  Year                     6

### 3 isnull() function to check if there are nan values present

In [722]:
df.isnull().sum()

Country                     0
League                      0
Club                       34
Player Names                0
Matches_Played              0
Substitution                0
Mins                        0
Goals                       0
xG                          0
xG Per Avg Match            0
Shots                       0
OnTarget                    0
Shots Per Avg Match         0
On Target Per Avg Match     0
Year                        0
dtype: int64

### 4 Quantile function to get the specific percentile value
Let us check the 80 percentile value of each columns using describe function first

In [726]:
df.describe(percentiles = [.80])

Unnamed: 0,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,22.371212,3.224242,2071.416667,11.810606,10.089606,0.476167,64.177273,28.365152,2.948015,1.315652,2018.363636
std,9.754658,3.839498,900.595049,6.075315,5.724844,0.192831,34.941622,16.363149,0.914906,0.474239,1.3677
min,2.0,0.0,264.0,2.0,0.71,0.07,5.0,2.0,0.8,0.24,2016.0
50%,24.0,2.0,2245.5,11.0,9.285,0.435,62.0,26.0,2.845,1.25,2019.0
80%,32.0,6.0,2915.8,15.0,14.076,0.61,90.0,39.0,3.6,1.63,2020.0
max,38.0,26.0,4177.0,42.0,32.54,1.35,208.0,102.0,7.2,3.63,2020.0


`so we can see the 80th Percentile value of Mins is 2915.80`

- Let us use the quantile function to get the exact value now

In [731]:
df['Mins'].quantile(.80)

2915.8

`Here we go, we got the same value`

- To get the 99 percentile value we can write

In [736]:
df['Mins'].quantile(.99)

3520.0199999999995

**`This function is important as it can be used to treat ourliers in Data Science EDA process`**

### 5 Copy function
If we normal do:
de=df
Then a change in de will affect the data of df as well so we need to copy in such a way that it creates a totally new object and does not affect the old dataframe

In [744]:
de = df.copy()
de.head(3)

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016


In [746]:
de['Year+100'] = de['Year'] + 100
de.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year,Year+100
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016,2116
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016,2116
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016,2116
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016,2116
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016,2116


`So we can see a new column has been added here but our old data is secured`

In [749]:
df.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016


`The new column is not present here`

### 6 Value count function
Pandas Series.value_counts() function return a Series containing counts of unique values. The resulting object will be in descending order so that the first element is the most frequently-occurring element. Excludes NA values by default.

Syntax: Series.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True)

In [754]:
df['Player Names'].value_counts()

Player Names
Andrea Belotti     5
Lionel Messi       5
Luis Suarez        5
Andrej Kramaric    5
Ciro Immobile      5
                  ..
Francois Kamano    1
Lebo Mothiba       1
Gaetan Laborde     1
Falcao             1
Cody Gakpo         1
Name: count, Length: 444, dtype: int64

### 7 Unique and Nunique function
While analyzing the data, many times the user wants to see the unique values in a particular column, which can be done using Pandas unique() function.

In [762]:
df['Player Names'].unique()

array(['Juanmi Callejon', 'Antoine Griezmann', 'Luis Suarez',
       'Ruben Castro', 'Kevin Gameiro', 'Cristiano Ronaldo',
       'Karim Benzema', 'Neymar ', 'Iago Aspas', 'Sergi Enrich',
       'Aduriz ', 'Sandro Ramlrez', 'Lionel Messi', 'Gerard Moreno',
       'Morata', 'Wissam Ben Yedder', 'Willian Jose', 'Andone ',
       'Cedric Bakambu', 'Isco', 'Mohamed Salah', 'Gregoire Defrel',
       'Ciro Immobile', 'Nikola Kalinic', 'Dries Mertens',
       'Alejandro Gomez', 'Jose CallejOn', 'Iago Falque',
       'Giovanni Simeone', 'Mauro Icardi', 'Diego Falcinelli',
       'Cyril Thereau', 'Edin Dzeko', 'Lorenzo Insigne',
       'Fabio Quagliarella', 'Borriello ', 'Carlos Bacca',
       'Gonzalo Higuain', 'Keita Balde', 'Andrea Belotti', 'Fin Bartels',
       'Lars Stindl', 'Serge Gnabry', 'Wagner ', 'Andrej Kramaric',
       'Florian Niederlechner', 'Robert Lewandowski', 'Emil Forsberg',
       'Timo Werner', 'Nils Petersen', 'Vedad Ibisevic', 'Mario Gomez',
       'Maximilian Philipp',

In [764]:
df['Player Names'].nunique()

444

### 8 dropna() function
Sometimes csv file has null values, which are later displayed as NaN in Data Frame. Pandas dropna() method allows the user to analyze and drop Rows/Columns with Null values in different ways.

Syntax:

DataFrameName.dropna(axis=0,inplace=False)dropna() function

**axis: axis takes int or string value for rows/columns. Input can be 0 or 1 for Integer and ‘index’ or ‘columns’ for String.**

In [769]:
link = 'https://raw.githubusercontent.com/AshishJangra27/Data-Analysis-with-Python-GFG/main/3.%20Data%20Preprocessing%20-%20Removing%20Null%20Value%20Rows/googleplaystore.csv'

df = pd.read_csv(link)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [771]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

`ok so it seems like we have alot of Null Values in column Rating and few null values in some other columnsok so it seems like we have alot of Null Values in column Rating and few null values in some other columns`

In [774]:
df.dropna(inplace = True, axis = 0)
#This will delete all the rows which are containing the null values

In [776]:
df.dropna(inplace = True, axis = 1)
#This will delete all the columns containing null values

### 9 Fillna Function
Pandas Series.fillna() function is used to fill NA/NaN values using the specified method.


**Suppose if we want to fill the null values with something instead of removing them then we can use fillna function
Here we will be filling the numerical columns with its `mean values` and Categorical columns with its `mode`**

In [794]:
link = 'https://raw.githubusercontent.com/AshishJangra27/Data-Analysis-with-Python-GFG/main/3.%20Data%20Preprocessing%20-%20Removing%20Null%20Value%20Rows/googleplaystore.csv'

df = pd.read_csv(link)
print(df['Rating'])
print("*"*10)
print(len(df))

0        4.1
1        3.9
2        4.7
3        4.5
4        4.3
        ... 
10836    4.5
10837    5.0
10838    NaN
10839    4.5
10840    4.5
Name: Rating, Length: 10841, dtype: float64
**********
10841


- **Numerical Values**

In [799]:
mis = round(df['Rating'].mean(),2)

df['Rating'] = df['Rating'].fillna(mis)

print(len(df))

10841


In [805]:
df['Rating'].isnull()

0        False
1        False
2        False
3        False
4        False
         ...  
10836    False
10837    False
10838    False
10839    False
10840    False
Name: Rating, Length: 10841, dtype: bool

- **Categorical values**

If we would have used inplcae=True then it would have permenantly stored those values in our dataframe

In [810]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    1
Genres            0
Last Updated      0
Current Ver       0
Android Ver       3
dtype: int64

In [812]:
df['Current Ver'] = df['Current Ver'].fillna('Varies on Device')

In [814]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    1
Genres            0
Last Updated      0
Current Ver       0
Android Ver       3
dtype: int64

In [816]:
df['Android Ver'] = df['Android Ver'].fillna('Version upgrading')

In [818]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    1
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

### 10 Sample function
Pandas sample() is used to generate a sample random row or column from the function caller data frame.

Syntax:

DataFrame.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None)

In [827]:
df.sample(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
3118,trivago: Hotels & Travel,TRAVEL_AND_LOCAL,4.2,219848,Varies with device,"50,000,000+",Free,0,Everyone,Travel & Local,"August 2, 2018",Varies with device,Varies with device
5214,MiAI (Artificial Intelligence) Assistant,PRODUCTIVITY,3.7,3,7.5M,100+,Free,0,Everyone,Productivity,"March 1, 2018",11.04,4.1 and up
4544,R Programming Solution,FAMILY,3.9,169,7.7M,"10,000+",Free,0,Everyone,Education,"June 20, 2016",1.0.3,4.0 and up
3651,Weather From DMI/YR,WEATHER,4.3,2143,Varies with device,"100,000+",Free,0,Everyone,Weather,"July 31, 2018",Varies with device,Varies with device
5926,Learn alif ba ta,FAMILY,4.8,23,35M,"10,000+",Free,0,Everyone,Educational;Education,"November 28, 2017",1.0.2,2.3 and up


### 11 to_csv() function
Pandas Series.to_csv() function write the given series object to a comma-separated values (csv) file/format.

Syntax: Series.to_csv(*args, **kwargs)

In [839]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)

df.to_csv('Number.csv')

- `We got an extra Unnamed:0 Column if we want to avoid that we need to add an extra parameter mentioning index=False`

In [842]:
df.to_csv('Numbers.csv', index = False)

# D-> Detailed Pandas Profile report
The pandas_profiling library in Python include a method named as ProfileReport() which generate a basic report on the input DataFrame.

The report consist of the following:

DataFrame overview, Each attribute on which DataFrame is defined, Correlations between attributes (Pearson Correlation and Spearman Correlation), and A sample of DataFrame.

In [852]:
import matplotlib
import ydata_profiling as pp

In [854]:
df = pd.read_csv('Football.csv')
df.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016


In [872]:
report = pp.ProfileReport(df)

In [884]:
#pip install ipywidgets

In [886]:
#jupyter nbextension enable --py widgetsnbextension

In [888]:
#pip install -U pandas-profiling

In [893]:
#pip install --upgrade joblib

In [1]:
#pip install visions[type_image_path]==0.7.5

In [3]:
#pip uninstall joblib visions ydata-profiling pandas-profiling



In [4]:
#pip install joblib==1.1.0
