# Pandas

In [1]:
import string

import numpy as np
import pandas as pd

## Creating a DataFrame

There are two ways to select a single column of a DataFrame as a Series:


In [None]:
# 1. Bracket notation with the column name (single brackets) returns a Series.
#    Double brackets, df[['order_status']], would return a one-column DataFrame instead.
df['order_status']

# 2. Attribute access returns the same Series. It only works when the column
#    name is a valid Python identifier that does not clash with an existing
#    DataFrame attribute or method.
df.order_status


### pd.Series.unique() Method

This method is available on pandas **Series**, not DataFrames, and returns an array (a NumPy `ndarray` for most dtypes, not a Python list) of the unique values in the Series, in order of appearance.

In [None]:
# Bracket selection yields a Series; .unique() returns its distinct values in order of appearance.
df['order_status'].unique()

In [None]:
# Same call using attribute access to select the column.
df.order_status.unique()

### pd.to_datetime() Method

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html

Note that in the example below the column is passed as a Series (selected with single brackets, `df["Date"]`), not as a one-column DataFrame (double brackets, `df[["Date"]]`).

If a DataFrame is provided, the method expects minimally the following columns: "year", "month", "day".

In [None]:
# Parse the "Date" column with pd.to_datetime and overwrite the column with the converted values.
df["Date"]= pd.to_datetime(df["Date"])

In [None]:
# Summarise the columns and dtypes of df, e.g. to confirm that "Date" is now a datetime column.
df.info()

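See also `pd.Timedelta` (note the capital "T"), which represents a duration, i.e. the difference between two datetimes.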

### pd.DataFrame.groupby() Method

`groupby()` needs an aggregation (e.g. `count()`, `sum()`, `nunique()`) to apply to the columns that are not being grouped by.

In [None]:
# For each order_id, count the non-missing product_id entries.
df[['order_id','product_id']].groupby('order_id').count()

In [None]:
# For each order_id, sum the product_id values.
# NOTE(review): only meaningful if product_id is numeric — confirm against the data.
df[['order_id','product_id']].groupby('order_id').sum()

In [None]:
# For each order_id, count the distinct seller_id values (missing values are not counted by default).
order_items[['order_id','seller_id']].groupby('order_id').nunique()

### pd.Series.agg() Method

**CAN BE USED IN GROUPBY() ON A DATAFRAME**

Allows you to apply different aggregations to different columns of a grouped DataFrame in a single call.

In [None]:
# The dict maps each column to the aggregation(s) to compute per order_id:
# 'price' gets both sum and mean, 'freight_value' gets mean only.
df[['order_id','price','freight_value']].groupby('order_id').agg({'price': ['sum', 'mean'], 'freight_value': 'mean' })

If you need to join multiple string columns, you can use agg:

In [None]:
# Build one '-'-separated label per row from several columns.
# List every column to combine, in the order it should appear in the label.
# str.join only accepts strings, so cast first (a column such as Year may be numeric).
df['period'] = df[['Year', 'quarter']].astype(str).agg('-'.join, axis=1)

### pd.DataFrame.merge() Method

This can be written in two different ways, allowing for either chaining or nesting of multiple merges.

In [None]:
# Method form: left-join B_df onto A_df on 'order_id', keeping every row of A_df.
# It returns a new DataFrame, so further .merge(...) calls can be chained.
A_df.merge(B_df, how='left', on='order_id')

In [None]:
# Function form of the same left join; several merges are combined by nesting pd.merge(...) calls.
pd.merge(A_df, B_df, how='left', on='order_id')

In [None]:
# query() filters rows with a boolean expression given as a string:
# keep only the rows whose order_status equals 'delivered'.
df.query("order_status == 'delivered'")

### pd.Series.map() Method

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html

Below is how you would use the map() method to change certain values in a series to another, using a dictionary.

In [8]:
# Small demo frame: one text column with a missing value in the third row
df = pd.DataFrame(
    {
        'Stuff': ['cat', 'dog', np.nan, 'rabbit'],
    }
)

# Show the column as a Series
df['Stuff']

0       cat
1       dog
2       NaN
3    rabbit
Name: Stuff, dtype: object

-- You can substitute values using a dictionary. **Note that any value not found in the dictionary is converted to NaN.**

In [10]:
# Look each value up in the mapping; values without a matching key come back as NaN
df['Stuff'].map({'cat': 'kitten', 'dog': 'puppy'})

0    kitten
1     puppy
2       NaN
3       NaN
Name: Stuff, dtype: object

-- It also accepts a function

In [12]:
# Any callable works: here the bound str.format method is called once per value
# (missing values are passed to it too)
df['Stuff'].map('I am a {}'.format)

0       I am a cat
1       I am a dog
2       I am a nan
3    I am a rabbit
Name: Stuff, dtype: object

-- To avoid applying the function to missing values (and keep them as NaN) na_action='ignore' can be used:

In [13]:
# na_action='ignore' skips missing values, leaving them as NaN instead of passing them to the function
df['Stuff'].map('I am a {}'.format, na_action='ignore')

0       I am a cat
1       I am a dog
2              NaN
3    I am a rabbit
Name: Stuff, dtype: object

### How to add another row to a DataFrame

In [None]:
#add row to end of DataFrame
# value1, value2, ... are placeholders: supply one value per column, in column order.
# NOTE: len(df.index) is only a *new* label when the index is the default 0..n-1 range;
# if that label already exists, this assignment overwrites that row instead of adding one.
df.loc[len(df.index)] = [value1, value2, value3, ...]

In [None]:
# Append the rows of df2 to the end of df.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. ignore_index=True renumbers the
# combined rows 0..n-1 instead of keeping the original index labels.
df = pd.concat([df, df2], ignore_index=True)

### pd.Series.apply() Method


In [None]:
def txt_rp(x: str) -> str:
    """Return ``x`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``;
    letters, digits and whitespace are left untouched.

    Args:
        x: The text to clean.

    Returns:
        A new string without punctuation (``x`` itself is not modified).
    """
    # A translation table with only "delete" entries removes all punctuation
    # in a single pass, instead of one str.replace() call per character.
    return x.translate(str.maketrans('', '', string.punctuation))

# Call txt_rp on every value of the 'text' column and store the results in a new 'clean_text' column.
# NOTE(review): assumes df has a 'text' column holding strings — confirm against the data.
df['clean_text'] = df['text'].apply(txt_rp)

## Lambda


**Example 1**: Applying lambda function to single column using Dataframe.assign()

In the below example, the lambda function is applied to the ‘Total_Marks’ column and a new column ‘Percentage’ is formed with the help of it.

In [2]:
import pandas as pd

# Raw records: one [name, total marks] pair per student
values = [
    ['Rohan', 455],
    ['Elvish', 250],
    ['Deepak', 495],
    ['Soni', 400],
    ['Radhika', 350],
    ['Vansh', 450],
]

# Build the frame and derive 'Percentage' in one chain.
# assign() calls the lambda with the frame itself and stores the returned
# Series under the keyword name.
df = (
    pd.DataFrame(values, columns=['Name', 'Total_Marks'])
    .assign(Percentage=lambda frame: frame['Total_Marks'] / 500 * 100)
)

# Display the resulting DataFrame
df

Unnamed: 0,Name,Total_Marks,Percentage
0,Rohan,455,91.0
1,Elvish,250,50.0
2,Deepak,495,99.0
3,Soni,400,80.0
4,Radhika,350,70.0
5,Vansh,450,90.0


**Example 2**: Applying lambda function to multiple columns using Dataframe.assign()

In the below example, lambda function is applied to 3 columns i.e ‘Field_1’, ‘Field_2’, and ‘Field_3’.

In [1]:
# pandas provides the DataFrame used below
import pandas as pd

# Seven rows, each holding three numeric fields
values_list = [
    [15, 2.5, 100],
    [20, 4.5, 50],
    [25, 5.2, 80],
    [45, 5.8, 48],
    [40, 6.3, 70],
    [41, 6.4, 90],
    [51, 2.3, 111],
]

# Build the frame and add 'Product' in one chain: the lambda receives the
# whole frame and multiplies the three field columns element-wise.
df = (
    pd.DataFrame(values_list, columns=['Field_1', 'Field_2', 'Field_3'])
    .assign(Product=lambda frame: frame['Field_1'] * frame['Field_2'] * frame['Field_3'])
)

# Display the resulting DataFrame
df

Unnamed: 0,Field_1,Field_2,Field_3,Product
0,15,2.5,100,3750.0
1,20,4.5,50,4500.0
2,25,5.2,80,10400.0
3,45,5.8,48,12528.0
4,40,6.3,70,17640.0
5,41,6.4,90,23616.0
6,51,2.3,111,13020.3


**Example 3**: Applying lambda function to single row using Dataframe.apply()

In the example below, a lambda function is applied to the row with index label ‘d’, squaring all the values in that row.

In [3]:
import pandas as pd
import numpy as np

# Seven rows, each holding three numeric fields
values_list = [
    [15, 2.5, 100],
    [20, 4.5, 50],
    [25, 5.2, 80],
    [45, 5.8, 48],
    [40, 6.3, 70],
    [41, 6.4, 90],
    [51, 2.3, 111],
]

# Label the rows 'a'..'g' so that a single row can be targeted by name
df = pd.DataFrame(
    values_list,
    columns=['Field_1', 'Field_2', 'Field_3'],
    index=list('abcdefg'),
)

# With axis=1 the lambda receives one row at a time, as a Series whose
# .name is the row label. Only row 'd' is squared; every other row is
# returned unchanged.
df = df.apply(lambda row: row if row.name != 'd' else np.square(row), axis=1)

# Display the resulting DataFrame
df

Unnamed: 0,Field_1,Field_2,Field_3
a,15.0,2.5,100.0
b,20.0,4.5,50.0
c,25.0,5.2,80.0
d,2025.0,33.64,2304.0
e,40.0,6.3,70.0
f,41.0,6.4,90.0
g,51.0,2.3,111.0


**Example 4**: Applying lambda function to multiple rows using Dataframe.apply()

In the example below, a lambda function is applied to the 3 rows with index labels ‘a’, ‘e’, and ‘g’, squaring the values in those rows.

In [4]:
# pandas for the DataFrame, numpy for np.square
import pandas as pd
import numpy as np

# Seven rows, each holding three numeric fields
values_list = [
    [15, 2.5, 100],
    [20, 4.5, 50],
    [25, 5.2, 80],
    [45, 5.8, 48],
    [40, 6.3, 70],
    [41, 6.4, 90],
    [51, 2.3, 111],
]

# Label the rows 'a'..'g' so that rows can be targeted by name
df = pd.DataFrame(
    values_list,
    columns=['Field_1', 'Field_2', 'Field_3'],
    index=list('abcdefg'),
)

# Row-wise apply (axis=1): rows labelled 'a', 'e' or 'g' are squared,
# every other row is returned unchanged.
df = df.apply(
    lambda row: np.square(row) if row.name in ('a', 'e', 'g') else row,
    axis=1,
)

# Display the resulting DataFrame
df

Unnamed: 0,Field_1,Field_2,Field_3
a,225.0,6.25,10000.0
b,20.0,4.5,50.0
c,25.0,5.2,80.0
d,45.0,5.8,48.0
e,1600.0,39.69,4900.0
f,41.0,6.4,90.0
g,2601.0,5.29,12321.0


**Example 5**: Applying the lambda function simultaneously to multiple columns and rows

In this example, a lambda function is applied to two rows and three columns. 

In [5]:
# pandas for the DataFrame, numpy for np.square
import pandas as pd
import numpy as np

# Seven rows, each holding three numeric fields
values_list = [
    [1.5, 2.5, 10.0],
    [2.0, 4.5, 5.0],
    [2.5, 5.2, 8.0],
    [4.5, 5.8, 4.8],
    [4.0, 6.3, 70],
    [4.1, 6.4, 9.0],
    [5.1, 2.3, 11.1],
]

# Label the rows 'a'..'g' so that rows can be targeted by name
df = pd.DataFrame(
    values_list,
    columns=['Field_1', 'Field_2', 'Field_3'],
    index=list('abcdefg'),
)

# Step 1 (rows): square the values of rows 'b' and 'f' only.
df = df.apply(
    lambda row: np.square(row) if row.name in ('b', 'f') else row,
    axis=1,
)

# Step 2 (columns): add 'Product' as the element-wise product of the three
# fields. It is computed after step 1, so rows 'b' and 'f' use squared values.
df = df.assign(
    Product=lambda frame: frame['Field_1'] * frame['Field_2'] * frame['Field_3']
)

# Display the resulting DataFrame
df

Unnamed: 0,Field_1,Field_2,Field_3,Product
a,1.5,2.5,10.0,37.5
b,4.0,20.25,25.0,2025.0
c,2.5,5.2,8.0,104.0
d,4.5,5.8,4.8,125.28
e,4.0,6.3,70.0,1764.0
f,16.81,40.96,81.0,55771.5456
g,5.1,2.3,11.1,130.203


### zip()

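The lists do not have to be the same length: `zip()` stops as soon as the shortest input is exhausted, silently dropping the extra items of any longer input. (On Python 3.10+, `zip(..., strict=True)` raises a `ValueError` if the lengths differ.)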


In [None]:
# zip() returns a lazy iterator of (item1, item2) tuples; nothing is produced until it is consumed.
zip(list1, list2)

In [None]:
# Wrap the iterator in list() to materialise (and display) all the pairs at once.
list(zip(list1, list2))

### iterrows()

In [3]:
import pandas as pd

# Column-oriented input: each key becomes a column of the frame
data = dict(
    firstname=["Sally", "Mary", "John"],
    age=[50, 40, 30],
)

df = pd.DataFrame(data)

# iterrows() yields one (index label, row-as-Series) pair per row
for index, row in df.iterrows():
    # sep="\n" prints the label and the first name on separate lines
    print(index, row["firstname"], sep="\n")

0
Sally
1
Mary
2
John


### pd.Series.apply() Method


First you define a function.

Then you can use the pd.Series.apply() method to apply the function to a column of a DataFrame.

In [None]:
# Define the function

def txt_rp(x: str) -> str:
    """Return ``x`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``;
    letters, digits and whitespace are left untouched.

    Args:
        x: The text to clean.

    Returns:
        A new string without punctuation (``x`` itself is not modified).
    """
    # A translation table with only "delete" entries removes all punctuation
    # in a single pass, instead of one str.replace() call per character.
    return x.translate(str.maketrans('', '', string.punctuation))

# Use the apply method to apply it to a column of a DataFrame:
# txt_rp is called on every value of 'text' and the results are stored in a new 'clean_text' column.
# NOTE(review): assumes df has a 'text' column holding strings — confirm against the data.

df['clean_text'] = df['text'].apply(txt_rp)

### How to add another row to a DataFrame

Below are two ways of adding new rows to a DataFrame

In [None]:
#add row to end of DataFrame
# value1, value2, ... are placeholders: supply one value per column, in column order.
# NOTE: len(df.index) is only a *new* label when the index is the default 0..n-1 range;
# if that label already exists, this assignment overwrites that row instead of adding one.

df.loc[len(df.index)] = [value1, value2, value3, ...]

In [None]:
# Append the rows of df2 to the end of df.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. ignore_index=True renumbers the
# combined rows 0..n-1 instead of keeping the original index labels.

df = pd.concat([df, df2], ignore_index=True)

### How To Delete Rows In A DataFrame