In [282]:
import pandas as pd
import numpy as np

In [283]:
# Build a Series from a plain list; with no index given, the rows are labelled 0..n-1.
arr = pd.Series(data=[1, 2, 3, 4])
print(arr)

0    1
1    2
2    3
3    4
dtype: int64


In [284]:
# Mixing integers with a string means the Series falls back to the generic
# `object` dtype.
arr = pd.Series(data=[1, 2, 3, 4, "abc"])
print(arr)

0      1
1      2
2      3
3      4
4    abc
dtype: object


In [285]:
# dtype reports the element type; index reports the row labels
# (a RangeIndex here, because no explicit index was given).
print(arr.dtype)
print(arr.index)

object
RangeIndex(start=0, stop=5, step=1)


In [286]:
# A Series has no name until one is assigned, so this prints None.
print(arr.name)

None


In [287]:
# Assigning to .name labels the Series; the name then shows up in its printed footer.
arr.name="Numbers"
print(arr)

0      1
1      2
2      3
3      4
4    abc
Name: Numbers, dtype: object


## Indexing

In [288]:
# With the default RangeIndex, arr[0] returns the element labelled 0.
print(arr[0])
# A slice returns a sub-Series; the stop position (3) is excluded.
print("\n",arr[0:3])

1

 0    1
1    2
2    3
Name: Numbers, dtype: object


## iloc -> location based indexing

In [289]:
# iloc selects by integer position, regardless of what the index labels are.
print(arr.iloc[3])
# Plain [] looks the key up as a label; it gives the same element as iloc here
# only because the default RangeIndex labels coincide with the positions.
print(arr[3])

# A list of positions returns the matching elements as a Series.
print("\n",arr.iloc[[1,3]])

4
4

 1    2
3    4
Name: Numbers, dtype: object


In [290]:
# Build a Series whose rows are labelled with fruit names instead of 0..n-1.
index = ["apple", "banana", "grapes", "mango"]
arr = pd.Series([10, 20, 30, 40], index=index, name="calories")
print(arr)

# Plain [] with a string key selects by label.
print(arr["grapes"])
# iloc accepts only integer positions, so passing a label raises an error:

# print(arr.iloc['banana'])  -> raises an error

apple     10
banana    20
grapes    30
mango     40
Name: calories, dtype: int64
30


## loc -> label based indexing
### In label-based slicing, both the start and the stop labels are included in the output

In [291]:
# Passing a list of labels to .loc returns exactly those entries.
print(arr.loc[["banana","mango"]])

banana    20
mango     40
Name: calories, dtype: int64


In [292]:
# Label-based slice through .loc: unlike a positional slice, the stop label
# ("mango") is included in the result.
print(arr.loc["banana":"mango"])

banana    20
grapes    30
mango     40
Name: calories, dtype: int64


In [293]:
# Map each fruit to its protein value. The mapping is deliberately not called
# `dict`: rebinding that name would hide the built-in dict type from every
# cell that runs afterwards.
protein_by_fruit = {
    "apple": 40,
    "mango": 60,
    "pears": 70,
}
# The dict keys become the index labels and the values become the data.
arr2 = pd.Series(protein_by_fruit, name="proteins")
print(arr2)

apple    40
mango    60
pears    70
Name: proteins, dtype: int64


## Conditional Selection

In [294]:
# Comparing a Series with a scalar yields a boolean Series, one True/False per element.
print(arr2>40)

apple    False
mango     True
pears     True
Name: proteins, dtype: bool


In [295]:
# Indexing with a boolean Series keeps only the entries where the mask is True.
print(arr2[arr2>40])

mango    60
pears    70
Name: proteins, dtype: int64


## Logical Operators

In [296]:
# & is the element-wise AND. Each comparison needs its own parentheses
# because & binds more tightly than > and <.
print((arr2>50) & (arr2<65))

apple    False
mango     True
pears    False
Name: proteins, dtype: bool


In [297]:
# | is the element-wise OR: keep entries that are greater than 70 or less than 65.
print(arr2[(arr2>70) | (arr2<65)])

apple    40
mango    60
Name: proteins, dtype: int64


In [298]:
# ~ negates the mask: keep entries that are NOT greater than 60.
print(arr2[~(arr2>60)])

apple    40
mango    60
Name: proteins, dtype: int64


## Modifying the series

In [299]:
# Assigning through an existing label overwrites that element of the Series.
arr2["apple"]=100
print(arr2)

apple    100
mango     60
pears     70
Name: proteins, dtype: int64


In [300]:
# count() returns the number of entries that are not NaN.
ser = pd.Series(["a", np.nan, 1, np.nan, 2])
print(ser.count())

3


## Data Frames

In [301]:
# Employee records; np.nan marks the missing age and salary of the third person.
data = {
    "Name": ["Rohan", "Mohan", "Sohan_Agarwal", "Priya_sehgal"],
    "age": [23, 36, np.nan, 40],
    "Salary": [45000, 60000, np.nan, 78000],
    "Department": ["Frontend", "manager", "Tusty", "HR"],
}
# Each dict key becomes a column; each list supplies that column's values.
df = pd.DataFrame(data=data)
print(df)

            Name   age   Salary Department
0          Rohan  23.0  45000.0   Frontend
1          Mohan  36.0  60000.0    manager
2  Sohan_Agarwal   NaN      NaN      Tusty
3   Priya_sehgal  40.0  78000.0         HR


In [302]:
# head(n) returns the first n rows; the cell displays them as its output.
df.head(2) # show the first two rows

Unnamed: 0,Name,age,Salary,Department
0,Rohan,23.0,45000.0,Frontend
1,Mohan,36.0,60000.0,manager


In [303]:
# tail(n) returns the last n rows; the cell displays them as its output.
df.tail(1) # show the last row

Unnamed: 0,Name,age,Salary,Department
3,Priya_sehgal,40.0,78000.0,HR


### loc and iloc

In [304]:
# Positional row slice: rows at positions 0 and 1 (the stop position 2 is excluded).
df.iloc[0:2]

Unnamed: 0,Name,age,Salary,Department
0,Rohan,23.0,45000.0,Frontend
1,Mohan,36.0,60000.0,manager


In [305]:
# Label-based selection: rows labelled 0 through 2 (the stop label is included)
# and only the two named columns.
df.loc[0:2,["age","Department"]]

Unnamed: 0,age,Department
0,23.0,Frontend
1,36.0,manager
2,,Tusty


In [306]:
# Positional selection on both axes: first two rows, first two columns.
df.iloc[0:2,:2]

Unnamed: 0,Name,age
0,Rohan,23.0
1,Mohan,36.0


In [307]:
# A list of column names selects those columns, in the order listed.
df[["age","Name"]]

Unnamed: 0,age,Name
0,23.0,Rohan
1,36.0,Mohan
2,,Sohan_Agarwal
3,40.0,Priya_sehgal


## To delete row / column

In [308]:
# axis=1 targets columns: the result is a new DataFrame without "age".
# df itself is not modified because the result is not assigned and inplace is not set.
df.drop("age",axis=1)

Unnamed: 0,Name,Salary,Department
0,Rohan,45000.0,Frontend
1,Mohan,60000.0,manager
2,Sohan_Agarwal,,Tusty
3,Priya_sehgal,78000.0,HR


In [309]:
# df still contains the "age" column: the preceding drop returned a new frame.
df

Unnamed: 0,Name,age,Salary,Department
0,Rohan,23.0,45000.0,Frontend
1,Mohan,36.0,60000.0,manager
2,Sohan_Agarwal,,,Tusty
3,Priya_sehgal,40.0,78000.0,HR


* To delete the column permanently (i.e. modify `df` itself instead of getting a new DataFrame back), pass `inplace=True`:

`df.drop("age", axis=1, inplace=True)`

In [310]:
# (number of rows, number of columns)
df.shape

(4, 4)

In [311]:
# Per-column summary: non-null counts and dtypes, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        4 non-null      object 
 1   age         3 non-null      float64
 2   Salary      3 non-null      float64
 3   Department  4 non-null      object 
dtypes: float64(2), object(2)
memory usage: 260.0+ bytes


In [312]:
# Summary statistics for the numeric columns; NaN entries are skipped,
# which is why count is 3 rather than 4.
df.describe()

Unnamed: 0,age,Salary
count,3.0,3.0
mean,33.0,61000.0
std,8.888194,16522.711642
min,23.0,45000.0
25%,29.5,52500.0
50%,36.0,60000.0
75%,38.0,69000.0
max,40.0,78000.0


## Broadcasting


In [313]:
print(df["Salary"])
# The scalar 10000 is broadcast: it is added to every element (NaN stays NaN).
# Note: the column is overwritten, so re-running this cell adds 10000 again.
df["Salary"]=df["Salary"] + 10000
print(df["Salary"])

0    45000.0
1    60000.0
2        NaN
3    78000.0
Name: Salary, dtype: float64
0    55000.0
1    70000.0
2        NaN
3    88000.0
Name: Salary, dtype: float64


## Renaming columns

In [314]:
# Rename the "Department" column to "Dept". inplace=True changes df itself
# rather than returning a renamed copy.
df.rename(columns={"Department":"Dept"},inplace=True)
df

Unnamed: 0,Name,age,Salary,Dept
0,Rohan,23.0,55000.0,Frontend
1,Mohan,36.0,70000.0,manager
2,Sohan_Agarwal,,,Tusty
3,Priya_sehgal,40.0,88000.0,HR


In [315]:
# Distinct values of the column; NaN is reported as a value too.
df["Salary"].unique()

array([55000., 70000.,    nan, 88000.])

In [316]:
# How many times each distinct value occurs in the column.
df["Dept"].value_counts()

Dept
Frontend    1
manager     1
Tusty       1
HR          1
Name: count, dtype: int64

In [317]:
# Arithmetic on a column is element-wise; assigning to a new key adds a column.
df["Promoted Salary"]=df["Salary"]*10
df

Unnamed: 0,Name,age,Salary,Dept,Promoted Salary
0,Rohan,23.0,55000.0,Frontend,550000.0
1,Mohan,36.0,70000.0,manager,700000.0
2,Sohan_Agarwal,,,Tusty,
3,Priya_sehgal,40.0,88000.0,HR,880000.0


## Data cleaning

In [318]:
# isnull() marks the missing cells; summing the booleans counts them per column.
df.isnull().sum()


Name               0
age                1
Salary             1
Dept               0
Promoted Salary    1
dtype: int64

In [319]:
# Returns a new DataFrame without the rows that contain a NaN; df itself is unchanged.
df.dropna()
# df.dropna(inplace=True) would instead make the change in the original DataFrame

Unnamed: 0,Name,age,Salary,Dept,Promoted Salary
0,Rohan,23.0,55000.0,Frontend,550000.0
1,Mohan,36.0,70000.0,manager,700000.0
3,Priya_sehgal,40.0,88000.0,HR,880000.0


In [320]:
# The row with NaN values is still present: the preceding dropna() returned a new frame.
df

Unnamed: 0,Name,age,Salary,Dept,Promoted Salary
0,Rohan,23.0,55000.0,Frontend,550000.0
1,Mohan,36.0,70000.0,manager,700000.0
2,Sohan_Agarwal,,,Tusty,
3,Priya_sehgal,40.0,88000.0,HR,880000.0


## To fill NaN value in DataFrame

In [321]:
# Fill the missing age with the column median. The result is a new Series;
# it is not assigned back, so df keeps its NaN.
df["age"].fillna(df["age"].median())

0    23.0
1    36.0
2    36.0
3    40.0
Name: age, dtype: float64

### Forward fill

In [322]:
# Forward fill: each NaN takes the last valid value that precedes it.
df["age"].ffill()

0    23.0
1    36.0
2    36.0
3    40.0
Name: age, dtype: float64

### Backward fill

In [323]:
# Backward fill: each NaN takes the next valid value that follows it.
df["age"].bfill()

0    23.0
1    36.0
2    40.0
3    40.0
Name: age, dtype: float64

## Replace values in a column

In [324]:
# Show the frame before the change.
print(df)
print("\n\n")

# replace() swaps entries equal to "Rohan" for "Ram"; the result is assigned
# back to the column so the change is stored in df.
df["Name"]=df["Name"].replace("Rohan","Ram")
print(df)

            Name   age   Salary      Dept  Promoted Salary
0          Rohan  23.0  55000.0  Frontend         550000.0
1          Mohan  36.0  70000.0   manager         700000.0
2  Sohan_Agarwal   NaN      NaN     Tusty              NaN
3   Priya_sehgal  40.0  88000.0        HR         880000.0



            Name   age   Salary      Dept  Promoted Salary
0            Ram  23.0  55000.0  Frontend         550000.0
1          Mohan  36.0  70000.0   manager         700000.0
2  Sohan_Agarwal   NaN      NaN     Tusty              NaN
3   Priya_sehgal  40.0  88000.0        HR         880000.0


## Lambda function

In [325]:
# apply() calls the lambda once per element: values above 650000 are divided
# by 1000, everything else is returned unchanged. NaN falls in the "else"
# branch because the comparison NaN > 650000 is False.
df["Promoted Salary"] = df["Promoted Salary"].apply(lambda x: x/1000 if x > 650000 else x) 
df

Unnamed: 0,Name,age,Salary,Dept,Promoted Salary
0,Ram,23.0,55000.0,Frontend,550000.0
1,Mohan,36.0,70000.0,manager,700.0
2,Sohan_Agarwal,,,Tusty,
3,Priya_sehgal,40.0,88000.0,HR,880.0


In [326]:
# Split "Name" on "_" into separate columns (expand=True returns a DataFrame
# instead of lists). Names without an underscore get a missing last name.
# NOTE(review): "firs_Name" looks like a typo for "first_Name"; it is left
# unchanged because the recorded outputs below use this column name.
df[["firs_Name","last_Name"]]=df["Name"].str.split("_", expand=True)
df

Unnamed: 0,Name,age,Salary,Dept,Promoted Salary,firs_Name,last_Name
0,Ram,23.0,55000.0,Frontend,550000.0,Ram,
1,Mohan,36.0,70000.0,manager,700.0,Mohan,
2,Sohan_Agarwal,,,Tusty,,Sohan,Agarwal
3,Priya_sehgal,40.0,88000.0,HR,880.0,Priya,sehgal


In [328]:
# Double every age (NaN stays NaN).
# WARNING: this cell is not idempotent, because it overwrites "age" with twice
# its current value, so each re-run doubles it again. The recorded output
# below (23 -> 92, 36 -> 144, 40 -> 160) is the result of running the cell
# twice; a single run from a fresh kernel gives 46, 72 and 80.
df["age"] = df["age"].apply(lambda x: x*2) 
df

Unnamed: 0,Name,age,Salary,Dept,Promoted Salary,firs_Name,last_Name
0,Ram,92.0,55000.0,Frontend,550000.0,Ram,
1,Mohan,144.0,70000.0,manager,700.0,Mohan,
2,Sohan_Agarwal,,,Tusty,,Sohan,Agarwal
3,Priya_sehgal,160.0,88000.0,HR,880.0,Priya,sehgal


## Joins and Merges

In [330]:
# Rebuild the employee frame from scratch, discarding the changes made to df above.
data = {
    "Name": ["Rohan", "Mohan", "Sohan_Agarwal", "Priya_sehgal"],
    "age": [23, 36, np.nan, 40],
    "Salary": [45000, 60000, np.nan, 78000],
    "Department": ["Frontend", "manager", "Tusty", "HR"],
}
df = pd.DataFrame(data=data)

# A second frame with the same Name/age/Salary columns but "Degree" in place
# of "Department".
data2 = {
    "Name": ["Ram", "Manan", "kirti", "jack"],
    "age": [62, 65, 50, 49],
    "Salary": [4500, 6000, np.nan, 85000],
    "Degree": ["B.tech", "B.Sc", "MBA", "CA"],
}
df2 = pd.DataFrame(data=data2)

# Default axis=0 stacks the rows. A column that exists in only one frame is
# filled with NaN for the other frame's rows, and each frame keeps its own
# row labels, so 0-3 appear twice.
pd.concat([df, df2])

Unnamed: 0,Name,age,Salary,Department,Degree
0,Rohan,23.0,45000.0,Frontend,
1,Mohan,36.0,60000.0,manager,
2,Sohan_Agarwal,,,Tusty,
3,Priya_sehgal,40.0,78000.0,HR,
0,Ram,62.0,4500.0,,B.tech
1,Manan,65.0,6000.0,,B.Sc
2,kirti,50.0,,,MBA
3,jack,49.0,85000.0,,CA


In [331]:
# axis=1 places the frames side by side, aligning rows on their index labels.
# Columns present in both frames are kept twice rather than merged.
pd.concat([df,df2],axis=1)

Unnamed: 0,Name,age,Salary,Department,Name.1,age.1,Salary.1,Degree
0,Rohan,23.0,45000.0,Frontend,Ram,62,4500.0,B.tech
1,Mohan,36.0,60000.0,manager,Manan,65,6000.0,B.Sc
2,Sohan_Agarwal,,,Tusty,kirti,50,,MBA
3,Priya_sehgal,40.0,78000.0,HR,jack,49,85000.0,CA


In [337]:
# merge defaults to an inner join: a row is kept only when BOTH "Salary" and
# "age" match between the frames. No pair of rows here shares both values,
# so the result has columns but no rows. "Name" exists in both frames and is
# not a join key, so it is split into Name_x (from df) and Name_y (from df2).
pd.merge(df,df2, on=["Salary","age"])


Unnamed: 0,Name_x,age,Salary,Department,Name_y,Degree
