In [198]:
import pandas as pd
import numpy as np

In [199]:
# A Series built from a plain list gets the default 0..n-1 integer index.
arr = pd.Series(data=[1, 2, 3, 4])
print(arr)

0    1
1    2
2    3
3    4
dtype: int64


In [200]:
# Mixing integers with a string makes pandas store every element as a generic
# Python object, which is why the dtype is reported as `object`.
arr = pd.Series(data=[1, 2, 3, 4, "abc"])
print(arr)

0      1
1      2
2      3
3      4
4    abc
dtype: object


In [201]:
# dtype: the element type shared by the whole Series.
print(arr.dtype)
# index: the row labels; no labels were supplied, so this is the default RangeIndex.
print(arr.index)

object
RangeIndex(start=0, stop=5, step=1)


In [202]:
# A Series has no name until one is assigned, so this prints None.
print(arr.name)

None


In [203]:
# Assigning to .name labels the Series; the name then appears in its printed footer.
arr.name="Numbers"
print(arr)

0      1
1      2
2      3
3      4
4    abc
Name: Numbers, dtype: object


## Indexing

In [204]:
# With the default integer index, arr[0] looks up the label 0, which here is
# also the first position.
print(arr[0])
# Slice inside []: the stop value 3 is excluded, so three elements come back.
print("\n",arr[0:3])

1

 0    1
1    2
2    3
Name: Numbers, dtype: object


## iloc -> location based indexing

In [205]:
# iloc selects by integer position (0-based).
print(arr.iloc[3])
# arr[3] returns the same value here only because the index labels are 0..4
# and therefore coincide with the positions.
print(arr[3])

# A list of positions returns the matching sub-Series.
print("\n",arr.iloc[[1,3]])

4
4

 1    2
3    4
Name: Numbers, dtype: object


In [206]:
# Build a Series of calories keyed by fruit name instead of the default
# 0..n-1 integer index.
index = ["apple", "banana", "grapes", "mango"]
arr = pd.Series([10, 20, 30, 40], index=index, name="calories")
print(arr)

# Plain [] with a string looks the value up by its index label.
print(arr["grapes"])

# iloc accepts integer positions only, so passing a label raises an error:
# print(arr.iloc['banana'])

apple     10
banana    20
grapes    30
mango     40
Name: calories, dtype: int64
30


## loc -> label based indexing
### In label-based slicing, both the start and the stop labels are included in the output

In [207]:
# Passing a list of labels to loc selects exactly those entries, in the order given.
print(arr.loc[["banana","mango"]])

banana    20
mango     40
Name: calories, dtype: int64


In [208]:
# Label-based slice through the explicit .loc accessor: unlike a positional
# slice, both endpoints ("banana" and "mango") are included in the result.
print(arr.loc["banana":"mango"])

banana    20
grapes    30
mango     40
Name: calories, dtype: int64


In [209]:
# Build a Series straight from a mapping: the keys become the index labels and
# the values become the data. The mapping is deliberately not called `dict`,
# because that name would shadow the built-in dict type in every later cell.
protein_by_fruit = {
    "apple": 40,
    "mango": 60,
    "pears": 70,
}
arr2 = pd.Series(protein_by_fruit, name="proteins")
print(arr2)

apple    40
mango    60
pears    70
Name: proteins, dtype: int64


## Conditional Selection

In [210]:
# Comparing a Series with a scalar is element-wise and yields a boolean Series (a mask).
print(arr2>40)

apple    False
mango     True
pears     True
Name: proteins, dtype: bool


In [211]:
# Indexing with the boolean mask keeps only the entries where the mask is True.
print(arr2[arr2>40])

mango    60
pears    70
Name: proteins, dtype: int64


## Logical Operators

In [212]:
# & is the element-wise AND. Each comparison needs its own parentheses because
# & binds more tightly than > and <.
print((arr2>50) & (arr2<65))

apple    False
mango     True
pears    False
Name: proteins, dtype: bool


In [213]:
# | is the element-wise OR: keep entries above 70 or below 65.
# pears (70) is not strictly greater than 70, so it is filtered out.
print(arr2[(arr2>70) | (arr2<65)])

apple    40
mango    60
Name: proteins, dtype: int64


In [214]:
# ~ inverts the boolean mask: keep the entries that are NOT greater than 60.
print(arr2[~(arr2>60)])

apple    40
mango    60
Name: proteins, dtype: int64


## Modifying the series

In [215]:
# Overwrite a single entry in place, addressing it by its index label.
arr2.loc["apple"] = 100
print(arr2)

apple    100
mango     60
pears     70
Name: proteins, dtype: int64


In [216]:
# Mixed Series with two missing entries. notna() flags the values that are
# present, and summing that boolean mask counts them.
ser = pd.Series(["a", np.nan, 1, np.nan, 2])
print(ser.notna().sum())

3


## Data Frames

In [217]:
# Column-oriented input: every key is a column name and every list holds that
# column's values. np.nan marks the missing age and salary for "Sohan".
data = {
    "Name": ["Rohan", "Mohan", "Sohan", "Priya"],
    "age": [23, 36, np.nan, 40],
    "Salary": [45000, 60000, np.nan, 78000],
    "Department": ["Frontend", "manager", "Tusty", "HR"],
}
df = pd.DataFrame(data=data)
print(df)

    Name   age   Salary Department
0  Rohan  23.0  45000.0   Frontend
1  Mohan  36.0  60000.0    manager
2  Sohan   NaN      NaN      Tusty
3  Priya  40.0  78000.0         HR


In [218]:
df.head(2) # first two rows (head() defaults to 5 rows when no count is given)

Unnamed: 0,Name,age,Salary,Department
0,Rohan,23.0,45000.0,Frontend
1,Mohan,36.0,60000.0,manager


In [219]:
df.tail(1) # last row only (tail() counts from the end of the frame)

Unnamed: 0,Name,age,Salary,Department
3,Priya,40.0,78000.0,HR


### loc and iloc

In [220]:
# Positional row slice: rows at positions 0 and 1; the stop position 2 is excluded.
df.iloc[0:2]

Unnamed: 0,Name,age,Salary,Department
0,Rohan,23.0,45000.0,Frontend
1,Mohan,36.0,60000.0,manager


In [221]:
# Label-based selection: rows labelled 0 through 2 (stop label included),
# restricted to the two named columns.
df.loc[0:2,["age","Department"]]

Unnamed: 0,age,Department
0,23.0,Frontend
1,36.0,manager
2,,Tusty


In [222]:
# Positional on both axes: the first two rows and the first two columns.
df.iloc[0:2,:2]

Unnamed: 0,Name,age
0,Rohan,23.0
1,Mohan,36.0


In [223]:
# A list of column names selects those columns, in the order listed.
df[["age","Name"]]

Unnamed: 0,age,Name
0,23.0,Rohan
1,36.0,Mohan
2,,Sohan
3,40.0,Priya


## To delete row / column

In [224]:
# axis=1 makes drop() look for "age" among the column labels rather than the
# row labels. The result is a new DataFrame; df itself keeps the column.
df.drop("age",axis=1)

Unnamed: 0,Name,Salary,Department
0,Rohan,45000.0,Frontend
1,Mohan,60000.0,manager
2,Sohan,,Tusty
3,Priya,78000.0,HR


In [225]:
# df still contains "age": drop() without inplace=True returned a copy.
df

Unnamed: 0,Name,age,Salary,Department
0,Rohan,23.0,45000.0,Frontend
1,Mohan,36.0,60000.0,manager
2,Sohan,,,Tusty
3,Priya,40.0,78000.0,HR


* To delete the column permanently (i.e. modify the DataFrame itself), pass `inplace=True`:

df.drop("age",axis=1, inplace =True)

In [226]:
# (number of rows, number of columns)
df.shape

(4, 4)

In [227]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        4 non-null      object 
 1   age         3 non-null      float64
 2   Salary      3 non-null      float64
 3   Department  4 non-null      object 
dtypes: float64(2), object(2)
memory usage: 260.0+ bytes


In [228]:
# Summary statistics for the numeric columns (age, Salary); missing values are
# skipped, which is why count is 3 rather than 4.
df.describe()

Unnamed: 0,age,Salary
count,3.0,3.0
mean,33.0,61000.0
std,8.888194,16522.711642
min,23.0,45000.0
25%,29.5,52500.0
50%,36.0,60000.0
75%,38.0,69000.0
max,40.0,78000.0


## Broadcasting


In [229]:
# Show the column before the change.
print(df["Salary"])
# The scalar 10000 is broadcast to every row; NaN + 10000 stays NaN.
# NOTE: this overwrites the column, so re-running the cell adds another 10000.
df["Salary"]=df["Salary"] + 10000
# Show the column after the change.
print(df["Salary"])

0    45000.0
1    60000.0
2        NaN
3    78000.0
Name: Salary, dtype: float64
0    55000.0
1    70000.0
2        NaN
3    88000.0
Name: Salary, dtype: float64


## Renaming columns

In [230]:
# Rename the column by rebinding df to the renamed copy instead of mutating it
# with inplace=True; the displayed result is the same, and the statement reads
# as an explicit "new frame from old frame" step.
df = df.rename(columns={"Department": "Dept"})
df

Unnamed: 0,Name,age,Salary,Dept
0,Rohan,23.0,55000.0,Frontend
1,Mohan,36.0,70000.0,manager
2,Sohan,,,Tusty
3,Priya,40.0,88000.0,HR


In [231]:
# Distinct salary values as a NumPy array; NaN is reported as a value too.
df["Salary"].unique()

array([55000., 70000.,    nan, 88000.])

In [232]:
# How many rows fall into each distinct department.
df["Dept"].value_counts()

Dept
Frontend    1
manager     1
Tusty       1
HR          1
Name: count, dtype: int64

In [233]:
# Derive a new column from an existing one: the scalar 10 is applied to every
# row, and the missing salary stays NaN in the derived column.
df["Promoted Salary"] = df["Salary"].mul(10)
df

Unnamed: 0,Name,age,Salary,Dept,Promoted Salary
0,Rohan,23.0,55000.0,Frontend,550000.0
1,Mohan,36.0,70000.0,manager,700000.0
2,Sohan,,,Tusty,
3,Priya,40.0,88000.0,HR,880000.0


## Data cleaning

In [234]:
# Flag every missing cell, then sum each column's flags to get the number of
# NaNs per column.
df.isna().sum()


Name               0
age                1
Salary             1
Dept               0
Promoted Salary    1
dtype: int64

In [None]:
# dropna() returns a new frame without the rows that contain any NaN; df itself
# is left untouched.
df.dropna()
# df.dropna(inplace=True) would instead modify the original DataFrame.

Unnamed: 0,Name,age,Salary,Dept,Promoted Salary
0,Rohan,23.0,55000.0,Frontend,550000.0
1,Mohan,36.0,70000.0,manager,700000.0
3,Priya,40.0,88000.0,HR,880000.0


In [None]:
# The row with NaNs is still present: dropna() above did not modify df.
df

Unnamed: 0,Name,age,Salary,Dept,Promoted Salary
0,Rohan,23.0,55000.0,Frontend,550000.0
1,Mohan,36.0,70000.0,manager,700000.0
2,Sohan,,,Tusty,
3,Priya,40.0,88000.0,HR,880000.0


## To fill NaN value in DataFrame

In [246]:
# Fill the missing age with the column median (NaN is ignored when the median
# is computed). fillna() returns a new Series; df itself is not modified.
median_age = df["age"].median()
df["age"].fillna(median_age)

0    23.0
1    36.0
2    36.0
3    40.0
Name: age, dtype: float64

### Forward fill

In [251]:
# Forward fill: each NaN takes the last valid value above it (36.0 here).
df["age"].ffill()

0    23.0
1    36.0
2    36.0
3    40.0
Name: age, dtype: float64

### Backward fill

In [252]:
# Backward fill: each NaN takes the next valid value below it (40.0 here).
df["age"].bfill()

0    23.0
1    36.0
2    40.0
3    40.0
Name: age, dtype: float64