In [1]:
#pip install pandas

In [2]:
import pandas as pd
import numpy as np

# Series

In [3]:
myList = [1, 2, 3, 4]
ser1 = pd.Series(myList)
ser1

0    1
1    2
2    3
3    4
dtype: int64

In [4]:
ser1[1]

2

In [5]:
myIndex = ['a', 'b', 'c', 'd']
ser2 = pd.Series(data=myList, index=myIndex)
ser2

a    1
b    2
c    3
d    4
dtype: int64

In [6]:
ser2['a']

1

In [7]:
ser2[0]

1

# DataFrame

In [8]:
df = pd.DataFrame(data=np.random.randint(1, 100, (4, 5)), index=['A', 'B', 'C', 'D'], 
             columns=['V', 'W', 'X', 'Y', 'Z'])

df

Unnamed: 0,V,W,X,Y,Z
A,23,71,68,77,93
B,79,50,62,92,94
C,80,31,35,67,34
D,94,57,90,35,88


In [9]:
df['W'] #To grab a column

A    71
B    50
C    31
D    57
Name: W, dtype: int64

In [10]:
df['W']['C']

31

In [11]:
df[['W', 'Z']]

Unnamed: 0,W,Z
A,71,93
B,50,94
C,31,34
D,57,88


In [12]:
df.loc['A'] #To grab a row

V    23
W    71
X    68
Y    77
Z    93
Name: A, dtype: int64

In [13]:
df['New Column'] = [10, 20, 30, 40]
df

Unnamed: 0,V,W,X,Y,Z,New Column
A,23,71,68,77,93,10
B,79,50,62,92,94,20
C,80,31,35,67,34,30
D,94,57,90,35,88,40


In [14]:
df.shape

(4, 6)

In [15]:
df.drop('New Column', axis=1)

Unnamed: 0,V,W,X,Y,Z
A,23,71,68,77,93
B,79,50,62,92,94
C,80,31,35,67,34
D,94,57,90,35,88


In [16]:
df

Unnamed: 0,V,W,X,Y,Z,New Column
A,23,71,68,77,93,10
B,79,50,62,92,94,20
C,80,31,35,67,34,30
D,94,57,90,35,88,40


In [17]:
df.drop('New Column', axis=1, inplace=True)

In [18]:
df

Unnamed: 0,V,W,X,Y,Z
A,23,71,68,77,93
B,79,50,62,92,94
C,80,31,35,67,34
D,94,57,90,35,88


In [19]:
df.drop('D')

Unnamed: 0,V,W,X,Y,Z
A,23,71,68,77,93
B,79,50,62,92,94
C,80,31,35,67,34


In [20]:
df['W'] % 2 == 0

A    False
B     True
C    False
D    False
Name: W, dtype: bool

In [21]:
df[df['W'] % 2 == 0]

Unnamed: 0,V,W,X,Y,Z
B,79,50,62,92,94


# Reading a File

In [22]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [23]:
df['BMI']

0      33.6
1      26.6
2      23.3
3      28.1
4      43.1
       ... 
763    32.9
764    36.8
765    26.2
766    30.1
767    30.4
Name: BMI, Length: 768, dtype: float64

# Missing Values

In [24]:
data = {'A':[1, 2, np.nan], 'B':[5, np.nan, np.nan], 'C':[1, 2, 3]}
df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [25]:
df.isnull()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False


In [26]:
df.isnull().sum()

A    1
B    2
C    0
dtype: int64

In [27]:
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [28]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [29]:
df.fillna(value="FILL")

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,FILL,2
2,FILL,FILL,3


In [30]:
df.mean()

A    1.5
B    5.0
C    2.0
dtype: float64

In [31]:
df.fillna(value=df.mean())

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,5.0,2
2,1.5,5.0,3


# GroupBy

In [33]:
data = {'Company':['GOOGLE', 'FACEBOOK', 'GOOGLE', 'MICROSOFT', 'FACEBOOK', 'MICROSOFT'],
        'Employee':['Sam', 'Jack', 'Amy', 'Rachel', 'Eric', 'Wanda'],
        'Sales':[100, 200, 450, 400, 500, 300]}

df = pd.DataFrame(data)
df

Unnamed: 0,Company,Employee,Sales
0,GOOGLE,Sam,100
1,FACEBOOK,Jack,200
2,GOOGLE,Amy,450
3,MICROSOFT,Rachel,400
4,FACEBOOK,Eric,500
5,MICROSOFT,Wanda,300


In [34]:
df.groupby('Company')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc711c0c2e0>

In [35]:
df.groupby('Company').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FACEBOOK,2.0,350.0,212.132034,200.0,275.0,350.0,425.0,500.0
GOOGLE,2.0,275.0,247.487373,100.0,187.5,275.0,362.5,450.0
MICROSOFT,2.0,350.0,70.710678,300.0,325.0,350.0,375.0,400.0


In [36]:
df.describe()

Unnamed: 0,Sales
count,6.0
mean,325.0
std,154.11035
min,100.0
25%,225.0
50%,350.0
75%,437.5
max,500.0


In [37]:
df

Unnamed: 0,Company,Employee,Sales
0,GOOGLE,Sam,100
1,FACEBOOK,Jack,200
2,GOOGLE,Amy,450
3,MICROSOFT,Rachel,400
4,FACEBOOK,Eric,500
5,MICROSOFT,Wanda,300


In [38]:
len(df['Employee'])

6

In [39]:
df['Employee'].apply(len)

0    3
1    4
2    3
3    6
4    4
5    5
Name: Employee, dtype: int64

In [41]:
def times10(x):
    return x * 10

In [42]:
df['Sales'].apply(times10)

0    1000
1    2000
2    4500
3    4000
4    5000
5    3000
Name: Sales, dtype: int64

In [43]:
lambda x : x * 10

<function __main__.<lambda>(x)>

In [44]:
df['Sales'].apply(lambda x : x * 10)

0    1000
1    2000
2    4500
3    4000
4    5000
5    3000
Name: Sales, dtype: int64

# Miscellanous Functions

In [45]:
df

Unnamed: 0,Company,Employee,Sales
0,GOOGLE,Sam,100
1,FACEBOOK,Jack,200
2,GOOGLE,Amy,450
3,MICROSOFT,Rachel,400
4,FACEBOOK,Eric,500
5,MICROSOFT,Wanda,300


In [46]:
df['Company'].unique()

array(['GOOGLE', 'FACEBOOK', 'MICROSOFT'], dtype=object)

In [47]:
df['Company'].nunique()

3

In [48]:
df['Company'].value_counts()

GOOGLE       2
MICROSOFT    2
FACEBOOK     2
Name: Company, dtype: int64

In [50]:
df.sort_values('Sales')

Unnamed: 0,Company,Employee,Sales
0,GOOGLE,Sam,100
1,FACEBOOK,Jack,200
5,MICROSOFT,Wanda,300
3,MICROSOFT,Rachel,400
2,GOOGLE,Amy,450
4,FACEBOOK,Eric,500
