In [1]:
import pandas as pd

import numpy as np

In [2]:
# Read data from the url

url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv'
tips = pd.read_csv(url)

In [6]:
tips[10:].head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
10,10.27,1.71,Male,No,Sun,Dinner,2
11,35.26,5.0,Female,No,Sun,Dinner,4
12,15.42,1.57,Male,No,Sun,Dinner,2
13,18.43,3.0,Male,No,Sun,Dinner,4
14,14.83,3.02,Female,No,Sun,Dinner,2
15,21.58,3.92,Male,No,Sun,Dinner,2
16,10.33,1.67,Female,No,Sun,Dinner,3
17,16.29,3.71,Male,No,Sun,Dinner,3
18,16.97,3.5,Female,No,Sun,Dinner,3
19,20.65,3.35,Male,No,Sat,Dinner,3


**SQL Vs Python** 

_The below cells will display the logic to write the python equivalent to perform the SQL operations_



**Select Condition**  

In [4]:
#SQL
#SELECT * FROM tips
#SELECT total_bill, tip, smoker, time FROM tips


In [4]:
#Python
tips # Displays all the columns & records
tips[['total_bill', 'tip', 'smoker', 'time']].head(5) #Filter specific columns


Unnamed: 0,total_bill,tip,smoker,time
0,16.99,1.01,No,Dinner
1,10.34,1.66,No,Dinner
2,21.01,3.5,No,Dinner
3,23.68,3.31,No,Dinner
4,24.59,3.61,No,Dinner


**Where Condition**  

In [15]:
#SQL
#SELECT * FROM tips WHERE time = 'Dinner'


In [7]:
#Python
tips[tips['time'] == 'Dinner'].head(5) #Filtering records during Dinner time 

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


**using | (OR) and & (AND)**  

In [None]:
#SQL
#SELECT * FROM tips WHERE time = 'Dinner' AND tip > 5.00;

In [19]:
#Python
# tips of more than $5.00 at Dinner meals
tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
23,39.42,7.58,Male,No,Sat,Dinner,4
44,30.4,5.6,Male,No,Sun,Dinner,4
47,32.4,6.0,Male,No,Sun,Dinner,4
52,34.81,5.2,Female,No,Sun,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4
116,29.93,5.07,Male,No,Sun,Dinner,4
155,29.85,5.14,Female,No,Sun,Dinner,5
170,50.81,10.0,Male,Yes,Sat,Dinner,3
172,7.25,5.15,Male,Yes,Sun,Dinner,2
181,23.33,5.65,Male,Yes,Sun,Dinner,2


In [20]:
#SQL
#tips by parties of at least 5 diners OR bill total was more than $45
#SELECT * FROM tips WHERE size >= 5 OR total_bill > 45;

In [15]:
#Python
# tips by parties of at least 5 diners OR bill total was more than $45
tips[(tips['size'] >= 5) & (tips['total_bill'] > 45)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
156,48.17,5.0,Male,No,Sun,Dinner,6


**NULL checking**  

In [28]:
#SQL
#SELECT * FROM tips WHERE tip IS NULL;

#Python
tips[tips['tip'].isnull()]   # Displays records where tip value is NULL
tips[tips['tip'].notnull()]  # Displays records where tip value is NOT NULL


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.00,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


**GROUP BY Condition**  

In [42]:
#SQL
#SELECT sex, count(*) FROM tips GROUP BY sex

In [37]:
#Python
tips.groupby('sex').size() # Displays count of records in Male and Female   
tips.groupby('sex').mean() # Displays mean of all the integer Columns in Male and Female     

Unnamed: 0_level_0,total_bill,tip,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,18.056897,2.833448,2.45977
Male,20.744076,3.089618,2.630573


_Notice that in the pandas code we used size() and not count(). This is because count() applies the function to each column, returning the number of not null records within each_

In [74]:
tips.groupby('sex').count()

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,87,87,87,87,87,87
Male,157,157,157,157,157,157


_Multiple functions can also be applied at once. For instance, say we’d like to see how tip amount differs by day of the week - agg() allows you to pass a dictionary to your grouped DataFrame, indicating which functions to apply to specific columns_

In [85]:
#SQL - SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day
tips.groupby('day').agg({'tip': np.mean, 'day': np.size})



Unnamed: 0_level_0,tip,day
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Fri,2.734737,19
Sat,2.993103,87
Sun,3.255132,76
Thur,2.771452,62


_Grouping by more than one column is done by passing a list of columns to the groupby() method_

In [86]:
tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip
Unnamed: 0_level_1,Unnamed: 1_level_1,size,mean
smoker,day,Unnamed: 2_level_2,Unnamed: 3_level_2
No,Fri,4.0,2.8125
No,Sat,45.0,3.102889
No,Sun,57.0,3.167895
No,Thur,45.0,2.673778
Yes,Fri,15.0,2.714
Yes,Sat,42.0,2.875476
Yes,Sun,19.0,3.516842
Yes,Thur,17.0,3.03


**JOIN Condition**

_JOINs can be performed with join() or merge(). By default, join() will join the DataFrames on their indices. Each method has parameters allowing you to specify the type of join to perform (LEFT, RIGHT, INNER, FULL) or the columns to join on (column names or indices)_

In [59]:
import pandas as pd

In [60]:
df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'],
   'value_1': np.random.randint(1,50,4)})

df2 = pd.DataFrame({'key': ['D', 'E', 'F'],
   'value_2': np.random.randint(51,99,3)})

In [61]:
df1

Unnamed: 0,key,value_1
0,A,25
1,B,46
2,C,30
3,D,26


In [62]:
df2

Unnamed: 0,key,value_2
0,D,77
1,E,85
2,F,73


In [41]:
#SQL
#SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key;

#Python
pd.merge(df1, df2, on='key') # Inner join df1 & df2 on Key Column

Unnamed: 0,key,value_1,value_2
0,D,44,80


In [42]:
pd.merge(df1, df2, how='right', on='key' )


Unnamed: 0,key,value_1,value_2
0,D,44.0,80
1,E,,58
2,F,,68


_merge() also offers parameters for cases when you’d like to join one DataFrame’s column with another DataFrame’s index_


**DIFFERENT JOINS**

_Left Join_                                                                                                                     
_Right Join_                                                                                                                     
_Full Join_                                                                                                                     

_By Changing the how parameter of a merge function, different types of joins can be performed_ 

In [43]:
#show all records from df1
#SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key

pd.merge(df1, df2, how='left',on='key')
pd.merge(df1, df2 ,how='right', on='key')
missing_values=pd.merge(df1, df2 ,how='outer', on='key')



In [69]:
missing_values=pd.merge(df1, df2 ,how='outer', on='key')


In [70]:
missing_values

Unnamed: 0,key,value_1,value_2
0,A,25.0,
1,B,46.0,
2,C,30.0,
3,D,26.0,77.0
4,E,,85.0
5,F,,73.0


In [75]:
missing_values.dropna(subset=['value_1'])

Unnamed: 0,key,value_1,value_2
0,A,25.0,
1,B,46.0,
2,C,30.0,
3,D,26.0,77.0


In [67]:
missing_values.dropna(inplace=True)

## Handling missing values

In [44]:
missing_values

Unnamed: 0,key,value_1,value_2
0,A,1.0,
1,B,24.0,
2,C,41.0,
3,D,44.0,80.0
4,E,,58.0
5,F,,68.0


In [46]:
import numpy as np

In [37]:
missing_values.isnull()

Unnamed: 0,key,value_1,value_2
0,False,False,True
1,False,False,True
2,False,False,True
3,False,False,False
4,False,True,False
5,False,True,False


In [48]:
missing_values.fillna(10)

Unnamed: 0,key,value_1,value_2
0,A,1.0,10.0
1,B,24.0,10.0
2,C,41.0,10.0
3,D,44.0,80.0
4,E,10.0,58.0
5,F,10.0,68.0


In [45]:
missing_values.dropna()

Unnamed: 0,key,value_1,value_2
3,D,44.0,80.0


**UNION ALL**



In [49]:
df1 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'],
'rank': range(1, 4)})
 

df2 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'],
'rank': [1, 4, 5]})

In [52]:
df1

Unnamed: 0,city,rank
0,Chicago,1
1,San Francisco,2
2,New York City,3


In [53]:
df2

Unnamed: 0,city,rank
0,Chicago,1
1,Boston,4
2,Los Angeles,5


In [54]:
#SQL
#SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2;

#Python
pd.concat([df1, df2])

Unnamed: 0,city,rank
0,Chicago,1
1,San Francisco,2
2,New York City,3
0,Chicago,1
1,Boston,4
2,Los Angeles,5


**UNION**



In [136]:
#SQL
#SELECT city, rank FROM df1 UNION SELECT city, rank FROM df2;

#Python
pd.concat([df1, df2]).drop_duplicates()

Unnamed: 0,city,rank
0,Chicago,1
1,San Francisco,2
2,New York City,3
1,Boston,4
2,Los Angeles,5


**UPDATE**




In [167]:
#SQL
#UPDATE tips SET tip = tip*2 WHERE tip < 2;

In [77]:
#Python
#tips.loc
#[tips['tip'] < 2, 'tip']

##[tips['tip'] < 2]

#*= 2

In [80]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [168]:
#Python
tips.loc[tips['tip'] < 2, 'tip'] *= 2

In [39]:
#Python
tips.loc[tips['tip'] < 2] 
#=tip* 2

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
8,15.04,1.96,Male,No,Sun,Dinner,2
10,10.27,1.71,Male,No,Sun,Dinner,2
12,15.42,1.57,Male,No,Sun,Dinner,2
16,10.33,1.67,Female,No,Sun,Dinner,3
30,9.55,1.45,Male,No,Sat,Dinner,2
43,9.68,1.32,Male,No,Sun,Dinner,2
53,9.94,1.56,Male,No,Sun,Dinner,2
57,26.41,1.5,Female,No,Sat,Dinner,2


**DELETE**

_In pandas we select the rows that should remain, instead of deleting them_



In [166]:
#SQL
#DELETE FROM tips WHERE tip > 9;

In [170]:
#Python
#tips = tips.loc[tips['tip'] <= 9]
tips=tips[tips['tip'] <= 9]


In [6]:
#Difference between
tips.groupby('smoker')['total_bill'].mean()
#stips.groupby('smoker')['total_bill'].mean

smoker
No     19.188278
Yes    20.756344
Name: total_bill, dtype: float64