In [34]:
import numpy as np
import pandas as pd

In [35]:
# Small demo frame whose columns contain different amounts of missing data:
# A has one NaN, B has three, C has none and D is entirely NaN.
d = dict(
    A=[1, 2, 3, np.nan],
    B=[10] + [np.nan] * 3,
    C=list(range(100, 500, 100)),
    D=[np.nan] * 4,
)

df = pd.DataFrame(data=d)

In [36]:
# Display the frame to see where the missing values are.
df

Unnamed: 0,A,B,C,D
0,1.0,10.0,100,
1,2.0,,200,
2,3.0,,300,
3,,,400,


In [37]:
# Boolean mask of the same shape as df: True wherever a value is missing.
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,True
1,False,True,False,True
2,False,True,False,True
3,True,True,False,True


In [38]:
# Number of missing values per column (each True in the mask counts as 1).
df.isnull().sum()

A    1
B    3
C    0
D    4
dtype: int64

In [39]:
# Default dropna() removes every row that contains at least one NaN.
# Column D is NaN in every row, so the result here is an empty frame.
df.dropna()

Unnamed: 0,A,B,C,D


In [40]:
# axis=1 drops columns instead of rows: any column containing a NaN is
# removed, which leaves only C.
df.dropna(axis=1)

Unnamed: 0,C
0,100
1,200
2,300
3,400


In [41]:
# how='all' drops a column only when every value in it is NaN, so just D goes.
df.dropna(axis=1, how='all')

Unnamed: 0,A,B,C
0,1.0,10.0,100
1,2.0,,200
2,3.0,,300
3,,,400


In [42]:
# thresh=2 keeps columns that have at least 2 non-NaN values:
# A (3) and C (4) stay, B (1) and D (0) are dropped.
df.dropna(axis=1, thresh = 2)

Unnamed: 0,A,C
0,1.0,100
1,2.0,200
2,3.0,300
3,,400


In [43]:
# Replace every NaN with a placeholder string. The result is not assigned,
# so df itself keeps its NaNs.
df.fillna("FILL")

Unnamed: 0,A,B,C,D
0,1.0,10.0,100,FILL
1,2.0,FILL,200,FILL
2,3.0,FILL,300,FILL
3,FILL,FILL,400,FILL


In [44]:
# Fill each column's NaNs with that column's mean. D has no values at all,
# so its mean is NaN and the column stays empty.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,10.0,100,
1,2.0,10.0,200,
2,3.0,10.0,300,
3,2.0,10.0,400,


# Grouping

In [45]:
# Sales records: one row per employee, tagged with the employer's name.
d = dict(
    Company=['FB', 'GOOGLE', 'MICROSOFT', 'GOOGLE', 'FB', 'FB'],
    Employee=list('ABCDEF'),
    Sales=list(range(100, 700, 100)),
)

df = pd.DataFrame(data=d)

In [46]:
# Display the sales frame.
df

Unnamed: 0,Company,Employee,Sales
0,FB,A,100
1,GOOGLE,B,200
2,MICROSOFT,C,300
3,GOOGLE,D,400
4,FB,E,500
5,FB,F,600


In [47]:
# Largest single sales figure across all rows.
df['Sales'].max()

600

In [48]:
# Smallest single sales figure across all rows.
df['Sales'].min()

100

In [49]:
# Average sales figure across all rows.
df['Sales'].mean()

350.0

In [50]:
# Summary statistics for the whole frame; only the numeric column (Sales)
# is included by default.
df.describe()

Unnamed: 0,Sales
count,6.0
mean,350.0
std,187.082869
min,100.0
25%,225.0
50%,350.0
75%,475.0
max,600.0


In [51]:
# Group the rows by the Company column so per-company aggregates can be computed.
grouped_df = df.groupby('Company')

In [52]:
# Average of the numeric columns per company. numeric_only=True skips the
# string column 'Employee'; without it pandas >= 2.0 tries to average that
# column too and raises TypeError instead of silently dropping it.
grouped_df.mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,400.0
GOOGLE,300.0
MICROSOFT,300.0


In [53]:
# Per-company maximum of every column. Each column is reduced independently,
# and for the string column Employee "max" is the alphabetically last value.
grouped_df.max()

Unnamed: 0_level_0,Employee,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,F,600
GOOGLE,D,400
MICROSOFT,C,300


In [54]:
# Summary statistics of Sales for each company. std is NaN for MICROSOFT
# because that group has only one row.
grouped_df.describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,3.0,400.0,264.575131,100.0,300.0,500.0,550.0,600.0
GOOGLE,2.0,300.0,141.421356,200.0,250.0,300.0,350.0,400.0
MICROSOFT,1.0,300.0,,300.0,300.0,300.0,300.0,300.0


In [55]:
# Distinct company names, in order of first appearance.
df['Company'].unique()

array(['FB', 'GOOGLE', 'MICROSOFT'], dtype=object)

In [56]:
# Number of distinct company names.
df['Company'].nunique()

3

# Custom Functions

In [57]:
# Show the frame again as the starting point for applying custom functions.
df

Unnamed: 0,Company,Employee,Sales
0,FB,A,100
1,GOOGLE,B,200
2,MICROSOFT,C,300
3,GOOGLE,D,400
4,FB,E,500
5,FB,F,600


In [58]:
def give_bonus(sales, bonus=100):
    """Return ``sales`` increased by a flat ``bonus``.

    Parameters
    ----------
    sales : number
        The original sales figure.
    bonus : number, default 100
        Amount added to ``sales``. The default keeps one-argument calls
        (for example ``Series.apply(give_bonus)``) returning ``sales + 100``.

    Returns
    -------
    number
        ``sales + bonus``.
    """
    return sales + bonus

In [59]:
# Call give_bonus once per value in the Sales column; returns a new Series
# and leaves df unchanged.
df['Sales'].apply(give_bonus)

0    200
1    300
2    400
3    500
4    600
5    700
Name: Sales, dtype: int64

In [60]:
# Add 100 to every Sales value with an inline lambda and store the result
# in a new column.
df['New Sales'] = df['Sales'].apply(lambda amount: amount + 100)

In [61]:
# Confirm that the 'New Sales' column was added.
df

Unnamed: 0,Company,Employee,Sales,New Sales
0,FB,A,100,200
1,GOOGLE,B,200,300
2,MICROSOFT,C,300,400
3,GOOGLE,D,400,500
4,FB,E,500,600
5,FB,F,600,700


In [63]:
# Load the highest-grossing-movies dataset, replacing the sales frame in df.
# The path is relative, so it resolves against the notebook's working directory.
# NOTE(review): the dollar-amount columns display as "$184,031,112"-style text,
# so they appear to be read as strings rather than numbers - confirm before
# doing arithmetic on them.
df = pd.read_csv("data analytics/HighestGrossers.csv")

In [64]:
# First five rows of the dataset.
df.head()

Unnamed: 0,YEAR,MOVIE,GENRE,MPAA RATING,DISTRIBUTOR,TOTAL FOR YEAR,TOTAL IN 2019 DOLLARS,TICKETS SOLD
0,1995,Batman Forever,Drama,PG-13,Warner Bros.,"$184,031,112","$387,522,978",42306002
1,1996,Independence Day,Adventure,PG-13,20th Century Fox,"$306,169,255","$634,504,608",69269062
2,1997,Men in Black,Adventure,PG-13,Sony Pictures,"$250,650,052","$500,207,943",54607854
3,1998,Titanic,Adventure,PG-13,Paramount Pictures,"$443,319,081","$865,842,808",94524324
4,1999,Star Wars Ep. I: The Phantom Menace,Adventure,PG,20th Century Fox,"$430,443,350","$776,153,749",84732942


In [65]:
# Last five rows of the dataset; GENRE is missing for the 2019-2021 entries.
df.tail()

Unnamed: 0,YEAR,MOVIE,GENRE,MPAA RATING,DISTRIBUTOR,TOTAL FOR YEAR,TOTAL IN 2019 DOLLARS,TICKETS SOLD
22,2017,Star Wars Ep. VIII: The Last Jedi,Action,PG-13,Walt Disney,"$517,218,368","$528,173,936",57660910
23,2018,Black Panther,Action,PG-13,Walt Disney,"$700,059,566","$703,901,821",76845177
24,2019,Avengers: Endgame,,PG-13,Walt Disney,"$858,373,000","$858,373,002",93708843
25,2020,Bad Boys For Life,,R,Sony Pictures,"$204,417,855","$204,417,848",22316359
26,2021,Shang-Chi and the Legend of the Ten Rings,,PG-13,Walt Disney,"$224,226,704","$224,226,704",24478897


In [66]:
# Number of movies per genre, most frequent first. value_counts() skips NaN
# by default, so rows with a missing GENRE are not included in these counts.
df['GENRE'].value_counts()

Adventure    14
Action        9
Drama         1
Name: GENRE, dtype: int64