In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Known locations of movie.csv. The first one that exists on this machine is
# used, so nothing has to be commented in or out when switching computers.
# If none of them exists the first entry is kept, and pd.read_csv raises the
# same "file not found" error it did before.
movie_csv_candidates = [
    Path(r"C:\Users\Kyle\OneDrive\Desktop\python\pandas_data\movie.csv"),  # PC
    Path('/Users/kylejohnson/Desktop/Python Files/python/Portfolio_Files/pd_cookbook_data/movie.csv'),  # MAC
]
movie_csv = next(
    (path for path in movie_csv_candidates if path.exists()),
    movie_csv_candidates[0],
)

# Keep the raw frame untouched and work on a copy, so later cells can reset
# `movies` from `movies_raw` without re-reading the file.
movies_raw = pd.read_csv(movie_csv)
movies = movies_raw.copy()
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [2]:
# Pull the three components of the DataFrame apart in one step:
# the column labels, the row labels, and the values as a NumPy array.
columns, index, data = movies.columns, movies.index, movies.to_numpy()

In [3]:
# Understanding Data Types
#     - float           : the numpy float type, which supports missing values
#     - int             : the numpy integer type, which does NOT support missing values
#     - 'Int64'         : pandas nullable integer type
#     - object          : the numpy type for storing strings and mixed types
#     - 'category'      : pandas categorical type, which does support missing values
#     - bool            : the numpy boolean type, which does NOT support missing values
#                             - None becomes False, np.nan becomes True
#     - 'boolean'       : pandas nullable Boolean type
#     - datetime64[ns]  : the numpy date type, which does support missing values (NaT)

In [4]:
# Print the dtype of every column, then the number of columns per dtype.
# One print call with a newline separator emits the same text as two calls.
print(movies.dtypes, movies.dtypes.value_counts(), sep="\n")
# Look into the info method
movies.info()

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
m

In [5]:
# Selecting a single column

# The next four expressions all return the same Series.
# Pass the column name as a string to the index operator
movies["director_name"]
# Use attribute access on the column name
movies.director_name
# Use .loc with a full row slice and the column label
movies.loc[:,"director_name"]
# Use .iloc with a full row slice and the column's integer position
movies.iloc[:,1]

# Viewing the index, dtype, length and name of the Series
# index
movies["director_name"].index
# dtype
movies["director_name"].dtype
# size
movies["director_name"].size
# name
movies["director_name"].name

# Verify that the selection is a Series
type(movies["director_name"])
# The unique Python types of the values held in the Series
# (this last expression is the one shown as the cell's output)
movies["director_name"].apply(type).unique()

array([<class 'str'>, <class 'float'>], dtype=object)

In [6]:
# Calling Series methods

# Create two different Series: one of strings (object dtype), one numeric
director = movies["director_name"]
fb_likes = movies["actor_1_facebook_likes"]

# Check the dtypes
director.dtype
fb_likes.dtype

# Look at the .head and a .sample of the Series
director.head(5)
fb_likes.sample(5, random_state = 42)  # fixed seed so the sample is repeatable

# value_counts (frequency of each value) is most useful for object columns
director.value_counts()
# It also works on numerics and can provide some insight; in this dataset the
# larger like counts appear to be rounded to the nearest 1,000
fb_likes.value_counts()

# The number of elements is available from .size, .shape, or the built-in len
director.size
director.shape
len(director)
# .unique is a method and has to be called to get the distinct values;
# a bare `director.unique` only evaluates to the bound method object
director.unique()

# .count does NOT return the number of elements but the number of non-missing values
director.count()
fb_likes.count()

# Basic summary stats: .min, .max, .mean, .median, .std
fb_likes.min() # Minimum
fb_likes.max() # Maximum
fb_likes.mean() # Mean
fb_likes.median() # Median
fb_likes.std() # Standard Deviation

# The describe method works on both numeric and object Series
fb_likes.describe()
director.describe()

# Quantiles
fb_likes.quantile(0.2) # pass a scalar, get a scalar back
fb_likes.quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]) # pass a list, get a Series back

# Use .isna to discover which values are missing
director.isna()
director.isnull() # the same as .isna
director.notna() # True for every non-missing value

# Use .fillna to replace missing values
fb_likes_filled = fb_likes.fillna(0)
fb_likes_filled.count() # nothing is missing any more, so every element is counted

# Use .dropna to remove the missing values
fb_likes_dropped = fb_likes.dropna()
fb_likes_dropped.size

# value_counts with normalize=True returns relative frequencies instead of counts
director.value_counts(normalize = True)

# Check whether a Series has any missing values with .hasnans
director.hasnans




True

In [7]:
# Series Operations

# Select the imdb_score column
imdb_score = movies["imdb_score"]

# Use the plus operator to add one to each Series element
imdb_score + 1

# Other basic arithmetic operators, each applied element by element
imdb_score * 2.5 # Multiplication
imdb_score - 1 # Subtraction
imdb_score / 2.5 # Division
imdb_score ** 2 # Exponentiation

# Use // for floor division, which rounds the quotient down to the next lowest integer
imdb_score // 7

# The six comparison operators; each returns a boolean Series
imdb_score > 7 # Greater Than
imdb_score < 7 # Less Than
imdb_score >= 7 # Greater Than or Equal
imdb_score <= 7 # Less Than or Equal
director == "James Cameron" # Equal To (relies on `director` created in an earlier cell)
imdb_score != 7 # Not Equal To



#

0       True
1       True
2       True
3       True
4       True
        ... 
4911    True
4912    True
4913    True
4914    True
4915    True
Name: imdb_score, Length: 4916, dtype: bool

In [8]:
# Chaining Series methods

# Select the two Series used in the chains below
fb_likes = movies["actor_1_facebook_likes"]
director = movies["director_name"]

# The most common methods to append to a chain are .head(), .sample(), .tail()
director.value_counts().head(3)

# A common way to count the number of missing values:
# .isna() gives booleans and .sum() counts the True entries
fb_likes.isna().sum()

# Fill missing values with 0 and then convert to int
# (the fill comes first because the int type cannot hold missing values)
fb_likes.fillna(0).astype(int).head()

# Debugging a chain with .pipe: a pass-through function that can be dropped
# between two steps of a chain to look at the Series at that point.
def debug_ser(ser):
    """Print ``ser`` between "BEFORE" and "AFTER" marker lines and return it unchanged."""
    print("BEFORE", ser, "AFTER", sep="\n")
    return ser

# debug_ser sits between the fill and the cast, so it prints the Series as it
# looks after .fillna(0) and before .astype(int).
(
    fb_likes
    .fillna(0)
    .pipe(debug_ser)
    .astype(int)
    .head()
)

# Capturing an intermediate result of a chain with .pipe
intermediate = None


def get_intermediate(ser):
    """Store ``ser`` in the global name ``intermediate`` and return that same object."""
    global intermediate
    intermediate = ser
    return intermediate

# get_intermediate records the Series as it is after .fillna(0); the chain
# then carries on to the int cast and .head().
res = (
    fb_likes
    .fillna(0)
    .pipe(get_intermediate)
    .astype(int)
    .head()
)
# Show the captured (filled, not yet cast) Series
intermediate

BEFORE
0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64
AFTER


0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64

In [9]:
# Renaming Columns

# DataFrame.rename accepts dictionaries that map old labels to new labels
col_map = {"director_name":"director",
           "num_critic_for_reviews":"critic_reviews"}
# Pass the dictionary to the rename method; the result is only displayed,
# movies itself is not reassigned here
movies.rename(columns = col_map).head()

# Renaming row labels as well: movie_title is made the index first so that
# the titles in idx_mapping have labels to match
idx_mapping = {"Avatar":"Ratava",
               "Spectre":"Ertceps",
               "Pirates of the Caribbean: At World's End":"POC"}
col_map = {"aspect_ratio":"aspect",
           "movie_facebook_likes":"fblikes"}
movies.set_index("movie_title").rename(index=idx_mapping, columns = col_map).head(3)

# Renaming by list assignment, using the .to_list method of the index objects
movies = movies.copy()
ids = movies.index.to_list()
columns = movies.columns.to_list()
# Edit the row and column labels in the plain Python lists ...
ids[0] = "Ratava"
ids[1] = "POC"
ids[2] = "Ertceps"
columns[1] = "director"
columns[-2] = "aspect"
columns[-1] = "fblikes"
# ... then assign the lists back. Unlike .rename above, this relabels
# movies itself, so the statements that follow see the new labels.
movies.index = ids
movies.columns = columns
movies.head(3)

# A function that normalises a label: trim it, lower-case it, and turn spaces into _
def to_clean(val):
    """Return ``val`` stripped of surrounding whitespace, lower-cased, with every space replaced by ``_``."""
    trimmed = val.strip().lower()
    return "_".join(trimmed.split(" "))
# Pass the function itself to rename so it is used to produce the new column labels
movies.rename(columns = to_clean).head(3)

# Assuming spaces and uppercase letters in the column names, another way to
# clean them: build the cleaned labels with a list comprehension and assign
# the list back to movies.columns
cols = [
    col.strip().lower().replace(" ","_")
    for col in movies.columns
]
movies.columns = cols
movies.head(3)



Unnamed: 0,color,director,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
POC,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


In [None]:

# Creating and deleting columns

movies = movies_raw.copy()

# Create a column using index assignment with a scalar value
movies['has_seen'] = 0

# Start again from the raw data so the .assign example below adds the column itself
movies = movies_raw.copy()

# Using the .assign method in a chain
idx_map = {"Avatar":"Ratava",
               "Spectre":"Ertceps",
               "Pirates of the Caribbean: At World's End":"POC"}

col_map = {
    "aspect_ratio" : "aspect",
    "movie_facebook_likes":"fblikes"
}

# movies was read without an index column, so its row labels are integers and
# the titles in idx_map would match nothing; make movie_title the index first
# so the index rename actually takes effect.
(
    movies
    .set_index("movie_title")
    .rename(index = idx_map, columns = col_map)
    .assign(has_seen=0)
)


# Total likes across the three actors and the director

# Element-wise addition of the four like columns, written as a method chain
total = (
    movies["actor_1_facebook_likes"]
    .add(movies["actor_2_facebook_likes"])
    .add(movies["actor_3_facebook_likes"])
    .add(movies["director_facebook_likes"])
)

cols = [
    "actor_1_facebook_likes",
    "actor_2_facebook_likes",
    "actor_3_facebook_likes",
    "director_facebook_likes",
]

# Row-wise sum over the same four columns using DataFrame.sum
sum_col = movies[cols].sum(axis = 1)
sum_col.head(5)

# Attach the row-wise sum as a new column of a new frame
movies.assign(total_likes=sum_col).head()

# Pass a function as the value in the assign method
def sum_likes(df):
    """Return the row-wise sum of the columns whose names contain "like" and either "actor" or "director"."""
    like_cols = [
        name
        for name in df.columns
        if "like" in name and ("actor" in name or "director" in name)
    ]
    selected = df[like_cols]
    return selected.sum(axis = 1)

# Because sum_likes is callable, assign calls it with the frame to build the column
movies.assign(total_likes = sum_likes) 

# Check whether there are missing values in the new column

movies.assign(total_likes = sum_col)['total_likes'].isna().sum() # total built with .sum (sum_col)
movies.assign(total_likes = total)['total_likes'].isna().sum() # total built by adding the columns (total)

# Fill the missing values of the added-up total with 0, then count what is still missing
movies.assign(total_likes = total.fillna(0))['total_likes'].isna().sum()

# Data validation: cast_total_facebook_likes should be >= total_likes
def cast_like_gt_actor(df):
    """Return a boolean Series that is True where cast_total_facebook_likes >= total_likes."""
    cast_likes = df["cast_total_facebook_likes"]
    return cast_likes.ge(df["total_likes"])

# Add total_likes and the validation flag in one assign call;
# cast_like_gt_actor reads the total_likes column added just before it
df2 = movies.assign(
    total_likes = total, 
    is_cast_likes_more = cast_like_gt_actor,
)

# Check whether every value of is_cast_likes_more is True
df2["is_cast_likes_more"].all()

# Backtrack and delete total_likes using the .drop method
df2 = df2.drop(columns = "total_likes")

# Recreate the total using only the actor like columns
# (column names containing both "actor_" and "_likes")
actor_sum = movies[
    [
        c
        for c in movies.columns
        if "actor_" in c and "_likes" in c
    ]
].sum(axis = "columns")

actor_sum.head(5)

# Element-wise check that cast_total_facebook_likes >= actor_sum
movies["cast_total_facebook_likes"] >= actor_sum
# The same comparison with the .ge method, reduced to a single answer with .all()
movies["cast_total_facebook_likes"].ge(actor_sum).all()

# Calculate the percentage of cast_total_facebook_likes that comes from actor_sum
pct_like = actor_sum.div(movies["cast_total_facebook_likes"]).mul(100)

# Validate the min and max of the Series: the ratio was multiplied by 100,
# so the values are percentages rather than fractions between 0 and 1
pct_like.describe()

# Create a Series that uses the movie title as its index

pd.Series(pct_like.to_numpy(), index = movies["movie_title"]).head()



movie_title
Avatar                                         57.736864
Pirates of the Caribbean: At World's End       95.139607
Spectre                                        98.752137
The Dark Knight Rises                          68.378310
Star Wars: Episode VII - The Force Awakens    100.000000
dtype: float64